root/mm/memory.c


DEFINITIONS

This source file includes the following definitions:
  1. oom
  2. free_one_pte
  3. free_one_table
  4. clear_page_tables
  5. free_page_tables
  6. clone_page_tables
  7. copy_page_tables
  8. forget_pte
  9. unmap_pte_range
  10. unmap_pmd_range
  11. unmap_page_range
  12. zeromap_pte_range
  13. zeromap_pmd_range
  14. zeromap_page_range
  15. remap_page_range
  16. put_page
  17. put_dirty_page
  18. do_wp_page
  19. verify_area
  20. get_empty_page
  21. try_to_share
  22. share_page
  23. get_empty_pgtable
  24. do_swap_page
  25. do_no_page

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
   13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/config.h>
  37 #include <linux/signal.h>
  38 #include <linux/sched.h>
  39 #include <linux/head.h>
  40 #include <linux/kernel.h>
  41 #include <linux/errno.h>
  42 #include <linux/string.h>
  43 #include <linux/types.h>
  44 #include <linux/ptrace.h>
  45 #include <linux/mman.h>
  46 #include <linux/mm.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 
  52 unsigned long high_memory = 0;
  53 
  54 /*
  55  * The free_area_list arrays point to the queue heads of the free areas
  56  * of different sizes
  57  */
  58 int nr_swap_pages = 0;
  59 int nr_free_pages = 0;
  60 struct mem_list free_area_list[NR_MEM_LISTS];
  61 unsigned char * free_area_map[NR_MEM_LISTS];
  62 
  63 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  64 
  65 mem_map_t * mem_map = NULL;
  66 
  67 #define CODE_SPACE(addr,p) ((addr) < (p)->end_code)
  68 
  69 /*
  70  * oom() prints a message (so that the user knows why the process died),
  71  * and gives the process an untrappable SIGKILL.
  72  */
  73 void oom(struct task_struct * task)
  74 {
  75         printk("\nOut of memory for %s.\n", current->comm);
  76         task->sigaction[SIGKILL-1].sa_handler = NULL;
  77         task->blocked &= ~(1<<(SIGKILL-1));
  78         send_sig(SIGKILL,task,1);
  79 }
  80 
  81 static inline void free_one_pte(pte_t * page_table)
  82 {
  83         pte_t page = *page_table;
  84 
  85         if (pte_none(page))
  86                 return;
  87         pte_clear(page_table);
  88         if (!pte_present(page)) {
  89                 swap_free(pte_val(page));
  90                 return;
  91         }
  92         free_page(pte_page(page));
  93         return;
  94 }
  95 
  96 static void free_one_table(pgd_t * page_dir)
  97 {
  98         int j;
  99         pgd_t pg_table = *page_dir;
 100         pte_t * page_table;
 101         unsigned long page;
 102 
 103         if (pgd_none(pg_table))
 104                 return;
 105         pgd_clear(page_dir);
 106         if (pgd_bad(pg_table)) {
 107                 printk("Bad page table: [%p]=%08lx\n",page_dir,pgd_val(pg_table));
 108                 return;
 109         }
 110         page = pgd_page(pg_table);
 111         if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
 112                 return;
 113         page_table = (pte_t *) page;
 114         for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++)
 115                 free_one_pte(page_table);
 116         free_page(page);
 117 }
 118 
 119 /*
 120  * This function clears all user-level page tables of a process - this
 121  * is needed by execve(), so that old pages aren't in the way. Note that
 122  * unlike 'free_page_tables()', this function still leaves a valid
 123  * page-table-tree in memory: it just removes the user pages. The two
 124  * functions are similar, but there is a fundamental difference.
 125  */
 126 void clear_page_tables(struct task_struct * tsk)
 127 {
 128         int i;
 129         pgd_t * page_dir;
 130 
 131         if (!tsk)
 132                 return;
 133         if (tsk == task[0])
 134                 panic("task[0] (swapper) doesn't support exec()\n");
 135         page_dir = PAGE_DIR_OFFSET(tsk, 0);
 136         if (!page_dir || page_dir == swapper_pg_dir) {
 137                 printk("Trying to clear kernel page-directory: not good\n");
 138                 return;
 139         }
 140         if (mem_map[MAP_NR((unsigned long) page_dir)] > 1) {
 141                 pgd_t * new_pg;
 142 
 143                 if (!(new_pg = (pgd_t *) get_free_page(GFP_KERNEL))) {
 144                         oom(tsk);
 145                         return;
 146                 }
 147                 for (i = 768 ; i < 1024 ; i++)
 148                         new_pg[i] = page_dir[i];
 149                 free_page((unsigned long) page_dir);
 150                 SET_PAGE_DIR(tsk, new_pg);
 151                 return;
 152         }
 153         for (i = 0 ; i < 768 ; i++,page_dir++)
 154                 free_one_table(page_dir);
 155         invalidate();
 156         return;
 157 }
 158 
 159 /*
 160  * This function frees up all page tables of a process when it exits.
 161  */
 162 void free_page_tables(struct task_struct * tsk)
 163 {
 164         int i;
 165         pgd_t * page_dir;
 166 
 167         if (!tsk)
 168                 return;
 169         if (tsk == task[0]) {
 170                 printk("task[0] (swapper) killed: unable to recover\n");
 171                 panic("Trying to free up swapper memory space");
 172         }
 173         page_dir = PAGE_DIR_OFFSET(tsk, 0);
 174         if (!page_dir || page_dir == swapper_pg_dir) {
 175                 printk("Trying to free kernel page-directory: not good\n");
 176                 return;
 177         }
 178         SET_PAGE_DIR(tsk, swapper_pg_dir);
 179         if (mem_map[MAP_NR((unsigned long) page_dir)] > 1) {
 180                 free_page((unsigned long) page_dir);
 181                 return;
 182         }
 183         for (i = 0 ; i < PTRS_PER_PAGE ; i++)
 184                 free_one_table(page_dir + i);
 185         free_page((unsigned long) page_dir);
 186         invalidate();
 187 }
 188 
 189 /*
 190  * clone_page_tables() clones the page table for a process - both
 191  * processes will have the exact same pages in memory. There are
 192  * probably races in the memory management with cloning, but we'll
 193  * see..
 194  */
 195 int clone_page_tables(struct task_struct * tsk)
 196 {
 197         pgd_t * pg_dir;
 198 
 199         pg_dir = PAGE_DIR_OFFSET(current, 0);
 200         mem_map[MAP_NR((unsigned long) pg_dir)]++;
 201         SET_PAGE_DIR(tsk, pg_dir);
 202         return 0;
 203 }
 204 
 205 /*
 206  * copy_page_tables() just copies the whole process memory range:
 207  * note the special handling of RESERVED (ie kernel) pages, which
 208  * means that they are always shared by all processes.
 209  */
 210 int copy_page_tables(struct task_struct * tsk)
 211 {
 212         int i;
 213         pgd_t *old_page_dir;
 214         pgd_t *new_page_dir;
 215 
 216         new_page_dir = (pgd_t *) get_free_page(GFP_KERNEL);
 217         if (!new_page_dir)
 218                 return -ENOMEM;
 219         old_page_dir = PAGE_DIR_OFFSET(current, 0);
 220         SET_PAGE_DIR(tsk, new_page_dir);
 221         for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) {
 222                 int j;
 223                 pgd_t old_pg_table;
 224                 pte_t *old_page_table, *new_page_table;
 225 
 226                 old_pg_table = *old_page_dir;
 227                 if (pgd_none(old_pg_table))
 228                         continue;
 229                 if (pgd_bad(old_pg_table)) {
 230                         printk("copy_page_tables: bad page table: "
 231                                 "probable memory corruption\n");
 232                         pgd_clear(old_page_dir);
 233                         continue;
 234                 }
 235                 if (mem_map[MAP_NR(pgd_page(old_pg_table))] & MAP_PAGE_RESERVED) {
 236                         *new_page_dir = old_pg_table;
 237                         continue;
 238                 }
 239                 if (!(new_page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 240                         free_page_tables(tsk);
 241                         return -ENOMEM;
 242                 }
 243                 old_page_table = (pte_t *) pgd_page(old_pg_table);
 244                 pgd_set(new_page_dir, new_page_table);
 245                 for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) {
 246                         pte_t pte = *old_page_table;
 247                         if (pte_none(pte))
 248                                 continue;
 249                         if (!pte_present(pte)) {
 250                                 swap_duplicate(pte_val(pte));
 251                                 *new_page_table = pte;
 252                                 continue;
 253                         }
 254                         if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
 255                                 *new_page_table = pte;
 256                                 continue;
 257                         }
 258                         if (pte_cow(pte))
 259                                 pte = pte_wrprotect(pte);
 260                         if (delete_from_swap_cache(pte_page(pte)))
 261                                 pte = pte_mkdirty(pte);
 262                         *new_page_table = pte;
 263                         *old_page_table = pte;
 264                         mem_map[MAP_NR(pte_page(pte))]++;
 265                 }
 266         }
 267         invalidate();
 268         return 0;
 269 }
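
/*
 * Example (not part of this file): a minimal sketch of how a fork-time
 * caller might choose between the two routines above.  clone_page_tables()
 * lets parent and child share one page directory outright, while
 * copy_page_tables() builds a new tree and write-protects the private
 * pages so that the first write takes the copy-on-write path in
 * do_wp_page().  The COPYVM flag and the copy_mm() shape are assumptions
 * for illustration, not the exact fork.c code.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct * p)
{
        if (!(clone_flags & COPYVM))
                return clone_page_tables(p);    /* share the whole VM */
        return copy_page_tables(p);             /* private, COW-protected copy */
}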
 270 
 271 static inline void forget_pte(pte_t page)
 272 {
 273         if (pte_none(page))
 274                 return;
 275         if (pte_present(page)) {
 276                 free_page(pte_page(page));
 277                 if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED)
 278                         return;
 279                 if (current->mm->rss <= 0)
 280                         return;
 281                 current->mm->rss--;
 282                 return;
 283         }
 284         swap_free(pte_val(page));
 285 }
 286 
 287 static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 288 {
 289         pte_t * pte;
 290         unsigned long end;
 291 
 292         if (pmd_none(*pmd))
 293                 return;
 294         if (pmd_bad(*pmd)) {
 295                 printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 296                 pmd_clear(pmd);
 297                 return;
 298         }
 299         pte = pte_offset(pmd, address);
 300         address &= ~PMD_MASK;
 301         end = address + size;
 302         if (end >= PMD_SIZE)
 303                 end = PMD_SIZE;
 304         do {
 305                 pte_t page = *pte;
 306                 pte_clear(pte);
 307                 forget_pte(page);
 308                 address += PAGE_SIZE;
 309                 pte++;
 310         } while (address < end);
 311 }
 312 
 313 static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 314 {
 315         pmd_t * pmd;
 316         unsigned long end;
 317 
 318         if (pgd_none(*dir))
 319                 return;
 320         if (pgd_bad(*dir)) {
 321                 printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 322                 pgd_clear(dir);
 323                 return;
 324         }
 325         pmd = pmd_offset(dir, address);
 326         address &= ~PGDIR_MASK;
 327         end = address + size;
 328         if (end > PGDIR_SIZE)
 329                 end = PGDIR_SIZE;
 330         do {
 331                 unmap_pte_range(pmd, address, end - address);
 332                 address = (address + PMD_SIZE) & PMD_MASK; 
 333                 pmd++;
 334         } while (address < end);
 335 }
 336 
 337 /*
 338  * a more complete version of free_page_tables which performs with page
 339  * granularity.
 340  */
 341 int unmap_page_range(unsigned long address, unsigned long size)
 342 {
 343         pgd_t * dir;
 344         unsigned long end = address + size;
 345 
 346         dir = pgd_offset(current, address);
 347         while (address < end) {
 348                 unmap_pmd_range(dir, address, end - address);
 349                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 350                 dir++;
 351         }
 352         invalidate();
 353         return 0;
 354 }
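
/*
 * Example (not part of this file): a standalone, user-space sketch of the
 * address split that the pgd/pmd/pte range walkers above rely on.  The
 * constants assume the classic i386 layout (4 KB pages, 1024-entry tables,
 * pmd folded into the pgd) and are for illustration only.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT   12
#define EX_PGDIR_SHIFT  22
#define EX_PTRS_PER_PTE 1024

int main(void)
{
        unsigned long address = 0x08049f10;     /* arbitrary user address */
        unsigned long dir = address >> EX_PGDIR_SHIFT;                          /* page-directory slot */
        unsigned long pte = (address >> EX_PAGE_SHIFT) & (EX_PTRS_PER_PTE - 1); /* page-table slot */
        unsigned long off = address & ((1UL << EX_PAGE_SHIFT) - 1);             /* offset within the page */

        printf("pgd slot %lu, pte slot %lu, offset 0x%lx\n", dir, pte, off);
        return 0;
}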
 355 
 356 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 357 {
 358         unsigned long end;
 359 
 360         address &= ~PMD_MASK;
 361         end = address + size;
 362         if (end > PMD_SIZE)
 363                 end = PMD_SIZE;
 364         do {
 365                 pte_t oldpage = *pte;
 366                 *pte = zero_pte;
 367                 forget_pte(oldpage);
 368                 address += PAGE_SIZE;
 369                 pte++;
 370         } while (address < end);
 371 }
 372 
 373 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 374 {
 375         unsigned long end;
 376 
 377         address &= ~PGDIR_MASK;
 378         end = address + size;
 379         if (end > PGDIR_SIZE)
 380                 end = PGDIR_SIZE;
 381         do {
 382                 pte_t * pte = pte_alloc(pmd, address);
 383                 if (!pte)
 384                         return -ENOMEM;
 385                 zeromap_pte_range(pte, address, end - address, zero_pte);
 386                 address = (address + PMD_SIZE) & PMD_MASK;
 387                 pmd++;
 388         } while (address < end);
 389         return 0;
 390 }
 391 
 392 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 393 {
 394         int error = 0;
 395         pgd_t * dir;
 396         unsigned long end = address + size;
 397         pte_t zero_pte;
 398 
 399         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 400         dir = pgd_offset(current, address);
 401         while (address < end) {
 402                 pmd_t *pmd = pmd_alloc(dir, address);
 403                 error = -ENOMEM;
 404                 if (!pmd)
 405                         break;
 406                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 407                 if (error)
 408                         break;
 409                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 410                 dir++;
 411         }
 412         invalidate();
 413         return error;
 414 }
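
/*
 * Example (not part of this file): a sketch of how a /dev/zero-style mmap
 * handler of this era could use zeromap_page_range().  Every pte is made
 * to point at the single shared ZERO_PAGE, write-protected, so the first
 * write to any page faults into do_wp_page() and gets a private copy.
 * The handler name and the exact checks are assumptions for illustration.
 */
static int mmap_zero(struct inode * inode, struct file * file, struct vm_area_struct * vma)
{
        if (vma->vm_flags & VM_SHARED)          /* shared writable zero mapping not supported */
                return -EINVAL;
        if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}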
 415 
 416 /*
 417  * maps a range of physical memory into the requested pages. the old
  418  * mappings are removed. any references to nonexistent pages result
 419  * in null mappings (currently treated as "copy-on-access")
 420  */
 421 int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot)
 422 {
 423         pgd_t * dir;
 424         pte_t * page_table;
 425         unsigned long poff, pcnt;
 426 
 427         if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) {
 428                 printk("remap_page_range: from = %08lx, to=%08lx\n",from,to);
 429                 return -EINVAL;
 430         }
 431         dir = PAGE_DIR_OFFSET(current,from);
 432         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 433         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 434         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 435                 pcnt = size;
 436 
 437         while (size > 0) {
 438                 if (!pgd_present(*dir)) {
 439                         if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 440                                 invalidate();
 441                                 return -1;
 442                         }
 443                         if (pgd_present(*dir)) {
 444                                 free_page((unsigned long) page_table);
 445                                 page_table = (pte_t *) pgd_page(*dir);
 446                         } else
 447                                 pgd_set(dir, page_table);
 448                 } else
 449                         page_table = (pte_t *) pgd_page(*dir);
 450                 dir++;
 451                 page_table += poff;
 452                 poff = 0;
 453 
 454                 for (size -= pcnt; pcnt-- ;) {
 455                         pte_t page = *page_table;
 456                         if (!pte_none(page)) {
 457                                 pte_clear(page_table);
 458                                 if (pte_present(page)) {
 459                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 460                                                 if (current->mm->rss > 0)
 461                                                         --current->mm->rss;
 462                                         free_page(pte_page(page));
 463                                 } else
 464                                         swap_free(pte_val(page));
 465                         }
 466                         if (to >= high_memory)
 467                                 *page_table = mk_pte(to, prot);
 468                         else if (mem_map[MAP_NR(to)]) {
 469                                 *page_table = mk_pte(to, prot);
 470                                 if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) {
 471                                         ++current->mm->rss;
 472                                         mem_map[MAP_NR(to)]++;
 473                                 }
 474                         }
 475                         page_table++;
 476                         to += PAGE_SIZE;
 477                 }
 478                 pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
 479         }
 480         invalidate();
 481         return 0;
 482 }
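
/*
 * Example (not part of this file): a sketch of the typical caller of
 * remap_page_range() in this era - a character-device mmap handler that
 * maps physical memory straight into the requested user address range.
 * The handler name and checks are assumptions for illustration.
 */
static int mmap_phys_mem(struct inode * inode, struct file * file, struct vm_area_struct * vma)
{
        if (vma->vm_offset & ~PAGE_MASK)        /* must start on a page boundary */
                return -ENXIO;
        if (remap_page_range(vma->vm_start, vma->vm_offset,
                             vma->vm_end - vma->vm_start, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}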
 483 
 484 /*
 485  * sanity-check function..
 486  */
 487 static void put_page(pte_t * page_table, pte_t pte)
 488 {
 489         if (!pte_none(*page_table)) {
 490                 printk("put_page: page already exists\n");
 491                 free_page(pte_page(pte));
 492                 return;
 493         }
 494 /* no need for invalidate */
 495         *page_table = pte;
 496 }
 497 
 498 /*
 499  * This routine is used to map in a page into an address space: needed by
 500  * execve() for the initial stack and environment pages.
 501  */
 502 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 503 {
 504         pgd_t * page_dir;
 505         pte_t * page_table;
 506 
 507         if (page >= high_memory)
 508                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 509         if (mem_map[MAP_NR(page)] != 1)
 510                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 511         page_dir = PAGE_DIR_OFFSET(tsk,address);
 512         if (pgd_present(*page_dir)) {
 513                 page_table = (pte_t *) pgd_page(*page_dir);
 514         } else {
 515                 if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL)))
 516                         return 0;
 517                 if (pgd_present(*page_dir)) {
 518                         free_page((unsigned long) page_table);
 519                         page_table = (pte_t *) pgd_page(*page_dir);
 520                 } else {
 521                         pgd_set(page_dir, page_table);
 522                 }
 523         }
 524         page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 525         if (!pte_none(*page_table)) {
 526                 printk("put_dirty_page: page already exists\n");
 527                 pte_clear(page_table);
 528                 invalidate();
 529         }
 530         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)));
 531 /* no need for invalidate */
 532         return page;
 533 }
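
/*
 * Example (not part of this file): a sketch of the execve()-time loop that
 * pushes the saved argument/environment pages into the new image with
 * put_dirty_page().  MAX_ARG_PAGES is real, but the exact loop and stack
 * layout in fs/exec.c differ in detail; this is illustration only.
 */
static void put_arg_pages(struct task_struct * tsk, unsigned long * page, unsigned long top)
{
        unsigned long addr = top - MAX_ARG_PAGES * PAGE_SIZE;
        int i;

        for (i = 0; i < MAX_ARG_PAGES; i++, addr += PAGE_SIZE) {
                if (!page[i])
                        continue;               /* this slot was never used for args */
                tsk->mm->rss++;
                put_dirty_page(tsk, page[i], addr);
        }
}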
 534 
 535 /*
 536  * This routine handles present pages, when users try to write
 537  * to a shared page. It is done by copying the page to a new address
 538  * and decrementing the shared-page counter for the old page.
 539  *
 540  * Goto-purists beware: the only reason for goto's here is that it results
 541  * in better assembly code.. The "default" path will see no jumps at all.
 542  *
 543  * Note that this routine assumes that the protection checks have been
 544  * done by the caller (the low-level page fault routine in most cases).
 545  * Thus we can safely just mark it writable once we've done any necessary
 546  * COW.
 547  *
 548  * We also mark the page dirty at this point even though the page will
 549  * change only once the write actually happens. This avoids a few races,
 550  * and potentially makes it more efficient.
 551  */
 552 void do_wp_page(struct vm_area_struct * vma, unsigned long address,
 553         int write_access)
 554 {
 555         pgd_t *page_dir;
 556         pte_t *page_table, pte;
 557         unsigned long old_page, new_page;
 558 
 559         new_page = __get_free_page(GFP_KERNEL);
 560         page_dir = PAGE_DIR_OFFSET(vma->vm_task,address);
 561         if (pgd_none(*page_dir))
 562                 goto end_wp_page;
 563         if (pgd_bad(*page_dir))
 564                 goto bad_wp_pagetable;
 565         page_table = (pte_t *) pgd_page(*page_dir);
 566         page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 567         pte = *page_table;
 568         if (!pte_present(pte))
 569                 goto end_wp_page;
 570         if (pte_write(pte))
 571                 goto end_wp_page;
 572         old_page = pte_page(pte);
 573         if (old_page >= high_memory)
 574                 goto bad_wp_page;
 575         vma->vm_task->mm->min_flt++;
 576         /*
 577          * Do we need to copy?
 578          */
 579         if (mem_map[MAP_NR(old_page)] != 1) {
 580                 if (new_page) {
 581                         if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
 582                                 ++vma->vm_task->mm->rss;
 583                         copy_page(old_page,new_page);
 584                         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
 585                         free_page(old_page);
 586                         invalidate();
 587                         return;
 588                 }
 589                 free_page(old_page);
 590                 oom(vma->vm_task);
 591                 *page_table = BAD_PAGE;
 592                 invalidate();
 593                 return;
 594         }
 595         *page_table = pte_mkdirty(pte_mkwrite(pte));
 596         invalidate();
 597         if (new_page)
 598                 free_page(new_page);
 599         return;
 600 bad_wp_page:
 601         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 602         *page_table = BAD_PAGE;
 603         send_sig(SIGKILL, vma->vm_task, 1);
 604         goto end_wp_page;
 605 bad_wp_pagetable:
 606         printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 607         pgd_set(page_dir, BAD_PAGETABLE);
 608         send_sig(SIGKILL, vma->vm_task, 1);
 609 end_wp_page:
 610         if (new_page)
 611                 free_page(new_page);
 612         return;
 613 }
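
/*
 * Example (not part of this file): a runnable user-space demonstration of
 * the copy-on-write behaviour that do_wp_page() implements.  After fork()
 * both processes reference the same read-only data page; the child's first
 * write faults, the kernel copies the page, and the parent still sees the
 * original contents.
 */
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

static char buf[64] = "original";

int main(void)
{
        if (fork() == 0) {                      /* child: first write triggers the COW fault */
                strcpy(buf, "child's private copy");
                printf("child : %s\n", buf);
                _exit(0);
        }
        wait(NULL);
        printf("parent: %s\n", buf);            /* still "original" */
        return 0;
}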
 614 
 615 /*
 616  * Ugly, ugly, but the goto's result in better assembly..
 617  */
 618 int verify_area(int type, const void * addr, unsigned long size)
 619 {
 620         struct vm_area_struct * vma;
 621         unsigned long start = (unsigned long) addr;
 622 
 623         /* If the current user space is mapped to kernel space (for the
 624          * case where we use a fake user buffer with get_fs/set_fs()) we
 625          * don't expect to find the address in the user vm map.
 626          */
 627         if (get_fs() == get_ds())
 628                 return 0;
 629 
 630         vma = find_vma(current, start);
 631         if (!vma)
 632                 goto bad_area;
 633         if (vma->vm_start <= start)
 634                 goto good_area;
 635         if (!(vma->vm_flags & VM_GROWSDOWN))
 636                 goto bad_area;
 637         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 638                 goto bad_area;
 639 
 640 good_area:
 641         if (type == VERIFY_WRITE)
 642                 goto check_write;
 643         for (;;) {
 644                 struct vm_area_struct * next;
 645                 if (!(vma->vm_flags & VM_READ))
 646                         goto bad_area;
 647                 if (vma->vm_end - start >= size)
 648                         return 0;
 649                 next = vma->vm_next;
 650                 if (!next || vma->vm_end != next->vm_start)
 651                         goto bad_area;
 652                 vma = next;
 653         }
 654 
 655 check_write:
 656         if (!(vma->vm_flags & VM_WRITE))
 657                 goto bad_area;
 658         if (!wp_works_ok)
 659                 goto check_wp_fault_by_hand;
 660         for (;;) {
 661                 if (vma->vm_end - start >= size)
 662                         break;
 663                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 664                         goto bad_area;
 665                 vma = vma->vm_next;
 666                 if (!(vma->vm_flags & VM_WRITE))
 667                         goto bad_area;
 668         }
 669         return 0;
 670 
 671 check_wp_fault_by_hand:
 672         size--;
 673         size += start & ~PAGE_MASK;
 674         size >>= PAGE_SHIFT;
 675         start &= PAGE_MASK;
 676 
 677         for (;;) {
 678                 do_wp_page(vma, start, 1);
 679                 if (!size)
 680                         break;
 681                 size--;
 682                 start += PAGE_SIZE;
 683                 if (start < vma->vm_end)
 684                         continue;
 685                 vma = vma->vm_next;
 686                 if (!vma || vma->vm_start != start)
 687                         goto bad_area;
 688                 if (!(vma->vm_flags & VM_WRITE))
  689                         goto bad_area;
 690         }
 691         return 0;
 692 
 693 bad_area:
 694         return -EFAULT;
 695 }
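
/*
 * Example (not part of this file): the usual calling pattern for
 * verify_area() in a system call of this era - check the user buffer
 * before touching it with the fs-segment helpers.  VERIFY_WRITE and
 * put_fs_byte() are the real interfaces of this kernel generation;
 * the system call itself is hypothetical.
 */
asmlinkage int sys_fill_zero(char * buf, unsigned long count)
{
        unsigned long i;
        int error;

        error = verify_area(VERIFY_WRITE, buf, count);
        if (error)
                return error;                   /* -EFAULT on a bad range */
        for (i = 0; i < count; i++)
                put_fs_byte(0, buf + i);        /* now known to be safe */
        return count;
}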
 696 
 697 static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
 698 {
 699         unsigned long tmp;
 700 
 701         if (!(tmp = get_free_page(GFP_KERNEL))) {
 702                 oom(vma->vm_task);
 703                 put_page(page_table, BAD_PAGE);
 704                 return;
 705         }
 706         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 707 }
 708 
 709 /*
 710  * try_to_share() checks the page at address "address" in the task "p",
 711  * to see if it exists, and if it is clean. If so, share it with the current
 712  * task.
 713  *
 714  * NOTE! This assumes we have checked that p != current, and that they
 715  * share the same inode and can generally otherwise be shared.
 716  */
 717 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 718         unsigned long from_address, struct vm_area_struct * from_area,
 719         unsigned long newpage)
 720 {
 721         pgd_t * from_dir, * to_dir;
 722         pte_t * from_table, * to_table;
 723         pte_t from, to;
 724 
 725         from_dir = PAGE_DIR_OFFSET(from_area->vm_task,from_address);
 726 /* is there a page-directory at from? */
 727         if (!pgd_present(*from_dir))
 728                 return 0;
 729         from_table = (pte_t *) (pgd_page(*from_dir) + PAGE_PTR(from_address));
 730         from = *from_table;
 731 /* is the page present? */
 732         if (!pte_present(from))
 733                 return 0;
 734 /* if it is dirty it must be from a shared mapping to be shared */
 735         if (pte_dirty(from)) {
 736                 if (!(from_area->vm_flags & VM_SHARED))
 737                         return 0;
 738                 if (pte_write(from)) {
 739                         printk("nonwritable, but dirty, shared page\n");
 740                         return 0;
 741                 }
 742         }
 743 /* is the page reasonable at all? */
 744         if (pte_page(from) >= high_memory)
 745                 return 0;
 746         if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
 747                 return 0;
 748 /* is the destination ok? */
 749         to_dir = PAGE_DIR_OFFSET(to_area->vm_task,to_address);
 750         if (!pgd_present(*to_dir))
 751                 return 0;
 752         to_table = (pte_t *) (pgd_page(*to_dir) + PAGE_PTR(to_address));
 753         to = *to_table;
 754         if (!pte_none(to))
 755                 return 0;
 756 /* do we copy? */
 757         if (newpage) {
 758                 /* if it's in the swap cache, it's dirty by implication */
 759                 /* so we can't use it if it's not from a shared mapping */
 760                 if (in_swap_cache(pte_page(from))) {
 761                         if (!(from_area->vm_flags & VM_SHARED))
 762                                 return 0;
 763                         if (!pte_write(from)) {
 764                                 printk("nonwritable, but dirty, shared page\n");
 765                                 return 0;
 766                         }
 767                 }
 768                 copy_page(pte_page(from), newpage);
 769                 *to_table = mk_pte(newpage, to_area->vm_page_prot);
 770                 return 1;
 771         }
 772 /*
 773  * do a final swap-cache test before sharing them: if it's in the swap
 774  * cache, we have to remove it now, as we get two pointers to the same
 775  * physical page and the cache can't handle it. Mark the original dirty.
 776  *
 777  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 778  * with a dirty "from", the from-mapping is a shared map, so we can trust
 779  * the page contents to be up-to-date
 780  */
 781         if (in_swap_cache(pte_page(from))) {
 782                 if (!(from_area->vm_flags & VM_SHARED))
 783                         return 0;
 784                 *from_table = pte_mkdirty(from);
 785                 delete_from_swap_cache(pte_page(from));
 786         }
 787         mem_map[MAP_NR(pte_page(from))]++;
 788         *to_table = mk_pte(pte_page(from), to_area->vm_page_prot);
 789 /* Check if we need to do anything at all to the 'from' field */
 790         if (!pte_write(from))
 791                 return 1;
 792         if (from_area->vm_flags & VM_SHARED)
 793                 return 1;
 794 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 795         *from_table = pte_wrprotect(from);
 796         invalidate();
 797         return 1;
 798 }
 799 
 800 /*
 801  * share_page() tries to find a process that could share a page with
 802  * the current one.
 803  *
 804  * We first check if it is at all feasible by checking inode->i_count.
 805  * It should be >1 if there are other tasks sharing this inode.
 806  */
 807 static int share_page(struct vm_area_struct * area, unsigned long address,
 808         int write_access, unsigned long newpage)
 809 {
 810         struct inode * inode;
 811         unsigned long offset;
 812         unsigned long from_address;
 813         unsigned long give_page;
 814         struct vm_area_struct * mpnt;
 815 
 816         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 817                 return 0;
 818         /* do we need to copy or can we just share? */
 819         give_page = 0;
 820         if (write_access && !(area->vm_flags & VM_SHARED)) {
 821                 if (!newpage)
 822                         return 0;
 823                 give_page = newpage;
 824         }
 825         offset = address - area->vm_start + area->vm_offset;
 826         /* See if there is something in the VM we can share pages with. */
 827         /* Traverse the entire circular i_mmap list, except `area' itself. */
 828         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 829                 /* must be same inode */
 830                 if (mpnt->vm_inode != inode) {
 831                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 832                         break;  
 833                 }
 834                 /* offsets must be mutually page-aligned */
 835                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 836                         continue;
 837                 /* the other area must actually cover the wanted page.. */
 838                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 839                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 840                         continue;
 841                 /* .. NOW we can actually try to use the same physical page */
 842                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 843                         continue;
 844                 /* free newpage if we never used it.. */
 845                 if (give_page || !newpage)
 846                         return 1;
 847                 free_page(newpage);
 848                 return 1;
 849         }
 850         return 0;
 851 }
 852 
 853 /*
 854  * fill in an empty page-table if none exists.
 855  */
 856 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 857 {
 858         pgd_t *p;
 859         unsigned long page;
 860 
 861         p = PAGE_DIR_OFFSET(tsk,address);
 862         if (pgd_present(*p))
 863                 return (pte_t *) (PAGE_PTR(address) + pgd_page(*p));
 864         if (!pgd_none(*p)) {
 865                 printk("get_empty_pgtable: bad page-directory entry \n");
 866                 pgd_clear(p);
 867         }
 868         page = get_free_page(GFP_KERNEL);
 869         if (pgd_present(*p)) {
 870                 free_page(page);
 871                 return (pte_t *) (PAGE_PTR(address) + pgd_page(*p));
 872         }
 873         if (!pgd_none(*p)) {
 874                 printk("get_empty_pgtable: bad page-directory entry \n");
 875                 pgd_clear(p);
 876         }
 877         if (page) {
 878                 pgd_set(p, (pte_t *) page);
 879                 return (pte_t *) (PAGE_PTR(address) + page);
 880         }
 881         oom(current);
 882         pgd_set(p, BAD_PAGETABLE);
 883         return NULL;
 884 }
 885 
 886 static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address,
 887         pte_t * page_table, pte_t entry, int write_access)
 888 {
 889         pte_t page;
 890 
 891         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 892                 swap_in(vma, page_table, pte_val(entry), write_access);
 893                 return;
 894         }
 895         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 896         if (pte_val(*page_table) != pte_val(entry)) {
 897                 free_page(pte_page(page));
 898                 return;
 899         }
 900         if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
 901                 page = pte_wrprotect(page);
 902         ++vma->vm_task->mm->rss;
 903         ++vma->vm_task->mm->maj_flt;
 904         *page_table = page;
 905         return;
 906 }
 907 
 908 /*
 909  * do_no_page() tries to create a new page mapping. It aggressively
 910  * tries to share with existing pages, but makes a separate copy if
 911  * the "write_access" parameter is true in order to avoid the next
 912  * page fault.
 913  */
 914 void do_no_page(struct vm_area_struct * vma, unsigned long address,
 915         int write_access)
 916 {
 917         pte_t * page_table;
 918         pte_t entry;
 919         unsigned long page;
 920 
 921         page_table = get_empty_pgtable(vma->vm_task,address);
 922         if (!page_table)
 923                 return;
 924         entry = *page_table;
 925         if (pte_present(entry))
 926                 return;
 927         if (!pte_none(entry)) {
 928                 do_swap_page(vma, address, page_table, entry, write_access);
 929                 return;
 930         }
 931         address &= PAGE_MASK;
 932 
 933         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 934                 ++vma->vm_task->mm->rss;
 935                 ++vma->vm_task->mm->min_flt;
 936                 get_empty_page(vma, page_table);
 937                 return;
 938         }
 939         page = get_free_page(GFP_KERNEL);
 940         if (share_page(vma, address, write_access, page)) {
 941                 ++vma->vm_task->mm->min_flt;
 942                 ++vma->vm_task->mm->rss;
 943                 return;
 944         }
 945         if (!page) {
 946                 oom(current);
 947                 put_page(page_table, BAD_PAGE);
 948                 return;
 949         }
 950         ++vma->vm_task->mm->maj_flt;
 951         ++vma->vm_task->mm->rss;
 952         /*
 953          * The fourth argument is "no_share", which tells the low-level code
 954          * to copy, not share the page even if sharing is possible.  It's
 955          * essentially an early COW detection 
 956          */
 957         page = vma->vm_ops->nopage(vma, address, page,
 958                 write_access && !(vma->vm_flags & VM_SHARED));
 959         if (share_page(vma, address, write_access, 0)) {
 960                 free_page(page);
 961                 return;
 962         }
 963         /*
 964          * This silly early PAGE_DIRTY setting removes a race
 965          * due to the bad i386 page protection. But it's valid
 966          * for other architectures too.
 967          *
 968          * Note that if write_access is true, we either now have
  969  * an exclusive copy of the page, or this is a shared mapping,
 970          * so we can make it writable and dirty to avoid having to
 971          * handle that later.
 972          */
 973         entry = mk_pte(page, vma->vm_page_prot);
 974         if (write_access) {
 975                 entry = pte_mkwrite(pte_mkdirty(entry));
 976         } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
 977                 entry = pte_wrprotect(entry);
 978         put_page(page_table, entry);
 979 }
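
/*
 * Example (not part of this file): a condensed sketch of how the i386
 * fault handler of this era hands off to the two entry points above.
 * Bit 0 of the hardware error code means "page was present" (a protection
 * fault, hence copy-on-write), bit 1 means "it was a write".  The real
 * arch/i386/mm/fault.c also looks up the vma, grows the stack and checks
 * protections first; all of that is elided here.
 */
static void handle_fault(struct vm_area_struct * vma, unsigned long address,
        unsigned long error_code)
{
        if (error_code & 1)
                do_wp_page(vma, address, error_code & 2);       /* present: write-protect fault */
        else
                do_no_page(vma, address, error_code & 2);       /* not present: demand-load or swap-in */
}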
