linux/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. oom
  2. free_one_pte
  3. free_one_table
  4. clear_page_tables
  5. free_page_tables
  6. clone_page_tables
  7. copy_page_tables
  8. unmap_page_range
  9. zeromap_page_range
  10. remap_page_range
  11. put_page
  12. put_dirty_page
  13. do_wp_page
  14. verify_area
  15. get_empty_page
  16. try_to_share
  17. share_page
  18. get_empty_pgtable
  19. do_swap_page
  20. do_no_page

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
   13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/config.h>
  37 #include <linux/signal.h>
  38 #include <linux/sched.h>
  39 #include <linux/head.h>
  40 #include <linux/kernel.h>
  41 #include <linux/errno.h>
  42 #include <linux/string.h>
  43 #include <linux/types.h>
  44 #include <linux/ptrace.h>
  45 #include <linux/mman.h>
  46 #include <linux/mm.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 
  52 unsigned long high_memory = 0;
  53 
  54 /*
  55  * The free_area_list arrays point to the queue heads of the free areas
  56  * of different sizes
  57  */
  58 int nr_swap_pages = 0;
  59 int nr_free_pages = 0;
  60 struct mem_list free_area_list[NR_MEM_LISTS];
  61 unsigned char * free_area_map[NR_MEM_LISTS];
  62 
  63 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  64 
  65 mem_map_t * mem_map = NULL;
  66 
  67 #define CODE_SPACE(addr,p) ((addr) < (p)->end_code)
  68 
  69 /*
  70  * oom() prints a message (so that the user knows why the process died),
  71  * and gives the process an untrappable SIGKILL.
  72  */
  73 void oom(struct task_struct * task)
  74 {
  75         printk("\nOut of memory for %s.\n", current->comm);
  76         task->sigaction[SIGKILL-1].sa_handler = NULL;
  77         task->blocked &= ~(1<<(SIGKILL-1));
  78         send_sig(SIGKILL,task,1);
  79 }
  80 
  81 static inline void free_one_pte(pte_t * page_table)
  82 {
  83         pte_t page = *page_table;
  84 
  85         if (pte_none(page))
  86                 return;
  87         pte_clear(page_table);
  88         if (!pte_present(page)) {
  89                 swap_free(pte_val(page));
  90                 return;
  91         }
  92         free_page(pte_page(page));
  93         return;
  94 }
  95 
  96 static void free_one_table(pgd_t * page_dir)
  97 {
  98         int j;
  99         pgd_t pg_table = *page_dir;
 100         pte_t * page_table;
 101         unsigned long page;
 102 
 103         if (pgd_none(pg_table))
 104                 return;
 105         pgd_clear(page_dir);
 106         if (pgd_bad(pg_table)) {
 107                 printk("Bad page table: [%p]=%08lx\n",page_dir,pgd_val(pg_table));
 108                 return;
 109         }
 110         page = pgd_page(pg_table);
 111         if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
 112                 return;
 113         page_table = (pte_t *) page;
 114         for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++)
 115                 free_one_pte(page_table);
 116         free_page(page);
 117 }
 118 
 119 /*
 120  * This function clears all user-level page tables of a process - this
 121  * is needed by execve(), so that old pages aren't in the way. Note that
 122  * unlike 'free_page_tables()', this function still leaves a valid
 123  * page-table-tree in memory: it just removes the user pages. The two
 124  * functions are similar, but there is a fundamental difference.
 125  */
 126 void clear_page_tables(struct task_struct * tsk)
 127 {
 128         int i;
 129         pgd_t * page_dir;
 130 
 131         if (!tsk)
 132                 return;
 133         if (tsk == task[0])
 134                 panic("task[0] (swapper) doesn't support exec()\n");
 135         page_dir = PAGE_DIR_OFFSET(tsk, 0);
 136         if (!page_dir || page_dir == swapper_pg_dir) {
 137                 printk("Trying to clear kernel page-directory: not good\n");
 138                 return;
 139         }
 140         if (mem_map[MAP_NR((unsigned long) page_dir)] > 1) {
 141                 pgd_t * new_pg;
 142 
 143                 if (!(new_pg = (pgd_t *) get_free_page(GFP_KERNEL))) {
 144                         oom(tsk);
 145                         return;
 146                 }
 147                 for (i = 768 ; i < 1024 ; i++)
 148                         new_pg[i] = page_dir[i];
 149                 free_page((unsigned long) page_dir);
 150                 SET_PAGE_DIR(tsk, new_pg);
 151                 return;
 152         }
 153         for (i = 0 ; i < 768 ; i++,page_dir++)
 154                 free_one_table(page_dir);
 155         invalidate();
 156         return;
 157 }
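
The 0..767 / 768..1023 split above reflects the two-level i386 paging layout this kernel assumes: with 4kB pages, each of the 1024 page-directory entries maps 4MB of the linear address space, so the first 768 entries cover the user's 3GB while the last 256 hold the kernel mappings that every process shares - which is why exec() carries entries 768-1023 over unchanged and frees only the user part. A minimal sketch of the index arithmetic (illustrative helper, not part of memory.c):

static inline int pgd_index_of(unsigned long linear_address)
{
	return linear_address >> 22;	/* each directory entry covers 4MB */
}
/* pgd_index_of(0xbfffffff) == 767 (last user entry),
   pgd_index_of(0xc0000000) == 768 (first kernel entry) */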
 158 
 159 /*
 160  * This function frees up all page tables of a process when it exits.
 161  */
 162 void free_page_tables(struct task_struct * tsk)
 163 {
 164         int i;
 165         pgd_t * page_dir;
 166 
 167         if (!tsk)
 168                 return;
 169         if (tsk == task[0]) {
 170                 printk("task[0] (swapper) killed: unable to recover\n");
 171                 panic("Trying to free up swapper memory space");
 172         }
 173         page_dir = PAGE_DIR_OFFSET(tsk, 0);
 174         if (!page_dir || page_dir == swapper_pg_dir) {
 175                 printk("Trying to free kernel page-directory: not good\n");
 176                 return;
 177         }
 178         SET_PAGE_DIR(tsk, swapper_pg_dir);
 179         if (mem_map[MAP_NR((unsigned long) page_dir)] > 1) {
 180                 free_page((unsigned long) page_dir);
 181                 return;
 182         }
 183         for (i = 0 ; i < PTRS_PER_PAGE ; i++)
 184                 free_one_table(page_dir + i);
 185         free_page((unsigned long) page_dir);
 186         invalidate();
 187 }
 188 
 189 /*
 190  * clone_page_tables() clones the page table for a process - both
 191  * processes will have the exact same pages in memory. There are
 192  * probably races in the memory management with cloning, but we'll
 193  * see..
 194  */
 195 int clone_page_tables(struct task_struct * tsk)
 196 {
 197         pgd_t * pg_dir;
 198 
 199         pg_dir = PAGE_DIR_OFFSET(current, 0);
 200         mem_map[MAP_NR((unsigned long) pg_dir)]++;
 201         SET_PAGE_DIR(tsk, pg_dir);
 202         return 0;
 203 }
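
Because clone_page_tables() shares the whole page directory purely by reference count, the exec and exit paths above (clear_page_tables, free_page_tables) can tell a shared directory from a private one just by looking at that count. A small sketch of the test both of them use, assuming this kernel's mem_map/MAP_NR conventions (illustrative only, not part of memory.c):

static inline int page_dir_is_shared(pgd_t * page_dir)
{
	return mem_map[MAP_NR((unsigned long) page_dir)] > 1;
}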
 204 
 205 /*
 206  * copy_page_tables() just copies the whole process memory range:
 207  * note the special handling of RESERVED (ie kernel) pages, which
 208  * means that they are always shared by all processes.
 209  */
 210 int copy_page_tables(struct task_struct * tsk)
 211 {
 212         int i;
 213         pgd_t *old_page_dir;
 214         pgd_t *new_page_dir;
 215 
 216         new_page_dir = (pgd_t *) get_free_page(GFP_KERNEL);
 217         if (!new_page_dir)
 218                 return -ENOMEM;
 219         old_page_dir = PAGE_DIR_OFFSET(current, 0);
 220         SET_PAGE_DIR(tsk, new_page_dir);
 221         for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) {
 222                 int j;
 223                 pgd_t old_pg_table;
 224                 pte_t *old_page_table, *new_page_table;
 225 
 226                 old_pg_table = *old_page_dir;
 227                 if (pgd_none(old_pg_table))
 228                         continue;
 229                 if (pgd_bad(old_pg_table)) {
 230                         printk("copy_page_tables: bad page table: "
 231                                 "probable memory corruption\n");
 232                         pgd_clear(old_page_dir);
 233                         continue;
 234                 }
 235                 if (mem_map[MAP_NR(pgd_page(old_pg_table))] & MAP_PAGE_RESERVED) {
 236                         *new_page_dir = old_pg_table;
 237                         continue;
 238                 }
 239                 if (!(new_page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 240                         free_page_tables(tsk);
 241                         return -ENOMEM;
 242                 }
 243                 old_page_table = (pte_t *) pgd_page(old_pg_table);
 244                 pgd_set(new_page_dir, new_page_table);
 245                 for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) {
 246                         pte_t pte = *old_page_table;
 247                         if (pte_none(pte))
 248                                 continue;
 249                         if (!pte_present(pte)) {
 250                                 swap_duplicate(pte_val(pte));
 251                                 *new_page_table = pte;
 252                                 continue;
 253                         }
 254                         if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
 255                                 *new_page_table = pte;
 256                                 continue;
 257                         }
 258                         if (pte_cow(pte))
 259                                 pte = pte_wrprotect(pte);
 260                         if (delete_from_swap_cache(pte_page(pte)))
 261                                 pte = pte_mkdirty(pte);
 262                         *new_page_table = pte;
 263                         *old_page_table = pte;
 264                         mem_map[MAP_NR(pte_page(pte))]++;
 265                 }
 266         }
 267         invalidate();
 268         return 0;
 269 }
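
The interesting case in the inner loop above is a private, writable page: rather than copying it at fork time, copy_page_tables() write-protects the entry in both parent and child, bumps the page's reference count, and lets the first write by either process fault into do_wp_page(), which performs the actual copy. A minimal sketch of the per-entry rule, using this kernel's pte helpers (illustrative only, not part of memory.c):

static inline pte_t cow_protect(pte_t pte)
{
	if (pte_cow(pte))			/* entry is marked copy-on-write */
		pte = pte_wrprotect(pte);	/* share it read-only for now */
	return pte;
}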
 270 
 271 /*
 272  * a more complete version of free_page_tables which performs with page
 273  * granularity.
 274  */
 275 int unmap_page_range(unsigned long from, unsigned long size)
 276 {
 277         pgd_t page_dir, * dir;
 278         pte_t page, * page_table;
 279         unsigned long poff, pcnt, pc;
 280 
 281         if (from & ~PAGE_MASK) {
 282                 printk("unmap_page_range called with wrong alignment\n");
 283                 return -EINVAL;
 284         }
 285         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 286         dir = PAGE_DIR_OFFSET(current,from);
 287         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 288         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 289                 pcnt = size;
 290 
 291         for ( ; size > 0; ++dir, size -= pcnt,
 292              pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size)) {
 293                 page_dir = *dir;
 294                 if (pgd_none(page_dir)) {
 295                         poff = 0;
 296                         continue;
 297                 }
 298                 if (pgd_bad(page_dir)) {
 299                         printk("unmap_page_range: bad page directory.");
 300                         continue;
 301                 }
 302                 page_table = (pte_t *) pgd_page(page_dir);
 303                 if (poff) {
 304                         page_table += poff;
 305                         poff = 0;
 306                 }
 307                 for (pc = pcnt; pc--; page_table++) {
 308                         page = *page_table;
 309                         if (!pte_none(page)) {
 310                                 pte_clear(page_table);
 311                                 if (pte_present(page)) {
 312                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 313                                                 if (current->mm->rss > 0)
 314                                                         --current->mm->rss;
 315                                         free_page(pte_page(page));
 316                                 } else
 317                                         swap_free(pte_val(page));
 318                         }
 319                 }
 320                 if (pcnt == PTRS_PER_PAGE) {
 321                         pgd_clear(dir);
 322                         free_page(pgd_page(page_dir));
 323                 }
 324         }
 325         invalidate();
 326         return 0;
 327 }
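
The "(size + ~PAGE_MASK) >> PAGE_SHIFT" idiom used here (and again in zeromap_page_range and remap_page_range below) simply converts a byte count into a page count, rounding up, since ~PAGE_MASK equals PAGE_SIZE-1. A tiny equivalent, for illustration only:

static inline unsigned long bytes_to_pages(unsigned long bytes)
{
	return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}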
 328 
 329 int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot)
 330 {
 331         pgd_t * dir;
 332         pte_t * page_table;
 333         unsigned long poff, pcnt;
 334         pte_t zero_pte;
 335 
 336         if (from & ~PAGE_MASK) {
 337                 printk("zeromap_page_range: from = %08lx\n",from);
 338                 return -EINVAL;
 339         }
 340         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 341         dir = PAGE_DIR_OFFSET(current,from);
 342         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 343         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 344         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 345                 pcnt = size;
 346 
 347         while (size > 0) {
 348                 if (!pgd_present(*dir)) {
 349                         if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 350                                 invalidate();
 351                                 return -ENOMEM;
 352                         }
 353                         if (pgd_present(*dir)) {
 354                                 free_page((unsigned long) page_table);
 355                                 page_table = (pte_t *) pgd_page(*dir);
 356                         } else
 357                                 pgd_set(dir, page_table);
 358                 } else
 359                         page_table = (pte_t *) pgd_page(*dir);
 360                 dir++;
 361                 page_table += poff;
 362                 poff = 0;
 363                 for (size -= pcnt; pcnt-- ;) {
 364                         pte_t page = *page_table;
 365                         if (!pte_none(page)) {
 366                                 pte_clear(page_table);
 367                                 if (pte_present(page)) {
 368                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 369                                                 if (current->mm->rss > 0)
 370                                                         --current->mm->rss;
 371                                         free_page(pte_page(page));
 372                                 } else
 373                                         swap_free(pte_val(page));
 374                         }
 375                         *page_table++ = zero_pte;
 376                 }
 377                 pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
 378         }
 379         invalidate();
 380         return 0;
 381 }
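
zeromap_page_range() maps every page in the range to the single shared ZERO_PAGE, write-protected, so the first write to any of them takes a protection fault and gets a private copy. Note also the pattern used whenever a page table is missing, here and in remap_page_range and put_dirty_page below: get_free_page(GFP_KERNEL) may sleep, so by the time it returns another context may already have installed a page table in the same slot; the code re-tests pgd_present() and, if it lost the race, frees its freshly allocated page and uses the existing table. A condensed sketch of that pattern under the same assumptions (illustrative only, not part of memory.c):

static pte_t * pte_alloc_sketch(pgd_t * dir)
{
	pte_t * page_table;

	if (pgd_present(*dir))			/* fast path: table already there */
		return (pte_t *) pgd_page(*dir);
	page_table = (pte_t *) get_free_page(GFP_KERNEL);	/* may sleep */
	if (!page_table)
		return NULL;
	if (pgd_present(*dir)) {		/* somebody beat us to it */
		free_page((unsigned long) page_table);
		return (pte_t *) pgd_page(*dir);
	}
	pgd_set(dir, page_table);
	return page_table;
}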
 382 
 383 /*
 384  * maps a range of physical memory into the requested pages. the old
  385  * mappings are removed. any references to nonexistent pages result
 386  * in null mappings (currently treated as "copy-on-access")
 387  */
 388 int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot)
 389 {
 390         pgd_t * dir;
 391         pte_t * page_table;
 392         unsigned long poff, pcnt;
 393 
 394         if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) {
 395                 printk("remap_page_range: from = %08lx, to=%08lx\n",from,to);
 396                 return -EINVAL;
 397         }
 398         dir = PAGE_DIR_OFFSET(current,from);
 399         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 400         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 401         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 402                 pcnt = size;
 403 
 404         while (size > 0) {
 405                 if (!pgd_present(*dir)) {
 406                         if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 407                                 invalidate();
 408                                 return -1;
 409                         }
 410                         if (pgd_present(*dir)) {
 411                                 free_page((unsigned long) page_table);
 412                                 page_table = (pte_t *) pgd_page(*dir);
 413                         } else
 414                                 pgd_set(dir, page_table);
 415                 } else
 416                         page_table = (pte_t *) pgd_page(*dir);
 417                 dir++;
 418                 page_table += poff;
 419                 poff = 0;
 420 
 421                 for (size -= pcnt; pcnt-- ;) {
 422                         pte_t page = *page_table;
 423                         if (!pte_none(page)) {
 424                                 pte_clear(page_table);
 425                                 if (pte_present(page)) {
 426                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 427                                                 if (current->mm->rss > 0)
 428                                                         --current->mm->rss;
 429                                         free_page(pte_page(page));
 430                                 } else
 431                                         swap_free(pte_val(page));
 432                         }
 433                         if (to >= high_memory)
 434                                 *page_table = mk_pte(to, prot);
 435                         else if (mem_map[MAP_NR(to)]) {
 436                                 *page_table = mk_pte(to, prot);
 437                                 if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) {
 438                                         ++current->mm->rss;
 439                                         mem_map[MAP_NR(to)]++;
 440                                 }
 441                         }
 442                         page_table++;
 443                         to += PAGE_SIZE;
 444                 }
 445                 pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
 446         }
 447         invalidate();
 448         return 0;
 449 }
 450 
 451 /*
 452  * sanity-check function..
 453  */
 454 static void put_page(pte_t * page_table, pte_t pte)
 455 {
 456         if (!pte_none(*page_table)) {
 457                 printk("put_page: page already exists\n");
 458                 free_page(pte_page(pte));
 459                 return;
 460         }
 461 /* no need for invalidate */
 462         *page_table = pte;
 463 }
 464 
 465 /*
 466  * This routine is used to map in a page into an address space: needed by
 467  * execve() for the initial stack and environment pages.
 468  */
 469 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 470 {
 471         pgd_t * page_dir;
 472         pte_t * page_table;
 473 
 474         if (page >= high_memory)
 475                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 476         if (mem_map[MAP_NR(page)] != 1)
 477                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 478         page_dir = PAGE_DIR_OFFSET(tsk,address);
 479         if (pgd_present(*page_dir)) {
 480                 page_table = (pte_t *) pgd_page(*page_dir);
 481         } else {
 482                 if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL)))
 483                         return 0;
 484                 if (pgd_present(*page_dir)) {
 485                         free_page((unsigned long) page_table);
 486                         page_table = (pte_t *) pgd_page(*page_dir);
 487                 } else {
 488                         pgd_set(page_dir, page_table);
 489                 }
 490         }
 491         page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 492         if (!pte_none(*page_table)) {
 493                 printk("put_dirty_page: page already exists\n");
 494                 pte_clear(page_table);
 495                 invalidate();
 496         }
 497         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)));
 498 /* no need for invalidate */
 499         return page;
 500 }
 501 
 502 /*
 503  * This routine handles present pages, when users try to write
 504  * to a shared page. It is done by copying the page to a new address
 505  * and decrementing the shared-page counter for the old page.
 506  *
 507  * Goto-purists beware: the only reason for goto's here is that it results
 508  * in better assembly code.. The "default" path will see no jumps at all.
 509  *
 510  * Note that this routine assumes that the protection checks have been
 511  * done by the caller (the low-level page fault routine in most cases).
 512  * Thus we can safely just mark it writable once we've done any necessary
 513  * COW.
 514  *
 515  * We also mark the page dirty at this point even though the page will
 516  * change only once the write actually happens. This avoids a few races,
 517  * and potentially makes it more efficient.
 518  */
 519 void do_wp_page(struct vm_area_struct * vma, unsigned long address,
 520         int write_access)
 521 {
 522         pgd_t *page_dir;
 523         pte_t *page_table, pte;
 524         unsigned long old_page, new_page;
 525 
 526         new_page = __get_free_page(GFP_KERNEL);
 527         page_dir = PAGE_DIR_OFFSET(vma->vm_task,address);
 528         if (pgd_none(*page_dir))
 529                 goto end_wp_page;
 530         if (pgd_bad(*page_dir))
 531                 goto bad_wp_pagetable;
 532         page_table = (pte_t *) pgd_page(*page_dir);
 533         page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 534         pte = *page_table;
 535         if (!pte_present(pte))
 536                 goto end_wp_page;
 537         if (pte_write(pte))
 538                 goto end_wp_page;
 539         old_page = pte_page(pte);
 540         if (old_page >= high_memory)
 541                 goto bad_wp_page;
 542         vma->vm_task->mm->min_flt++;
 543         /*
 544          * Do we need to copy?
 545          */
 546         if (mem_map[MAP_NR(old_page)] != 1) {
 547                 if (new_page) {
 548                         if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
 549                                 ++vma->vm_task->mm->rss;
 550                         copy_page(old_page,new_page);
 551                         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
 552                         free_page(old_page);
 553                         invalidate();
 554                         return;
 555                 }
 556                 free_page(old_page);
 557                 oom(vma->vm_task);
 558                 *page_table = BAD_PAGE;
 559                 invalidate();
 560                 return;
 561         }
 562         *page_table = pte_mkdirty(pte_mkwrite(pte));
 563         invalidate();
 564         if (new_page)
 565                 free_page(new_page);
 566         return;
 567 bad_wp_page:
 568         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 569         *page_table = BAD_PAGE;
 570         send_sig(SIGKILL, vma->vm_task, 1);
 571         goto end_wp_page;
 572 bad_wp_pagetable:
 573         printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 574         pgd_set(page_dir, BAD_PAGETABLE);
 575         send_sig(SIGKILL, vma->vm_task, 1);
 576 end_wp_page:
 577         if (new_page)
 578                 free_page(new_page);
 579         return;
 580 }
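
The heart of do_wp_page() is the reference-count test: if mem_map shows the faulting page is mapped only once, nobody else can see it, so it is made writable and dirty in place; otherwise the page is copied into new_page and one reference to the original is dropped. A compressed sketch of that decision, using the same helpers as the function above (illustrative only, not part of memory.c):

static void wp_decision_sketch(struct vm_area_struct * vma, pte_t * page_table,
	unsigned long old_page, unsigned long new_page)
{
	if (mem_map[MAP_NR(old_page)] == 1) {
		/* sole user of the page: just unprotect it in place */
		*page_table = pte_mkdirty(pte_mkwrite(*page_table));
	} else {
		/* shared: give the writing process its own copy, drop one reference */
		copy_page(old_page, new_page);
		*page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
		free_page(old_page);
	}
	invalidate();
}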
 581 
 582 /*
 583  * Ugly, ugly, but the goto's result in better assembly..
 584  */
 585 int verify_area(int type, const void * addr, unsigned long size)
 586 {
 587         struct vm_area_struct * vma;
 588         unsigned long start = (unsigned long) addr;
 589 
 590         /* If the current user space is mapped to kernel space (for the
 591          * case where we use a fake user buffer with get_fs/set_fs()) we
 592          * don't expect to find the address in the user vm map.
 593          */
 594         if (get_fs() == get_ds())
 595                 return 0;
 596 
 597         vma = find_vma(current, start);
 598         if (!vma)
 599                 goto bad_area;
 600         if (vma->vm_start <= start)
 601                 goto good_area;
 602         if (!(vma->vm_flags & VM_GROWSDOWN))
 603                 goto bad_area;
 604         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 605                 goto bad_area;
 606 
 607 good_area:
 608         if (type == VERIFY_WRITE)
 609                 goto check_write;
 610         for (;;) {
 611                 struct vm_area_struct * next;
 612                 if (!(vma->vm_flags & VM_READ))
 613                         goto bad_area;
 614                 if (vma->vm_end - start >= size)
 615                         return 0;
 616                 next = vma->vm_next;
 617                 if (!next || vma->vm_end != next->vm_start)
 618                         goto bad_area;
 619                 vma = next;
 620         }
 621 
 622 check_write:
 623         if (!(vma->vm_flags & VM_WRITE))
 624                 goto bad_area;
 625         if (!wp_works_ok)
 626                 goto check_wp_fault_by_hand;
 627         for (;;) {
 628                 if (vma->vm_end - start >= size)
 629                         break;
 630                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 631                         goto bad_area;
 632                 vma = vma->vm_next;
 633                 if (!(vma->vm_flags & VM_WRITE))
 634                         goto bad_area;
 635         }
 636         return 0;
 637 
 638 check_wp_fault_by_hand:
 639         size--;
 640         size += start & ~PAGE_MASK;
 641         size >>= PAGE_SHIFT;
 642         start &= PAGE_MASK;
 643 
 644         for (;;) {
 645                 do_wp_page(vma, start, 1);
 646                 if (!size)
 647                         break;
 648                 size--;
 649                 start += PAGE_SIZE;
 650                 if (start < vma->vm_end)
 651                         continue;
 652                 vma = vma->vm_next;
 653                 if (!vma || vma->vm_start != start)
 654                         goto bad_area;
 655                 if (!(vma->vm_flags & VM_WRITE))
  656                         goto bad_area;
 657         }
 658         return 0;
 659 
 660 bad_area:
 661         return -EFAULT;
 662 }
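
The check_wp_fault_by_hand path exists because the original 80386 ignores the page-table write-protect bit while running in supervisor mode: a kernel write into a copy-on-write user page would silently modify the shared page instead of faulting. When wp_works_ok is false, verify_area() therefore simulates the faults itself by calling do_wp_page() on every page of the buffer before the kernel touches it. A hypothetical caller, showing how the check is meant to be used (buf, count and kernel_buffer are illustrative names, not from this file):

	int error = verify_area(VERIFY_WRITE, buf, count);
	if (error)
		return error;
	memcpy_tofs(buf, kernel_buffer, count);	/* now safe to write to user space */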
 663 
 664 static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
 665 {
 666         unsigned long tmp;
 667 
 668         if (!(tmp = get_free_page(GFP_KERNEL))) {
 669                 oom(vma->vm_task);
 670                 put_page(page_table, BAD_PAGE);
 671                 return;
 672         }
 673         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 674 }
 675 
 676 /*
 677  * try_to_share() checks the page at address "address" in the task "p",
 678  * to see if it exists, and if it is clean. If so, share it with the current
 679  * task.
 680  *
 681  * NOTE! This assumes we have checked that p != current, and that they
 682  * share the same inode and can generally otherwise be shared.
 683  */
 684 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 685         unsigned long from_address, struct vm_area_struct * from_area,
 686         unsigned long newpage)
 687 {
 688         pgd_t * from_dir, * to_dir;
 689         pte_t * from_table, * to_table;
 690         pte_t from, to;
 691 
 692         from_dir = PAGE_DIR_OFFSET(from_area->vm_task,from_address);
 693 /* is there a page-directory at from? */
 694         if (!pgd_present(*from_dir))
 695                 return 0;
 696         from_table = (pte_t *) (pgd_page(*from_dir) + PAGE_PTR(from_address));
 697         from = *from_table;
 698 /* is the page present? */
 699         if (!pte_present(from))
 700                 return 0;
 701 /* if it is dirty it must be from a shared mapping to be shared */
 702         if (pte_dirty(from)) {
 703                 if (!(from_area->vm_flags & VM_SHARED))
 704                         return 0;
 705                 if (pte_write(from)) {
 706                         printk("nonwritable, but dirty, shared page\n");
 707                         return 0;
 708                 }
 709         }
 710 /* is the page reasonable at all? */
 711         if (pte_page(from) >= high_memory)
 712                 return 0;
 713         if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
 714                 return 0;
 715 /* is the destination ok? */
 716         to_dir = PAGE_DIR_OFFSET(to_area->vm_task,to_address);
 717         if (!pgd_present(*to_dir))
 718                 return 0;
 719         to_table = (pte_t *) (pgd_page(*to_dir) + PAGE_PTR(to_address));
 720         to = *to_table;
 721         if (!pte_none(to))
 722                 return 0;
 723 /* do we copy? */
 724         if (newpage) {
 725                 /* if it's in the swap cache, it's dirty by implication */
 726                 /* so we can't use it if it's not from a shared mapping */
 727                 if (in_swap_cache(pte_page(from))) {
 728                         if (!(from_area->vm_flags & VM_SHARED))
 729                                 return 0;
 730                         if (!pte_write(from)) {
 731                                 printk("nonwritable, but dirty, shared page\n");
 732                                 return 0;
 733                         }
 734                 }
 735                 copy_page(pte_page(from), newpage);
 736                 *to_table = mk_pte(newpage, to_area->vm_page_prot);
 737                 return 1;
 738         }
 739 /*
 740  * do a final swap-cache test before sharing them: if it's in the swap
 741  * cache, we have to remove it now, as we get two pointers to the same
 742  * physical page and the cache can't handle it. Mark the original dirty.
 743  *
 744  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 745  * with a dirty "from", the from-mapping is a shared map, so we can trust
 746  * the page contents to be up-to-date
 747  */
 748         if (in_swap_cache(pte_page(from))) {
 749                 if (!(from_area->vm_flags & VM_SHARED))
 750                         return 0;
 751                 *from_table = pte_mkdirty(from);
 752                 delete_from_swap_cache(pte_page(from));
 753         }
 754         mem_map[MAP_NR(pte_page(from))]++;
 755         *to_table = mk_pte(pte_page(from), to_area->vm_page_prot);
 756 /* Check if we need to do anything at all to the 'from' field */
 757         if (!pte_write(from))
 758                 return 1;
 759         if (from_area->vm_flags & VM_SHARED)
 760                 return 1;
 761 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 762         *from_table = pte_wrprotect(from);
 763         invalidate();
 764         return 1;
 765 }
 766 
 767 /*
 768  * share_page() tries to find a process that could share a page with
 769  * the current one.
 770  *
 771  * We first check if it is at all feasible by checking inode->i_count.
 772  * It should be >1 if there are other tasks sharing this inode.
 773  */
 774 static int share_page(struct vm_area_struct * area, unsigned long address,
 775         int write_access, unsigned long newpage)
 776 {
 777         struct inode * inode;
 778         unsigned long offset;
 779         unsigned long from_address;
 780         unsigned long give_page;
 781         struct vm_area_struct * mpnt;
 782 
 783         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 784                 return 0;
 785         /* do we need to copy or can we just share? */
 786         give_page = 0;
 787         if (write_access && !(area->vm_flags & VM_SHARED)) {
 788                 if (!newpage)
 789                         return 0;
 790                 give_page = newpage;
 791         }
 792         offset = address - area->vm_start + area->vm_offset;
 793         /* See if there is something in the VM we can share pages with. */
 794         /* Traverse the entire circular i_mmap list, except `area' itself. */
 795         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 796                 /* must be same inode */
 797                 if (mpnt->vm_inode != inode) {
 798                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 799                         break;  
 800                 }
 801                 /* offsets must be mutually page-aligned */
 802                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 803                         continue;
 804                 /* the other area must actually cover the wanted page.. */
 805                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 806                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 807                         continue;
 808                 /* .. NOW we can actually try to use the same physical page */
 809                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 810                         continue;
 811                 /* free newpage if we never used it.. */
 812                 if (give_page || !newpage)
 813                         return 1;
 814                 free_page(newpage);
 815                 return 1;
 816         }
 817         return 0;
 818 }
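
share_page() is what lets several processes running the same demand-loaded binary keep a single physical copy of each clean page: all vm_area_structs mapping the same inode sit on a circular i_mmap ring, and the page is looked for at whatever address corresponds to the same file offset in each of the other mappings. A small sketch of that address translation, mirroring the arithmetic above (illustrative only, not part of memory.c):

static inline unsigned long same_offset_address(unsigned long addr,
	struct vm_area_struct * here, struct vm_area_struct * other)
{
	unsigned long file_offset = addr - here->vm_start + here->vm_offset;
	return file_offset + other->vm_start - other->vm_offset;
}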
 819 
 820 /*
 821  * fill in an empty page-table if none exists.
 822  */
 823 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 824 {
 825         pgd_t *p;
 826         unsigned long page;
 827 
 828         p = PAGE_DIR_OFFSET(tsk,address);
 829         if (pgd_present(*p))
 830                 return (pte_t *) (PAGE_PTR(address) + pgd_page(*p));
 831         if (!pgd_none(*p)) {
 832                 printk("get_empty_pgtable: bad page-directory entry \n");
 833                 pgd_clear(p);
 834         }
 835         page = get_free_page(GFP_KERNEL);
 836         if (pgd_present(*p)) {
 837                 free_page(page);
 838                 return (pte_t *) (PAGE_PTR(address) + pgd_page(*p));
 839         }
 840         if (!pgd_none(*p)) {
 841                 printk("get_empty_pgtable: bad page-directory entry \n");
 842                 pgd_clear(p);
 843         }
 844         if (page) {
 845                 pgd_set(p, (pte_t *) page);
 846                 return (pte_t *) (PAGE_PTR(address) + page);
 847         }
 848         oom(current);
 849         pgd_set(p, BAD_PAGETABLE);
 850         return NULL;
 851 }
 852 
 853 static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address,
 854         pte_t * page_table, pte_t entry, int write_access)
 855 {
 856         pte_t page;
 857 
 858         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 859                 swap_in(vma, page_table, pte_val(entry), write_access);
 860                 return;
 861         }
 862         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 863         if (pte_val(*page_table) != pte_val(entry)) {
 864                 free_page(pte_page(page));
 865                 return;
 866         }
 867         if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
 868                 page = pte_wrprotect(page);
 869         ++vma->vm_task->mm->rss;
 870         ++vma->vm_task->mm->maj_flt;
 871         *page_table = page;
 872         return;
 873 }
 874 
 875 /*
 876  * do_no_page() tries to create a new page mapping. It aggressively
 877  * tries to share with existing pages, but makes a separate copy if
 878  * the "write_access" parameter is true in order to avoid the next
 879  * page fault.
 880  */
 881 void do_no_page(struct vm_area_struct * vma, unsigned long address,
 882         int write_access)
 883 {
 884         pte_t * page_table;
 885         pte_t entry;
 886         unsigned long page;
 887 
 888         page_table = get_empty_pgtable(vma->vm_task,address);
 889         if (!page_table)
 890                 return;
 891         entry = *page_table;
 892         if (pte_present(entry))
 893                 return;
 894         if (!pte_none(entry)) {
 895                 do_swap_page(vma, address, page_table, entry, write_access);
 896                 return;
 897         }
 898         address &= PAGE_MASK;
 899 
 900         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 901                 ++vma->vm_task->mm->rss;
 902                 ++vma->vm_task->mm->min_flt;
 903                 get_empty_page(vma, page_table);
 904                 return;
 905         }
 906         page = get_free_page(GFP_KERNEL);
 907         if (share_page(vma, address, write_access, page)) {
 908                 ++vma->vm_task->mm->min_flt;
 909                 ++vma->vm_task->mm->rss;
 910                 return;
 911         }
 912         if (!page) {
 913                 oom(current);
 914                 put_page(page_table, BAD_PAGE);
 915                 return;
 916         }
 917         ++vma->vm_task->mm->maj_flt;
 918         ++vma->vm_task->mm->rss;
 919         /*
 920          * The fourth argument is "no_share", which tells the low-level code
 921          * to copy, not share the page even if sharing is possible.  It's
 922          * essentially an early COW detection 
 923          */
 924         page = vma->vm_ops->nopage(vma, address, page,
 925                 write_access && !(vma->vm_flags & VM_SHARED));
 926         if (share_page(vma, address, write_access, 0)) {
 927                 free_page(page);
 928                 return;
 929         }
 930         /*
 931          * This silly early PAGE_DIRTY setting removes a race
 932          * due to the bad i386 page protection. But it's valid
 933          * for other architectures too.
 934          *
 935          * Note that if write_access is true, we either now have
  936  * an exclusive copy of the page, or this is a shared mapping,
 937          * so we can make it writable and dirty to avoid having to
 938          * handle that later.
 939          */
 940         entry = mk_pte(page, vma->vm_page_prot);
 941         if (write_access) {
 942                 entry = pte_mkwrite(pte_mkdirty(entry));
 943         } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
 944                 entry = pte_wrprotect(entry);
 945         put_page(page_table, entry);
 946 }
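
Taken together, do_no_page() and do_wp_page() are the two entry points the architecture's page-fault handler is expected to use: a fault on a non-present page comes here (and is resolved as a swapped-out page, an anonymous zero page, or a vm_ops->nopage read from a file), while a write to a present but write-protected page goes to do_wp_page() for copy-on-write. A sketch of that dispatch; the real handler lives in the architecture fault code, not in this file (illustrative only):

static void dispatch_fault_sketch(struct vm_area_struct * vma,
	unsigned long address, int write_access, int present)
{
	if (present)
		do_wp_page(vma, address, write_access);	/* protection fault: copy-on-write */
	else
		do_no_page(vma, address, write_access);	/* missing page: demand load / swap in */
}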
