root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. oom
  2. free_one_pte
  3. free_one_table
  4. clear_page_tables
  5. free_page_tables
  6. clone_page_tables
  7. copy_page_tables
  8. unmap_page_range
  9. zeromap_page_range
  10. remap_page_range
  11. put_page
  12. put_dirty_page
  13. do_wp_page
  14. verify_area
  15. get_empty_page
  16. try_to_share
  17. share_page
  18. get_empty_pgtable
  19. do_swap_page
  20. do_no_page

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
   13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/config.h>
  37 #include <linux/signal.h>
  38 #include <linux/sched.h>
  39 #include <linux/head.h>
  40 #include <linux/kernel.h>
  41 #include <linux/errno.h>
  42 #include <linux/string.h>
  43 #include <linux/types.h>
  44 #include <linux/ptrace.h>
  45 #include <linux/mman.h>
  46 
  47 #include <asm/system.h>
  48 #include <asm/segment.h>
  49 
  50 unsigned long high_memory = 0;
  51 
  52 /*
  53  * The free_area_list arrays point to the queue heads of the free areas
  54  * of different sizes
  55  */
  56 int nr_swap_pages = 0;
  57 int nr_free_pages = 0;
  58 struct mem_list free_area_list[NR_MEM_LISTS];
  59 unsigned char * free_area_map[NR_MEM_LISTS];
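/*
 * These arrays belong to the multi-page ("buddy"-style) allocator added in
 * the 05.04.94 change noted above: free_area_list[i] is understood to head
 * a list of free blocks of 2^i pages, with free_area_map[i] holding the
 * matching pairing bitmap.  The list handling itself lives outside this
 * file.
 */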
  60 
  61 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  62 
  63 mem_map_t * mem_map = NULL;
  64 
  65 #define CODE_SPACE(addr,p) ((addr) < (p)->end_code)
  66 
  67 /*
  68  * oom() prints a message (so that the user knows why the process died),
  69  * and gives the process an untrappable SIGKILL.
  70  */
  71 void oom(struct task_struct * task)
  72 {
  73         printk("\nOut of memory for %s.\n", current->comm);
  74         task->sigaction[SIGKILL-1].sa_handler = NULL;
  75         task->blocked &= ~(1<<(SIGKILL-1));
  76         send_sig(SIGKILL,task,1);
  77 }
  78 
  79 static inline void free_one_pte(pte_t * page_table)
  80 {
  81         pte_t page = *page_table;
  82 
  83         if (pte_none(page))
  84                 return;
  85         pte_clear(page_table);
  86         if (!pte_present(page)) {
  87                 swap_free(pte_val(page));
  88                 return;
  89         }
  90         free_page(pte_page(page));
  91         return;
  92 }
  93 
  94 static void free_one_table(pgd_t * page_dir)
  95 {
  96         int j;
  97         pgd_t pg_table = *page_dir;
  98         pte_t * page_table;
  99         unsigned long page;
 100 
 101         if (pgd_none(pg_table))
 102                 return;
 103         pgd_clear(page_dir);
 104         if (pgd_bad(pg_table)) {
 105                 printk("Bad page table: [%p]=%08lx\n",page_dir,pgd_val(pg_table));
 106                 return;
 107         }
 108         page = pgd_page(pg_table);
 109         if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
 110                 return;
 111         page_table = (pte_t *) page;
 112         for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++)
 113                 free_one_pte(page_table);
 114         free_page(page);
 115 }
 116 
 117 /*
 118  * This function clears all user-level page tables of a process - this
 119  * is needed by execve(), so that old pages aren't in the way. Note that
 120  * unlike 'free_page_tables()', this function still leaves a valid
 121  * page-table-tree in memory: it just removes the user pages. The two
 122  * functions are similar, but there is a fundamental difference.
 123  */
 124 void clear_page_tables(struct task_struct * tsk)
 125 {
 126         int i;
 127         pgd_t * page_dir;
 128 
 129         if (!tsk)
 130                 return;
 131         if (tsk == task[0])
 132                 panic("task[0] (swapper) doesn't support exec()\n");
 133         page_dir = PAGE_DIR_OFFSET(tsk, 0);
 134         if (!page_dir || page_dir == swapper_pg_dir) {
 135                 printk("Trying to clear kernel page-directory: not good\n");
 136                 return;
 137         }
 138         if (mem_map[MAP_NR((unsigned long) page_dir)] > 1) {
 139                 pgd_t * new_pg;
 140 
 141                 if (!(new_pg = (pgd_t *) get_free_page(GFP_KERNEL))) {
 142                         oom(tsk);
 143                         return;
 144                 }
 145                 for (i = 768 ; i < 1024 ; i++)
 146                         new_pg[i] = page_dir[i];
 147                 free_page((unsigned long) page_dir);
 148                 SET_PAGE_DIR(tsk, new_pg);
 149                 return;
 150         }
 151         for (i = 0 ; i < 768 ; i++,page_dir++)
 152                 free_one_table(page_dir);
 153         invalidate();
 154         return;
 155 }
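/*
 * The constants 768 and 1024 above reflect the i386 layout used here: each
 * of the 1024 page-directory entries maps 4MB, so entries 0-767 cover the
 * 3GB of user space and entries 768-1023 cover the kernel.  exec() thus
 * frees only the user entries; if the directory itself is shared (see
 * clone_page_tables() below), a fresh directory is built instead and only
 * the kernel entries are copied into it.
 */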
 156 
 157 /*
 158  * This function frees up all page tables of a process when it exits.
 159  */
 160 void free_page_tables(struct task_struct * tsk)
 161 {
 162         int i;
 163         pgd_t * page_dir;
 164 
 165         if (!tsk)
 166                 return;
 167         if (tsk == task[0]) {
 168                 printk("task[0] (swapper) killed: unable to recover\n");
 169                 panic("Trying to free up swapper memory space");
 170         }
 171         page_dir = PAGE_DIR_OFFSET(tsk, 0);
 172         if (!page_dir || page_dir == swapper_pg_dir) {
 173                 printk("Trying to free kernel page-directory: not good\n");
 174                 return;
 175         }
 176         SET_PAGE_DIR(tsk, swapper_pg_dir);
 177         if (mem_map[MAP_NR((unsigned long) page_dir)] > 1) {
 178                 free_page((unsigned long) page_dir);
 179                 return;
 180         }
 181         for (i = 0 ; i < PTRS_PER_PAGE ; i++)
 182                 free_one_table(page_dir + i);
 183         free_page((unsigned long) page_dir);
 184         invalidate();
 185 }
 186 
 187 /*
 188  * clone_page_tables() clones the page table for a process - both
 189  * processes will have the exact same pages in memory. There are
 190  * probably races in the memory management with cloning, but we'll
 191  * see..
 192  */
 193 int clone_page_tables(struct task_struct * tsk)
 194 {
 195         unsigned long pg_dir;
 196 
 197         pg_dir = (unsigned long) PAGE_DIR_OFFSET(current, 0);
 198         mem_map[MAP_NR(pg_dir)]++;
 199         SET_PAGE_DIR(tsk, pg_dir);
 200         return 0;
 201 }
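/*
 * This is the "share everything" case: the child just takes another
 * reference on the parent's page-directory page.  The raised mem_map count
 * is what clear_page_tables() and free_page_tables() test for
 * (mem_map[...] > 1), so a shared directory is dropped rather than freed
 * when one of its users exits or execs.
 */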
 202 
 203 /*
 204  * copy_page_tables() just copies the whole process memory range:
 205  * note the special handling of RESERVED (ie kernel) pages, which
 206  * means that they are always shared by all processes.
 207  */
 208 int copy_page_tables(struct task_struct * tsk)
 209 {
 210         int i;
 211         pgd_t *old_page_dir;
 212         pgd_t *new_page_dir;
 213 
 214         new_page_dir = (pgd_t *) get_free_page(GFP_KERNEL);
 215         if (!new_page_dir)
 216                 return -ENOMEM;
 217         old_page_dir = PAGE_DIR_OFFSET(current, 0);
 218         SET_PAGE_DIR(tsk, new_page_dir);
 219         for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) {
 220                 int j;
 221                 pgd_t old_pg_table;
 222                 pte_t *old_page_table, *new_page_table;
 223 
 224                 old_pg_table = *old_page_dir;
 225                 if (pgd_none(old_pg_table))
 226                         continue;
 227                 if (pgd_bad(old_pg_table)) {
 228                         printk("copy_page_tables: bad page table: "
 229                                 "probable memory corruption\n");
 230                         pgd_clear(old_page_dir);
 231                         continue;
 232                 }
 233                 if (mem_map[MAP_NR(pgd_page(old_pg_table))] & MAP_PAGE_RESERVED) {
 234                         *new_page_dir = old_pg_table;
 235                         continue;
 236                 }
 237                 if (!(new_page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 238                         free_page_tables(tsk);
 239                         return -ENOMEM;
 240                 }
 241                 old_page_table = (pte_t *) pgd_page(old_pg_table);
 242                 pgd_set(new_page_dir, new_page_table);
 243                 for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) {
 244                         pte_t pte = *old_page_table;
 245                         if (pte_none(pte))
 246                                 continue;
 247                         if (!pte_present(pte)) {
 248                                 swap_duplicate(pte_val(pte));
 249                                 *new_page_table = pte;
 250                                 continue;
 251                         }
 252                         if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
 253                                 *new_page_table = pte;
 254                                 continue;
 255                         }
 256                         if (pte_cow(pte))
 257                                 pte = pte_wrprotect(pte);
 258                         if (delete_from_swap_cache(pte_page(pte)))
 259                                 pte = pte_mkdirty(pte);
 260                         *new_page_table = pte;
 261                         *old_page_table = pte;
 262                         mem_map[MAP_NR(pte_page(pte))]++;
 263                 }
 264         }
 265         invalidate();
 266         return 0;
 267 }
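/*
 * The loop above sets up the usual fork()-time copy-on-write invariant: for
 * every present, non-reserved page of a private writable mapping, parent
 * and child both end up with a write-protected pte and the page's mem_map
 * count is raised, roughly:
 *
 *      before fork:   parent pte RW,   count 1
 *      after fork:    parent pte RO,   child pte RO,   count 2
 *
 * The first write by either process then faults into do_wp_page(), which
 * copies the page and gives that process a writable private copy.
 */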
 268 
 269 /*
 270  * a more complete version of free_page_tables which performs with page
 271  * granularity.
 272  */
 273 int unmap_page_range(unsigned long from, unsigned long size)
 274 {
 275         pgd_t page_dir, * dir;
 276         pte_t page, * page_table;
 277         unsigned long poff, pcnt, pc;
 278 
 279         if (from & ~PAGE_MASK) {
 280                 printk("unmap_page_range called with wrong alignment\n");
 281                 return -EINVAL;
 282         }
 283         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 284         dir = PAGE_DIR_OFFSET(current,from);
 285         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 286         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 287                 pcnt = size;
 288 
 289         for ( ; size > 0; ++dir, size -= pcnt,
 290              pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size)) {
 291                 page_dir = *dir;
 292                 if (pgd_none(page_dir)) {
 293                         poff = 0;
 294                         continue;
 295                 }
 296                 if (pgd_bad(page_dir)) {
 297                         printk("unmap_page_range: bad page directory.");
 298                         continue;
 299                 }
 300                 page_table = (pte_t *) pgd_page(page_dir);
 301                 if (poff) {
 302                         page_table += poff;
 303                         poff = 0;
 304                 }
 305                 for (pc = pcnt; pc--; page_table++) {
 306                         page = *page_table;
 307                         if (!pte_none(page)) {
 308                                 pte_clear(page_table);
 309                                 if (pte_present(page)) {
 310                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 311                                                 if (current->mm->rss > 0)
 312                                                         --current->mm->rss;
 313                                         free_page(pte_page(page));
 314                                 } else
 315                                         swap_free(pte_val(page));
 316                         }
 317                 }
 318                 if (pcnt == PTRS_PER_PAGE) {
 319                         pgd_clear(dir);
 320                         free_page(pgd_page(page_dir));
 321                 }
 322         }
 323         invalidate();
 324         return 0;
 325 }
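/*
 * This is the page-level teardown used when only part of an address space
 * goes away - most notably from the munmap()/do_munmap() path in mm/mmap.c
 * of this kernel generation - whereas free_page_tables() above is the
 * whole-process version used at exit.
 */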
 326 
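/*
 * zeromap_page_range() points every pte in the range at the global
 * ZERO_PAGE with write permission removed: reads return zeroes without
 * allocating any memory, and the first write takes a protection fault that
 * do_wp_page() resolves by handing out a private copy.  Mapping /dev/zero
 * is the classic user of this.
 */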
 327 int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot)
 328 {
 329         pgd_t * dir;
 330         pte_t * page_table;
 331         unsigned long poff, pcnt;
 332         pte_t zero_pte;
 333 
 334         if (from & ~PAGE_MASK) {
 335                 printk("zeromap_page_range: from = %08lx\n",from);
 336                 return -EINVAL;
 337         }
 338         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 339         dir = PAGE_DIR_OFFSET(current,from);
 340         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 341         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 342         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 343                 pcnt = size;
 344 
 345         while (size > 0) {
 346                 if (!pgd_present(*dir)) {
 347                         if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 348                                 invalidate();
 349                                 return -ENOMEM;
 350                         }
 351                         if (pgd_present(*dir)) {
 352                                 free_page((unsigned long) page_table);
 353                                 page_table = (pte_t *) pgd_page(*dir);
 354                         } else
 355                                 pgd_set(dir, page_table);
 356                 } else
 357                         page_table = (pte_t *) pgd_page(*dir);
 358                 dir++;
 359                 page_table += poff;
 360                 poff = 0;
 361                 for (size -= pcnt; pcnt-- ;) {
 362                         pte_t page = *page_table;
 363                         if (!pte_none(page)) {
 364                                 pte_clear(page_table);
 365                                 if (pte_present(page)) {
 366                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 367                                                 if (current->mm->rss > 0)
 368                                                         --current->mm->rss;
 369                                         free_page(pte_page(page));
 370                                 } else
 371                                         swap_free(pte_val(page));
 372                         }
 373                         *page_table++ = zero_pte;
 374                 }
 375                 pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
 376         }
 377         invalidate();
 378         return 0;
 379 }
 380 
 381 /*
 382  * maps a range of physical memory into the requested pages. the old
  383  * mappings are removed. any references to nonexistent pages result
 384  * in null mappings (currently treated as "copy-on-access")
 385  */
 386 int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot)
 387 {
 388         pgd_t * dir;
 389         pte_t * page_table;
 390         unsigned long poff, pcnt;
 391 
 392         if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) {
 393                 printk("remap_page_range: from = %08lx, to=%08lx\n",from,to);
 394                 return -EINVAL;
 395         }
 396         dir = PAGE_DIR_OFFSET(current,from);
 397         size = (size + ~PAGE_MASK) >> PAGE_SHIFT;
 398         poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 399         if ((pcnt = PTRS_PER_PAGE - poff) > size)
 400                 pcnt = size;
 401 
 402         while (size > 0) {
 403                 if (!pgd_present(*dir)) {
 404                         if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL))) {
 405                                 invalidate();
 406                                 return -1;
 407                         }
 408                         if (pgd_present(*dir)) {
 409                                 free_page((unsigned long) page_table);
 410                                 page_table = (pte_t *) pgd_page(*dir);
 411                         } else
 412                                 pgd_set(dir, page_table);
 413                 } else
 414                         page_table = (pte_t *) pgd_page(*dir);
 415                 dir++;
 416                 page_table += poff;
 417                 poff = 0;
 418 
 419                 for (size -= pcnt; pcnt-- ;) {
 420                         pte_t page = *page_table;
 421                         if (!pte_none(page)) {
 422                                 pte_clear(page_table);
 423                                 if (pte_present(page)) {
 424                                         if (!(mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED))
 425                                                 if (current->mm->rss > 0)
 426                                                         --current->mm->rss;
 427                                         free_page(pte_page(page));
 428                                 } else
 429                                         swap_free(pte_val(page));
 430                         }
 431                         if (to >= high_memory)
 432                                 *page_table = mk_pte(to, prot);
 433                         else if (mem_map[MAP_NR(to)]) {
 434                                 *page_table = mk_pte(to, prot);
 435                                 if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) {
 436                                         ++current->mm->rss;
 437                                         mem_map[MAP_NR(to)]++;
 438                                 }
 439                         }
 440                         page_table++;
 441                         to += PAGE_SIZE;
 442                 }
 443                 pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size);
 444         }
 445         invalidate();
 446         return 0;
 447 }
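/*
 * A rough usage sketch: remap_page_range() is what a device driver's mmap
 * operation of this era calls to map physical (typically device or
 * frame-buffer) memory into user space.  The outline below uses
 * illustrative names and is reconstructed from memory - compare
 * drivers/char/mem.c in the same tree for the real thing.
 */
#if 0   /* illustration only */
static int example_mmap(struct inode * inode, struct file * file,
        struct vm_area_struct * vma)
{
        /* vm_offset holds the physical address the caller asked for */
        if (vma->vm_offset & ~PAGE_MASK)
                return -ENXIO;
        if (remap_page_range(vma->vm_start, vma->vm_offset,
                             vma->vm_end - vma->vm_start, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}
#endif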
 448 
 449 /*
 450  * sanity-check function..
 451  */
 452 static void put_page(pte_t * page_table, pte_t pte)
 453 {
 454         if (!pte_none(*page_table)) {
 455                 printk("put_page: page already exists\n");
 456                 free_page(pte_page(pte));
 457                 return;
 458         }
 459 /* no need for invalidate */
 460         *page_table = pte;
 461 }
 462 
 463 /*
 464  * This routine is used to map in a page into an address space: needed by
 465  * execve() for the initial stack and environment pages.
 466  */
 467 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 468 {
 469         pgd_t * page_dir;
 470         pte_t * page_table;
 471 
 472         if (page >= high_memory)
 473                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 474         if (mem_map[MAP_NR(page)] != 1)
 475                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 476         page_dir = PAGE_DIR_OFFSET(tsk,address);
 477         if (pgd_present(*page_dir)) {
 478                 page_table = (pte_t *) pgd_page(*page_dir);
 479         } else {
 480                 if (!(page_table = (pte_t *) get_free_page(GFP_KERNEL)))
 481                         return 0;
 482                 if (pgd_present(*page_dir)) {
 483                         free_page((unsigned long) page_table);
 484                         page_table = (pte_t *) pgd_page(*page_dir);
 485                 } else {
 486                         pgd_set(page_dir, page_table);
 487                 }
 488         }
 489         page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 490         if (!pte_none(*page_table)) {
 491                 printk("put_dirty_page: page already exists\n");
 492                 pte_clear(page_table);
 493                 invalidate();
 494         }
 495         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)));
 496 /* no need for invalidate */
 497         return page;
 498 }
 499 
 500 /*
 501  * This routine handles present pages, when users try to write
 502  * to a shared page. It is done by copying the page to a new address
 503  * and decrementing the shared-page counter for the old page.
 504  *
 505  * Goto-purists beware: the only reason for goto's here is that it results
 506  * in better assembly code.. The "default" path will see no jumps at all.
 507  *
 508  * Note that this routine assumes that the protection checks have been
 509  * done by the caller (the low-level page fault routine in most cases).
 510  * Thus we can safely just mark it writable once we've done any necessary
 511  * COW.
 512  *
 513  * We also mark the page dirty at this point even though the page will
 514  * change only once the write actually happens. This avoids a few races,
 515  * and potentially makes it more efficient.
 516  */
 517 void do_wp_page(struct vm_area_struct * vma, unsigned long address,
 518         int write_access)
 519 {
 520         pgd_t *page_dir;
 521         pte_t *page_table, pte;
 522         unsigned long old_page, new_page;
 523 
 524         new_page = __get_free_page(GFP_KERNEL);
 525         page_dir = PAGE_DIR_OFFSET(vma->vm_task,address);
 526         if (pgd_none(*page_dir))
 527                 goto end_wp_page;
 528         if (pgd_bad(*page_dir))
 529                 goto bad_wp_pagetable;
 530         page_table = (pte_t *) pgd_page(*page_dir);
 531         page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
 532         pte = *page_table;
 533         if (!pte_present(pte))
 534                 goto end_wp_page;
 535         if (pte_write(pte))
 536                 goto end_wp_page;
 537         old_page = pte_page(pte);
 538         if (old_page >= high_memory)
 539                 goto bad_wp_page;
 540         vma->vm_task->mm->min_flt++;
 541         /*
 542          * Do we need to copy?
 543          */
 544         if (mem_map[MAP_NR(old_page)] != 1) {
 545                 if (new_page) {
 546                         if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
 547                                 ++vma->vm_task->mm->rss;
 548                         copy_page(old_page,new_page);
 549                         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
 550                         free_page(old_page);
 551                         invalidate();
 552                         return;
 553                 }
 554                 free_page(old_page);
 555                 oom(vma->vm_task);
 556                 *page_table = BAD_PAGE;
 557                 invalidate();
 558                 return;
 559         }
 560         *page_table = pte_mkdirty(pte_mkwrite(pte));
 561         invalidate();
 562         if (new_page)
 563                 free_page(new_page);
 564         return;
 565 bad_wp_page:
 566         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 567         *page_table = BAD_PAGE;
 568         send_sig(SIGKILL, vma->vm_task, 1);
 569         goto end_wp_page;
 570 bad_wp_pagetable:
 571         printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 572         pgd_set(page_dir, BAD_PAGETABLE);
 573         send_sig(SIGKILL, vma->vm_task, 1);
 574 end_wp_page:
 575         if (new_page)
 576                 free_page(new_page);
 577         return;
 578 }
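/*
 * In short: if the faulting page's mem_map count is exactly 1 the page is
 * already exclusively ours, so the existing pte is simply made writable and
 * dirty.  Otherwise the contents are copied into new_page, the pte is
 * pointed at the writable, dirty copy, and one reference is dropped from
 * the old page.  new_page is allocated speculatively before the checks
 * because __get_free_page(GFP_KERNEL) may sleep; if it turns out to be
 * unneeded it is freed on the way out.
 */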
 579 
 580 /*
 581  * Ugly, ugly, but the goto's result in better assembly..
 582  */
 583 int verify_area(int type, const void * addr, unsigned long size)
 584 {
 585         struct vm_area_struct * vma;
 586         unsigned long start = (unsigned long) addr;
 587 
 588         /* If the current user space is mapped to kernel space (for the
 589          * case where we use a fake user buffer with get_fs/set_fs()) we
 590          * don't expect to find the address in the user vm map.
 591          */
 592         if (get_fs() == get_ds())
 593                 return 0;
 594 
 595         vma = find_vma(current, start);
 596         if (!vma)
 597                 goto bad_area;
 598         if (vma->vm_start <= start)
 599                 goto good_area;
 600         if (!(vma->vm_flags & VM_GROWSDOWN))
 601                 goto bad_area;
 602         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 603                 goto bad_area;
 604 
 605 good_area:
 606         if (type == VERIFY_WRITE)
 607                 goto check_write;
 608         for (;;) {
 609                 struct vm_area_struct * next;
 610                 if (!(vma->vm_flags & VM_READ))
 611                         goto bad_area;
 612                 if (vma->vm_end - start >= size)
 613                         return 0;
 614                 next = vma->vm_next;
 615                 if (!next || vma->vm_end != next->vm_start)
 616                         goto bad_area;
 617                 vma = next;
 618         }
 619 
 620 check_write:
 621         if (!(vma->vm_flags & VM_WRITE))
 622                 goto bad_area;
 623         if (!wp_works_ok)
 624                 goto check_wp_fault_by_hand;
 625         for (;;) {
 626                 if (vma->vm_end - start >= size)
 627                         break;
 628                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 629                         goto bad_area;
 630                 vma = vma->vm_next;
 631                 if (!(vma->vm_flags & VM_WRITE))
 632                         goto bad_area;
 633         }
 634         return 0;
 635 
 636 check_wp_fault_by_hand:
 637         size--;
 638         size += start & ~PAGE_MASK;
 639         size >>= PAGE_SHIFT;
 640         start &= PAGE_MASK;
 641 
 642         for (;;) {
 643                 do_wp_page(vma, start, 1);
 644                 if (!size)
 645                         break;
 646                 size--;
 647                 start += PAGE_SIZE;
 648                 if (start < vma->vm_end)
 649                         continue;
 650                 vma = vma->vm_next;
 651                 if (!vma || vma->vm_start != start)
 652                         goto bad_area;
 653                 if (!(vma->vm_flags & VM_WRITE))
  654                 goto bad_area;
 655         }
 656         return 0;
 657 
 658 bad_area:
 659         return -EFAULT;
 660 }
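/*
 * A rough usage sketch: verify_area() is the check a system call of this
 * era performs before touching a user-supplied pointer with the fs-segment
 * helpers.  The names below are illustrative:
 */
#if 0   /* illustration only */
int example_syscall(char * user_buf, int count)
{
        int error;

        error = verify_area(VERIFY_WRITE, user_buf, count);
        if (error)
                return error;   /* -EFAULT for a bad range */
        memcpy_tofs(user_buf, some_kernel_data, count);
        return count;
}
#endif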
 661 
 662 static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
 663 {
 664         unsigned long tmp;
 665 
 666         if (!(tmp = get_free_page(GFP_KERNEL))) {
 667                 oom(vma->vm_task);
 668                 put_page(page_table, BAD_PAGE);
 669                 return;
 670         }
 671         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 672 }
 673 
 674 /*
 675  * try_to_share() checks the page at address "address" in the task "p",
 676  * to see if it exists, and if it is clean. If so, share it with the current
 677  * task.
 678  *
 679  * NOTE! This assumes we have checked that p != current, and that they
 680  * share the same inode and can generally otherwise be shared.
 681  */
 682 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 683         unsigned long from_address, struct vm_area_struct * from_area,
 684         unsigned long newpage)
 685 {
 686         pgd_t * from_dir, * to_dir;
 687         pte_t * from_table, * to_table;
 688         pte_t from, to;
 689 
 690         from_dir = PAGE_DIR_OFFSET(from_area->vm_task,from_address);
 691 /* is there a page-directory at from? */
 692         if (!pgd_present(*from_dir))
 693                 return 0;
 694         from_table = (pte_t *) (pgd_page(*from_dir) + PAGE_PTR(from_address));
 695         from = *from_table;
 696 /* is the page present? */
 697         if (!pte_present(from))
 698                 return 0;
 699 /* if it is dirty it must be from a shared mapping to be shared */
 700         if (pte_dirty(from)) {
 701                 if (!(from_area->vm_flags & VM_SHARED))
 702                         return 0;
  703                 if (!pte_write(from)) {
 704                         printk("nonwritable, but dirty, shared page\n");
 705                         return 0;
 706                 }
 707         }
 708 /* is the page reasonable at all? */
 709         if (pte_page(from) >= high_memory)
 710                 return 0;
 711         if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
 712                 return 0;
 713 /* is the destination ok? */
 714         to_dir = PAGE_DIR_OFFSET(to_area->vm_task,to_address);
 715         if (!pgd_present(*to_dir))
 716                 return 0;
 717         to_table = (pte_t *) (pgd_page(*to_dir) + PAGE_PTR(to_address));
 718         to = *to_table;
 719         if (!pte_none(to))
 720                 return 0;
 721 /* do we copy? */
 722         if (newpage) {
 723                 /* if it's in the swap cache, it's dirty by implication */
 724                 /* so we can't use it if it's not from a shared mapping */
 725                 if (in_swap_cache(pte_page(from))) {
 726                         if (!(from_area->vm_flags & VM_SHARED))
 727                                 return 0;
 728                         if (!pte_write(from)) {
 729                                 printk("nonwritable, but dirty, shared page\n");
 730                                 return 0;
 731                         }
 732                 }
 733                 copy_page(pte_page(from), newpage);
 734                 *to_table = mk_pte(newpage, to_area->vm_page_prot);
 735                 return 1;
 736         }
 737 /*
 738  * do a final swap-cache test before sharing them: if it's in the swap
 739  * cache, we have to remove it now, as we get two pointers to the same
 740  * physical page and the cache can't handle it. Mark the original dirty.
 741  *
 742  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 743  * with a dirty "from", the from-mapping is a shared map, so we can trust
 744  * the page contents to be up-to-date
 745  */
 746         if (in_swap_cache(pte_page(from))) {
 747                 if (!(from_area->vm_flags & VM_SHARED))
 748                         return 0;
 749                 *from_table = pte_mkdirty(from);
 750                 delete_from_swap_cache(pte_page(from));
 751         }
 752         mem_map[MAP_NR(pte_page(from))]++;
 753         *to_table = mk_pte(pte_page(from), to_area->vm_page_prot);
 754 /* Check if we need to do anything at all to the 'from' field */
 755         if (!pte_write(from))
 756                 return 1;
 757         if (from_area->vm_flags & VM_SHARED)
 758                 return 1;
 759 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 760         *from_table = pte_wrprotect(from);
 761         invalidate();
 762         return 1;
 763 }
 764 
 765 /*
 766  * share_page() tries to find a process that could share a page with
 767  * the current one.
 768  *
 769  * We first check if it is at all feasible by checking inode->i_count.
 770  * It should be >1 if there are other tasks sharing this inode.
 771  */
 772 static int share_page(struct vm_area_struct * area, unsigned long address,
 773         int write_access, unsigned long newpage)
 774 {
 775         struct inode * inode;
 776         unsigned long offset;
 777         unsigned long from_address;
 778         unsigned long give_page;
 779         struct vm_area_struct * mpnt;
 780 
 781         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 782                 return 0;
 783         /* do we need to copy or can we just share? */
 784         give_page = 0;
 785         if (write_access && !(area->vm_flags & VM_SHARED)) {
 786                 if (!newpage)
 787                         return 0;
 788                 give_page = newpage;
 789         }
 790         offset = address - area->vm_start + area->vm_offset;
 791         /* See if there is something in the VM we can share pages with. */
 792         /* Traverse the entire circular i_mmap list, except `area' itself. */
 793         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 794                 /* must be same inode */
 795                 if (mpnt->vm_inode != inode) {
 796                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 797                         break;  
 798                 }
 799                 /* offsets must be mutually page-aligned */
 800                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 801                         continue;
 802                 /* the other area must actually cover the wanted page.. */
 803                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 804                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 805                         continue;
 806                 /* .. NOW we can actually try to use the same physical page */
 807                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 808                         continue;
 809                 /* free newpage if we never used it.. */
 810                 if (give_page || !newpage)
 811                         return 1;
 812                 free_page(newpage);
 813                 return 1;
 814         }
 815         return 0;
 816 }
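/*
 * Note that sharing is only attempted for file-backed areas (vm_inode set)
 * whose inode is in use more than once, and the candidates come from
 * walking the inode's circular vm_next_share ring.  On a write fault to a
 * non-VM_SHARED area we must not share at all: `give_page' is set so that
 * try_to_share() copies into the pre-allocated page instead of mapping the
 * same physical page twice.
 */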
 817 
 818 /*
 819  * fill in an empty page-table if none exists.
 820  */
 821 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 822 {
 823         pgd_t *p;
 824         unsigned long page;
 825 
 826         p = PAGE_DIR_OFFSET(tsk,address);
 827         if (pgd_present(*p))
 828                 return (pte_t *) (PAGE_PTR(address) + pgd_page(*p));
 829         if (!pgd_none(*p)) {
 830                 printk("get_empty_pgtable: bad page-directory entry \n");
 831                 pgd_clear(p);
 832         }
 833         page = get_free_page(GFP_KERNEL);
 834         if (pgd_present(*p)) {
 835                 free_page(page);
 836                 return (pte_t *) (PAGE_PTR(address) + pgd_page(*p));
 837         }
 838         if (!pgd_none(*p)) {
 839                 printk("get_empty_pgtable: bad page-directory entry \n");
 840                 pgd_clear(p);
 841         }
 842         if (page) {
 843                 pgd_set(p, (pte_t *) page);
 844                 return (pte_t *) (PAGE_PTR(address) + page);
 845         }
 846         oom(current);
 847         pgd_set(p, BAD_PAGETABLE);
 848         return NULL;
 849 }
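/*
 * The repeated checks above are not redundant: get_free_page(GFP_KERNEL)
 * may sleep, and while we slept another process sharing these page tables
 * may have installed a table in the same slot, so the directory entry is
 * examined again and the freshly allocated page is released if we lost the
 * race.
 */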
 850 
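/*
 * do_swap_page() is reached from do_no_page() when the pte is not present
 * but not empty, i.e. it holds a swap entry.  The default path hands the
 * entry to swap_in(); mappings that supply their own swapin operation (the
 * SysV shared-memory code does, in this kernel generation) are called
 * instead, and since that can sleep the pte is checked again before being
 * replaced.
 */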
 851 static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address,
 852         pte_t * page_table, pte_t entry, int write_access)
 853 {
 854         pte_t page;
 855 
 856         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 857                 swap_in(vma, page_table, pte_val(entry), write_access);
 858                 return;
 859         }
 860         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 861         if (pte_val(*page_table) != pte_val(entry)) {
 862                 free_page(pte_page(page));
 863                 return;
 864         }
 865         if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
 866                 page = pte_wrprotect(page);
 867         ++vma->vm_task->mm->rss;
 868         ++vma->vm_task->mm->maj_flt;
 869         *page_table = page;
 870         return;
 871 }
 872 
 873 /*
 874  * do_no_page() tries to create a new page mapping. It aggressively
 875  * tries to share with existing pages, but makes a separate copy if
 876  * the "write_access" parameter is true in order to avoid the next
 877  * page fault.
 878  */
 879 void do_no_page(struct vm_area_struct * vma, unsigned long address,
 880         int write_access)
 881 {
 882         pte_t * page_table;
 883         pte_t entry;
 884         unsigned long page;
 885 
 886         page_table = get_empty_pgtable(vma->vm_task,address);
 887         if (!page_table)
 888                 return;
 889         entry = *page_table;
 890         if (pte_present(entry))
 891                 return;
 892         if (!pte_none(entry)) {
 893                 do_swap_page(vma, address, page_table, entry, write_access);
 894                 return;
 895         }
 896         address &= PAGE_MASK;
 897 
 898         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 899                 ++vma->vm_task->mm->rss;
 900                 ++vma->vm_task->mm->min_flt;
 901                 get_empty_page(vma, page_table);
 902                 return;
 903         }
 904         page = get_free_page(GFP_KERNEL);
 905         if (share_page(vma, address, write_access, page)) {
 906                 ++vma->vm_task->mm->min_flt;
 907                 ++vma->vm_task->mm->rss;
 908                 return;
 909         }
 910         if (!page) {
 911                 oom(current);
 912                 put_page(page_table, BAD_PAGE);
 913                 return;
 914         }
 915         ++vma->vm_task->mm->maj_flt;
 916         ++vma->vm_task->mm->rss;
 917         /*
 918          * The fourth argument is "no_share", which tells the low-level code
 919          * to copy, not share the page even if sharing is possible.  It's
 920          * essentially an early COW detection 
 921          */
 922         page = vma->vm_ops->nopage(vma, address, page,
 923                 write_access && !(vma->vm_flags & VM_SHARED));
 924         if (share_page(vma, address, write_access, 0)) {
 925                 free_page(page);
 926                 return;
 927         }
 928         /*
 929          * This silly early PAGE_DIRTY setting removes a race
 930          * due to the bad i386 page protection. But it's valid
 931          * for other architectures too.
 932          *
 933          * Note that if write_access is true, we either now have
   934  * an exclusive copy of the page, or this is a shared mapping,
 935          * so we can make it writable and dirty to avoid having to
 936          * handle that later.
 937          */
 938         entry = mk_pte(page, vma->vm_page_prot);
 939         if (write_access) {
 940                 entry = pte_mkwrite(pte_mkdirty(entry));
 941         } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
 942                 entry = pte_wrprotect(entry);
 943         put_page(page_table, entry);
 944 }
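/*
 * A rough dispatch sketch: do_no_page() and do_wp_page() are the two halves
 * of the page-fault path, chosen by the architecture's fault handler
 * (arch/i386/mm/fault.c in this tree).  On the i386, bit 0 of the error
 * code means the page was present (a protection fault) and bit 1 means the
 * access was a write.  The real handler also looks up and validates the
 * vma and grows the stack where needed; the fragment below, reconstructed
 * from memory, shows only the final decision:
 */
#if 0   /* illustration only */
        if (error_code & 1)
                do_wp_page(vma, address, error_code & 2);   /* write to a protected page */
        else
                do_no_page(vma, address, error_code & 2);   /* page not present at all */
#endif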
