mm/memory.c

DEFINITIONS

This source file includes the following definitions:
  1. oom
  2. free_one_pte
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. clone_page_tables
  8. copy_one_pte
  9. copy_one_pmd
  10. copy_one_pgd
  11. copy_page_tables
  12. forget_pte
  13. unmap_pte_range
  14. unmap_pmd_range
  15. unmap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/config.h>
  37 #include <linux/signal.h>
  38 #include <linux/sched.h>
  39 #include <linux/head.h>
  40 #include <linux/kernel.h>
  41 #include <linux/errno.h>
  42 #include <linux/string.h>
  43 #include <linux/types.h>
  44 #include <linux/ptrace.h>
  45 #include <linux/mman.h>
  46 #include <linux/mm.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 
  52 unsigned long high_memory = 0;
  53 
  54 /*
  55  * The free_area_list arrays point to the queue heads of the free areas
  56  * of different sizes
  57  */
  58 int nr_swap_pages = 0;
  59 int nr_free_pages = 0;
  60 struct mem_list free_area_list[NR_MEM_LISTS];
  61 unsigned char * free_area_map[NR_MEM_LISTS];
  62 
  63 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  64 
  65 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  66 
  67 mem_map_t * mem_map = NULL;
  68 
  69 /*
  70  * oom() prints a message (so that the user knows why the process died),
  71  * and gives the process an untrappable SIGKILL.
  72  */
  73 void oom(struct task_struct * task)
  74 {
  75         printk("\nOut of memory for %s.\n", current->comm);
  76         task->sigaction[SIGKILL-1].sa_handler = NULL;
  77         task->blocked &= ~(1<<(SIGKILL-1));
  78         send_sig(SIGKILL,task,1);
  79 }
  80 
  81 static inline void free_one_pte(pte_t * page_table)
  82 {
  83         pte_t page = *page_table;
  84 
  85         if (pte_none(page))
  86                 return;
  87         pte_clear(page_table);
  88         if (!pte_present(page)) {
  89                 swap_free(pte_val(page));
  90                 return;
  91         }
  92         free_page(pte_page(page));
  93         return;
  94 }
  95 
  96 static inline void free_one_pmd(pmd_t * dir)
  97 {
  98         int j;
  99         pte_t * pte;
 100 
 101         if (pmd_none(*dir))
 102                 return;
 103         if (pmd_bad(*dir)) {
 104                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 105                 pmd_clear(dir);
 106                 return;
 107         }
 108         pte = pte_offset(dir, 0);
 109         pmd_clear(dir);
 110         if (pte_inuse(pte)) {
 111                 pte_free(pte);
 112                 return;
 113         }
 114         for (j = 0; j < PTRS_PER_PTE ; j++)
 115                 free_one_pte(pte+j);
 116         pte_free(pte);
 117 }
 118 
 119 static inline void free_one_pgd(pgd_t * dir)
 120 {
 121         int j;
 122         pmd_t * pmd;
 123 
 124         if (pgd_none(*dir))
 125                 return;
 126         if (pgd_bad(*dir)) {
 127                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 128                 pgd_clear(dir);
 129                 return;
 130         }
 131         pmd = pmd_offset(dir, 0);
 132         pgd_clear(dir);
 133         if (pmd_inuse(pmd)) {
 134                 pmd_free(pmd);
 135                 return;
 136         }
 137         for (j = 0; j < PTRS_PER_PMD ; j++)
 138                 free_one_pmd(pmd+j);
 139         pmd_free(pmd);
 140 }
 141
 142 
 143 /*
 144  * This function clears all user-level page tables of a process - this
 145  * is needed by execve(), so that old pages aren't in the way. Note that
 146  * unlike 'free_page_tables()', this function still leaves a valid
 147  * page-table-tree in memory: it just removes the user pages. The two
 148  * functions are similar, but there is a fundamental difference.
 149  */
 150 void clear_page_tables(struct task_struct * tsk)
 151 {
 152         int i;
 153         pgd_t * page_dir;
 154 
 155         if (!tsk)
 156                 return;
 157         if (tsk == task[0])
 158                 panic("task[0] (swapper) doesn't support exec()\n");
 159         page_dir = pgd_offset(tsk, 0);
 160         if (!page_dir || page_dir == swapper_pg_dir) {
 161                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 162                 return;
 163         }
 164         if (pgd_inuse(page_dir)) {
 165                 pgd_t * new_pg;
 166 
 167                 if (!(new_pg = pgd_alloc())) {
 168                         oom(tsk);
 169                         return;
 170                 }
 171                 for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 172                         new_pg[i] = page_dir[i];
 173                 SET_PAGE_DIR(tsk, new_pg);
 174                 pgd_free(page_dir);
 175                 return;
 176         }
 177         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 178                 free_one_pgd(page_dir + i);
 179         invalidate();
 180         return;
 181 }
 182 
 183 /*
 184  * This function frees up all page tables of a process when it exits.
 185  */
 186 void free_page_tables(struct task_struct * tsk)
 187 {
 188         int i;
 189         pgd_t * page_dir;
 190 
 191         if (!tsk)
 192                 return;
 193         if (tsk == task[0]) {
 194                 printk("task[0] (swapper) killed: unable to recover\n");
 195                 panic("Trying to free up swapper memory space");
 196         }
 197         page_dir = pgd_offset(tsk, 0);
 198         if (!page_dir || page_dir == swapper_pg_dir) {
 199                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 200                 return;
 201         }
 202         SET_PAGE_DIR(tsk, swapper_pg_dir);
 203         if (pgd_inuse(page_dir)) {
 204                 pgd_free(page_dir);
 205                 return;
 206         }
 207         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 208                 free_one_pgd(page_dir + i);
 209         pgd_free(page_dir);
 210         invalidate();
 211 }
 212 
 213 /*
 214  * clone_page_tables() clones the page table for a process - both
 215  * processes will have the exact same pages in memory. There are
 216  * probably races in the memory management with cloning, but we'll
 217  * see..
 218  */
 219 int clone_page_tables(struct task_struct * tsk)
 220 {
 221         pgd_t * pg_dir;
 222 
 223         pg_dir = pgd_offset(current, 0);
 224         pgd_reuse(pg_dir);
 225         SET_PAGE_DIR(tsk, pg_dir);
 226         return 0;
 227 }
 228 
 229 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 230 {
 231         pte_t pte = *old_pte;
 232 
 233         if (pte_none(pte))
 234                 return;
 235         if (!pte_present(pte)) {
 236                 swap_duplicate(pte_val(pte));
 237                 *new_pte = pte;
 238                 return;
 239         }
 240         if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
 241                 *new_pte = pte;
 242                 return;
 243         }
 244         if (pte_cow(pte))
 245                 pte = pte_wrprotect(pte);
 246         if (delete_from_swap_cache(pte_page(pte)))
 247                 pte = pte_mkdirty(pte);
 248         *new_pte = pte_mkold(pte);
 249         *old_pte = pte;
 250         mem_map[MAP_NR(pte_page(pte))]++;
 251 }
 252 
 253 static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd)
 254 {
 255         int j;
 256         pte_t *old_pte, *new_pte;
 257 
 258         if (pmd_none(*old_pmd))
 259                 return 0;
 260         if (pmd_bad(*old_pmd)) {
 261                 printk("copy_one_pmd: bad page table: probable memory corruption\n");
 262                 pmd_clear(old_pmd);
 263                 return 0;
 264         }
 265         old_pte = pte_offset(old_pmd, 0);
 266         if (pte_inuse(old_pte)) {
 267                 pte_reuse(old_pte);
 268                 *new_pmd = *old_pmd;
 269                 return 0;
 270         }
 271         new_pte = pte_alloc(new_pmd, 0);
 272         if (!new_pte)
 273                 return -ENOMEM;
 274         for (j = 0 ; j < PTRS_PER_PTE ; j++) {
 275                 copy_one_pte(old_pte, new_pte);
 276                 old_pte++;
 277                 new_pte++;
 278         }
 279         return 0;
 280 }
 281 
 282 static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd)
 283 {
 284         int j;
 285         pmd_t *old_pmd, *new_pmd;
 286 
 287         if (pgd_none(*old_pgd))
 288                 return 0;
 289         if (pgd_bad(*old_pgd)) {
 290                 printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd));
 291                 pgd_clear(old_pgd);
 292                 return 0;
 293         }
 294         old_pmd = pmd_offset(old_pgd, 0);
 295         if (pmd_inuse(old_pmd)) {
 296                 pmd_reuse(old_pmd);
 297                 *new_pgd = *old_pgd;
 298                 return 0;
 299         }
 300         new_pmd = pmd_alloc(new_pgd, 0);
 301         if (!new_pmd)
 302                 return -ENOMEM;
 303         for (j = 0 ; j < PTRS_PER_PMD ; j++) {
 304                 int error = copy_one_pmd(old_pmd, new_pmd);
 305                 if (error)
 306                         return error;
 307                 old_pmd++;
 308                 new_pmd++;
 309         }
 310         return 0;
 311 }
 312 
 313 /*
 314  * copy_page_tables() just copies the whole process memory range:
 315  * note the special handling of RESERVED (ie kernel) pages, which
 316  * means that they are always shared by all processes.
 317  */
 318 int copy_page_tables(struct task_struct * tsk)
 319 {
 320         int i;
 321         pgd_t *old_pgd;
 322         pgd_t *new_pgd;
 323 
 324         new_pgd = pgd_alloc();
 325         if (!new_pgd)
 326                 return -ENOMEM;
 327         SET_PAGE_DIR(tsk, new_pgd);
 328         old_pgd = pgd_offset(current, 0);
 329         for (i = 0 ; i < PTRS_PER_PGD ; i++) {
 330                 int errno = copy_one_pgd(old_pgd, new_pgd);
 331                 if (errno) {
 332                         free_page_tables(tsk);
 333                         invalidate();
 334                         return errno;
 335                 }
 336                 old_pgd++;
 337                 new_pgd++;
 338         }
 339         invalidate();
 340         return 0;
 341 }
 342 
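/*
 * A minimal sketch, kept compiled out, of how the fork path is expected to
 * pick between the two routines above: either share the current page
 * directory outright, or build a private copy whose pages are marked for
 * copy-on-write.  The flag name COPYVM and the error handling are
 * illustrative assumptions, not taken from this file.
 */
#if 0
static int copy_mm_sketch(unsigned long clone_flags, struct task_struct * p)
{
        if (clone_flags & COPYVM)               /* hypothetical "copy the VM" flag */
                return copy_page_tables(p);     /* private pgd, COW pages */
        return clone_page_tables(p);            /* both tasks reuse one pgd */
}
#endif
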
 343 static inline void forget_pte(pte_t page)
 344 {
 345         if (pte_none(page))
 346                 return;
 347         if (pte_present(page)) {
 348                 free_page(pte_page(page));
 349                 if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED)
 350                         return;
 351                 if (current->mm->rss <= 0)
 352                         return;
 353                 current->mm->rss--;
 354                 return;
 355         }
 356         swap_free(pte_val(page));
 357 }
 358 
 359 static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 360 {
 361         pte_t * pte;
 362         unsigned long end;
 363 
 364         if (pmd_none(*pmd))
 365                 return;
 366         if (pmd_bad(*pmd)) {
 367                 printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 368                 pmd_clear(pmd);
 369                 return;
 370         }
 371         pte = pte_offset(pmd, address);
 372         address &= ~PMD_MASK;
 373         end = address + size;
 374         if (end >= PMD_SIZE)
 375                 end = PMD_SIZE;
 376         do {
 377                 pte_t page = *pte;
 378                 pte_clear(pte);
 379                 forget_pte(page);
 380                 address += PAGE_SIZE;
 381                 pte++;
 382         } while (address < end);
 383 }
 384 
 385 static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 386 {
 387         pmd_t * pmd;
 388         unsigned long end;
 389 
 390         if (pgd_none(*dir))
 391                 return;
 392         if (pgd_bad(*dir)) {
 393                 printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 394                 pgd_clear(dir);
 395                 return;
 396         }
 397         pmd = pmd_offset(dir, address);
 398         address &= ~PGDIR_MASK;
 399         end = address + size;
 400         if (end > PGDIR_SIZE)
 401                 end = PGDIR_SIZE;
 402         do {
 403                 unmap_pte_range(pmd, address, end - address);
 404                 address = (address + PMD_SIZE) & PMD_MASK; 
 405                 pmd++;
 406         } while (address < end);
 407 }
 408 
 409 /*
 410  * a more complete version of free_page_tables which operates at page
 411  * granularity.
 412  */
 413 int unmap_page_range(unsigned long address, unsigned long size)
 414 {
 415         pgd_t * dir;
 416         unsigned long end = address + size;
 417 
 418         dir = pgd_offset(current, address);
 419         while (address < end) {
 420                 unmap_pmd_range(dir, address, end - address);
 421                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 422                 dir++;
 423         }
 424         invalidate();
 425         return 0;
 426 }
 427 
 428 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 429 {
 430         unsigned long end;
 431 
 432         address &= ~PMD_MASK;
 433         end = address + size;
 434         if (end > PMD_SIZE)
 435                 end = PMD_SIZE;
 436         do {
 437                 pte_t oldpage = *pte;
 438                 *pte = zero_pte;
 439                 forget_pte(oldpage);
 440                 address += PAGE_SIZE;
 441                 pte++;
 442         } while (address < end);
 443 }
 444 
 445 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 446 {
 447         unsigned long end;
 448 
 449         address &= ~PGDIR_MASK;
 450         end = address + size;
 451         if (end > PGDIR_SIZE)
 452                 end = PGDIR_SIZE;
 453         do {
 454                 pte_t * pte = pte_alloc(pmd, address);
 455                 if (!pte)
 456                         return -ENOMEM;
 457                 zeromap_pte_range(pte, address, end - address, zero_pte);
 458                 address = (address + PMD_SIZE) & PMD_MASK;
 459                 pmd++;
 460         } while (address < end);
 461         return 0;
 462 }
 463 
 464 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 465 {
 466         int error = 0;
 467         pgd_t * dir;
 468         unsigned long end = address + size;
 469         pte_t zero_pte;
 470 
 471         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 472         dir = pgd_offset(current, address);
 473         while (address < end) {
 474                 pmd_t *pmd = pmd_alloc(dir, address);
 475                 error = -ENOMEM;
 476                 if (!pmd)
 477                         break;
 478                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 479                 if (error)
 480                         break;
 481                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 482                 dir++;
 483         }
 484         invalidate();
 485         return error;
 486 }
 487 
 488 /*
 489  * maps a range of physical memory into the requested pages. the old
 490  * mappings are removed. any references to nonexistent pages result
 491  * in null mappings (currently treated as "copy-on-access")
 492  */
 493 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 494         unsigned long offset, pgprot_t prot)
 495 {
 496         unsigned long end;
 497 
 498         address &= ~PMD_MASK;
 499         end = address + size;
 500         if (end > PMD_SIZE)
 501                 end = PMD_SIZE;
 502         do {
 503                 pte_t oldpage = *pte;
 504                 pte_clear(pte);
 505                 if (offset >= high_memory || (mem_map[MAP_NR(offset)] & MAP_PAGE_RESERVED))
 506                         *pte = mk_pte(offset, prot);
 507                 else if (mem_map[MAP_NR(offset)]) {
 508                         mem_map[MAP_NR(offset)]++;
 509                         *pte = mk_pte(offset, prot);
 510                 }
 511                 forget_pte(oldpage);
 512                 address += PAGE_SIZE;
 513                 offset += PAGE_SIZE;
 514                 pte++;
 515         } while (address < end);
 516 }
 517 
 518 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 519         unsigned long offset, pgprot_t prot)
 520 {
 521         unsigned long end;
 522 
 523         address &= ~PGDIR_MASK;
 524         end = address + size;
 525         if (end > PGDIR_SIZE)
 526                 end = PGDIR_SIZE;
 527         offset -= address;
 528         do {
 529                 pte_t * pte = pte_alloc(pmd, address);
 530                 if (!pte)
 531                         return -ENOMEM;
 532                 remap_pte_range(pte, address, end - address, address + offset, prot);
 533                 address = (address + PMD_SIZE) & PMD_MASK;
 534                 pmd++;
 535         } while (address < end);
 536         return 0;
 537 }
 538 
 539 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 540 {
 541         int error = 0;
 542         pgd_t * dir;
 543         unsigned long end = from + size;
 544 
 545         offset -= from;
 546         dir = pgd_offset(current, from);
 547         while (from < end) {
 548                 pmd_t *pmd = pmd_alloc(dir, from);
 549                 error = -ENOMEM;
 550                 if (!pmd)
 551                         break;
 552                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 553                 if (error)
 554                         break;
 555                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 556                 dir++;
 557         }
 558         invalidate();
 559         return error;
 560 }
 561 
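/*
 * A minimal sketch, kept compiled out, of the usual caller of
 * remap_page_range(): a character driver's mmap() operation remapping a
 * physical buffer into the faulting process.  The device itself and
 * EXAMPLE_PHYS_BASE are assumptions made for the example.
 */
#if 0
static int example_mmap(struct inode * inode, struct file * file,
        struct vm_area_struct * vma)
{
        /* physical address requested, relative to the start of the buffer */
        unsigned long phys = EXAMPLE_PHYS_BASE + vma->vm_offset;

        if (remap_page_range(vma->vm_start, phys,
            vma->vm_end - vma->vm_start, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}
#endif
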
 562 /*
 563  * sanity-check function..
 564  */
 565 static void put_page(pte_t * page_table, pte_t pte)
 566 {
 567         if (!pte_none(*page_table)) {
 568                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 569                 free_page(pte_page(pte));
 570                 return;
 571         }
 572 /* no need for invalidate */
 573         *page_table = pte;
 574 }
 575 
 576 /*
 577  * This routine is used to map a page into an address space: needed by
 578  * execve() for the initial stack and environment pages.
 579  */
 580 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 581 {
 582         pgd_t * pgd;
 583         pmd_t * pmd;
 584         pte_t * pte;
 585 
 586         if (page >= high_memory)
 587                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 588         if (mem_map[MAP_NR(page)] != 1)
 589                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 590         pgd = pgd_offset(tsk,address);
 591         pmd = pmd_alloc(pgd, address);
 592         if (!pmd) {
 593                 free_page(page);
 594                 oom(tsk);
 595                 return 0;
 596         }
 597         pte = pte_alloc(pmd, address);
 598         if (!pte) {
 599                 free_page(page);
 600                 oom(tsk);
 601                 return 0;
 602         }
 603         if (!pte_none(*pte)) {
 604                 printk("put_dirty_page: page already exists\n");
 605                 pte_clear(pte);
 606                 invalidate();
 607         }
 608         *pte = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)));
 609 /* no need for invalidate */
 610         return page;
 611 }
 612 
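/*
 * A minimal sketch, kept compiled out, of the execve() path that
 * put_dirty_page() serves: the argument and environment pages gathered by
 * the binary loader are pushed into the new stack one page at a time.  The
 * names below follow the pattern of fs/exec.c but are illustrative here.
 */
#if 0
static void setup_arg_pages_sketch(struct task_struct * tsk,
        unsigned long * pages, int npages, unsigned long stack_base)
{
        int i;

        for (i = 0; i < npages; i++) {
                if (pages[i]) {
                        tsk->mm->rss++;
                        put_dirty_page(tsk, pages[i], stack_base);
                }
                stack_base += PAGE_SIZE;
        }
}
#endif
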
 613 /*
 614  * This routine handles present pages, when users try to write
 615  * to a shared page. It is done by copying the page to a new address
 616  * and decrementing the shared-page counter for the old page.
 617  *
 618  * Goto-purists beware: the only reason for goto's here is that it results
 619  * in better assembly code.. The "default" path will see no jumps at all.
 620  *
 621  * Note that this routine assumes that the protection checks have been
 622  * done by the caller (the low-level page fault routine in most cases).
 623  * Thus we can safely just mark it writable once we've done any necessary
 624  * COW.
 625  *
 626  * We also mark the page dirty at this point even though the page will
 627  * change only once the write actually happens. This avoids a few races,
 628  * and potentially makes it more efficient.
 629  */
 630 void do_wp_page(struct vm_area_struct * vma, unsigned long address,
 631         int write_access)
 632 {
 633         pgd_t *page_dir;
 634         pmd_t *page_middle;
 635         pte_t *page_table, pte;
 636         unsigned long old_page, new_page;
 637 
 638         new_page = __get_free_page(GFP_KERNEL);
 639         page_dir = pgd_offset(vma->vm_task,address);
 640         if (pgd_none(*page_dir))
 641                 goto end_wp_page;
 642         if (pgd_bad(*page_dir))
 643                 goto bad_wp_pagedir;
 644         page_middle = pmd_offset(page_dir, address);
 645         if (pmd_none(*page_middle))
 646                 goto end_wp_page;
 647         if (pmd_bad(*page_middle))
 648                 goto bad_wp_pagemiddle;
 649         page_table = pte_offset(page_middle, address);
 650         pte = *page_table;
 651         if (!pte_present(pte))
 652                 goto end_wp_page;
 653         if (pte_write(pte))
 654                 goto end_wp_page;
 655         old_page = pte_page(pte);
 656         if (old_page >= high_memory)
 657                 goto bad_wp_page;
 658         vma->vm_task->mm->min_flt++;
 659         /*
 660          * Do we need to copy?
 661          */
 662         if (mem_map[MAP_NR(old_page)] != 1) {
 663                 if (new_page) {
 664                         if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
 665                                 ++vma->vm_task->mm->rss;
 666                         copy_page(old_page,new_page);
 667                         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
 668                         free_page(old_page);
 669                         invalidate();
 670                         return;
 671                 }
 672                 *page_table = BAD_PAGE;
 673                 free_page(old_page);
 674                 oom(vma->vm_task);
 675                 invalidate();
 676                 return;
 677         }
 678         *page_table = pte_mkdirty(pte_mkwrite(pte));
 679         invalidate();
 680         if (new_page)
 681                 free_page(new_page);
 682         return;
 683 bad_wp_page:
 684         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 685         send_sig(SIGKILL, vma->vm_task, 1);
 686         goto end_wp_page;
 687 bad_wp_pagemiddle:
 688         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 689         send_sig(SIGKILL, vma->vm_task, 1);
 690         goto end_wp_page;
 691 bad_wp_pagedir:
 692         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 693         send_sig(SIGKILL, vma->vm_task, 1);
 694 end_wp_page:
 695         if (new_page)
 696                 free_page(new_page);
 697         return;
 698 }
 699 
 700 /*
 701  * Ugly, ugly, but the goto's result in better assembly..
 702  */
 703 int verify_area(int type, const void * addr, unsigned long size)
 704 {
 705         struct vm_area_struct * vma;
 706         unsigned long start = (unsigned long) addr;
 707 
 708         /* If the current user space is mapped to kernel space (for the
 709          * case where we use a fake user buffer with get_fs/set_fs()) we
 710          * don't expect to find the address in the user vm map.
 711          */
 712         if (get_fs() == get_ds())
 713                 return 0;
 714 
 715         vma = find_vma(current, start);
 716         if (!vma)
 717                 goto bad_area;
 718         if (vma->vm_start <= start)
 719                 goto good_area;
 720         if (!(vma->vm_flags & VM_GROWSDOWN))
 721                 goto bad_area;
 722         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 723                 goto bad_area;
 724 
 725 good_area:
 726         if (type == VERIFY_WRITE)
 727                 goto check_write;
 728         for (;;) {
 729                 struct vm_area_struct * next;
 730                 if (!(vma->vm_flags & VM_READ))
 731                         goto bad_area;
 732                 if (vma->vm_end - start >= size)
 733                         return 0;
 734                 next = vma->vm_next;
 735                 if (!next || vma->vm_end != next->vm_start)
 736                         goto bad_area;
 737                 vma = next;
 738         }
 739 
 740 check_write:
 741         if (!(vma->vm_flags & VM_WRITE))
 742                 goto bad_area;
 743         if (!wp_works_ok)
 744                 goto check_wp_fault_by_hand;
 745         for (;;) {
 746                 if (vma->vm_end - start >= size)
 747                         break;
 748                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 749                         goto bad_area;
 750                 vma = vma->vm_next;
 751                 if (!(vma->vm_flags & VM_WRITE))
 752                         goto bad_area;
 753         }
 754         return 0;
 755 
 756 check_wp_fault_by_hand:
 757         size--;
 758         size += start & ~PAGE_MASK;
 759         size >>= PAGE_SHIFT;
 760         start &= PAGE_MASK;
 761 
 762         for (;;) {
 763                 do_wp_page(vma, start, 1);
 764                 if (!size)
 765                         break;
 766                 size--;
 767                 start += PAGE_SIZE;
 768                 if (start < vma->vm_end)
 769                         continue;
 770                 vma = vma->vm_next;
 771                 if (!vma || vma->vm_start != start)
 772                         goto bad_area;
 773                 if (!(vma->vm_flags & VM_WRITE))
 774                         goto bad_area;
 775         }
 776         return 0;
 777 
 778 bad_area:
 779         return -EFAULT;
 780 }
 781 
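/*
 * A minimal sketch, kept compiled out, of the typical verify_area() caller:
 * a system call checks the user pointer before writing through the user
 * segment.  The system call itself and its result value are illustrative.
 */
#if 0
asmlinkage int sys_example_gettime(long * tloc)
{
        long result = 0;        /* stand-in for the value actually returned */
        int error;

        error = verify_area(VERIFY_WRITE, tloc, sizeof(long));
        if (error)
                return error;
        put_fs_long(result, (unsigned long *) tloc);
        return 0;
}
#endif
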
 782 static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
 783 {
 784         unsigned long tmp;
 785 
 786         if (!(tmp = get_free_page(GFP_KERNEL))) {
 787                 oom(vma->vm_task);
 788                 put_page(page_table, BAD_PAGE);
 789                 return;
 790         }
 791         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 792 }
 793 
 794 /*
 795  * try_to_share() checks the page at address "address" in the task "p",
 796  * to see if it exists, and if it is clean. If so, share it with the current
 797  * task.
 798  *
 799  * NOTE! This assumes we have checked that p != current, and that they
 800  * share the same inode and can generally otherwise be shared.
 801  */
 802 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 803         unsigned long from_address, struct vm_area_struct * from_area,
 804         unsigned long newpage)
 805 {
 806         pgd_t * from_dir, * to_dir;
 807         pmd_t * from_middle, * to_middle;
 808         pte_t * from_table, * to_table;
 809         pte_t from, to;
 810 
 811         from_dir = pgd_offset(from_area->vm_task,from_address);
 812 /* is there a page-directory at from? */
 813         if (pgd_none(*from_dir))
 814                 return 0;
 815         if (pgd_bad(*from_dir)) {
 816                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 817                 pgd_clear(from_dir);
 818                 return 0;
 819         }
 820         from_middle = pmd_offset(from_dir, from_address);
 821 /* is there a mid-directory at from? */
 822         if (pmd_none(*from_middle))
 823                 return 0;
 824         if (pmd_bad(*from_middle)) {
 825                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 826                 pmd_clear(from_middle);
 827                 return 0;
 828         }
 829         from_table = pte_offset(from_middle, from_address);
 830         from = *from_table;
 831 /* is the page present? */
 832         if (!pte_present(from))
 833                 return 0;
 834 /* if it is dirty it must be from a shared mapping to be shared */
 835         if (pte_dirty(from)) {
 836                 if (!(from_area->vm_flags & VM_SHARED))
 837                         return 0;
 838                 if (pte_write(from)) {
 839                         printk("nonwritable, but dirty, shared page\n");
 840                         return 0;
 841                 }
 842         }
 843 /* is the page reasonable at all? */
 844         if (pte_page(from) >= high_memory)
 845                 return 0;
 846         if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
 847                 return 0;
 848 /* is the destination ok? */
 849         to_dir = pgd_offset(to_area->vm_task,to_address);
 850 /* is there a page-directory at to? */
 851         if (pgd_none(*to_dir))
 852                 return 0;
 853         if (pgd_bad(*to_dir)) {
 854                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 855                 return 0;
 856         }
 857         to_middle = pmd_offset(to_dir, to_address);
 858 /* is there a mid-directory at to? */
 859         if (pmd_none(*to_middle))
 860                 return 0;
 861         if (pmd_bad(*to_middle)) {
 862                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 863                 return 0;
 864         }
 865         to_table = pte_offset(to_middle, to_address);
 866         to = *to_table;
 867         if (!pte_none(to))
 868                 return 0;
 869 /* do we copy? */
 870         if (newpage) {
 871                 /* if it's in the swap cache, it's dirty by implication */
 872                 /* so we can't use it if it's not from a shared mapping */
 873                 if (in_swap_cache(pte_page(from))) {
 874                         if (!(from_area->vm_flags & VM_SHARED))
 875                                 return 0;
 876                         if (!pte_write(from)) {
 877                                 printk("nonwritable, but dirty, shared page\n");
 878                                 return 0;
 879                         }
 880                 }
 881                 copy_page(pte_page(from), newpage);
 882                 *to_table = mk_pte(newpage, to_area->vm_page_prot);
 883                 return 1;
 884         }
 885 /*
 886  * do a final swap-cache test before sharing them: if it's in the swap
 887  * cache, we have to remove it now, as we get two pointers to the same
 888  * physical page and the cache can't handle it. Mark the original dirty.
 889  *
 890  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 891  * with a dirty "from", the from-mapping is a shared map, so we can trust
 892  * the page contents to be up-to-date
 893  */
 894         if (in_swap_cache(pte_page(from))) {
 895                 if (!(from_area->vm_flags & VM_SHARED))
 896                         return 0;
 897                 *from_table = pte_mkdirty(from);
 898                 delete_from_swap_cache(pte_page(from));
 899         }
 900         mem_map[MAP_NR(pte_page(from))]++;
 901         *to_table = mk_pte(pte_page(from), to_area->vm_page_prot);
 902 /* Check if we need to do anything at all to the 'from' field */
 903         if (!pte_write(from))
 904                 return 1;
 905         if (from_area->vm_flags & VM_SHARED)
 906                 return 1;
 907 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 908         *from_table = pte_wrprotect(from);
 909         invalidate();
 910         return 1;
 911 }
 912 
 913 /*
 914  * share_page() tries to find a process that could share a page with
 915  * the current one.
 916  *
 917  * We first check if it is at all feasible by checking inode->i_count.
 918  * It should be >1 if there are other tasks sharing this inode.
 919  */
 920 static int share_page(struct vm_area_struct * area, unsigned long address,
 921         int write_access, unsigned long newpage)
 922 {
 923         struct inode * inode;
 924         unsigned long offset;
 925         unsigned long from_address;
 926         unsigned long give_page;
 927         struct vm_area_struct * mpnt;
 928 
 929         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 930                 return 0;
 931         /* do we need to copy or can we just share? */
 932         give_page = 0;
 933         if (write_access && !(area->vm_flags & VM_SHARED)) {
 934                 if (!newpage)
 935                         return 0;
 936                 give_page = newpage;
 937         }
 938         offset = address - area->vm_start + area->vm_offset;
 939         /* See if there is something in the VM we can share pages with. */
 940         /* Traverse the entire circular i_mmap list, except `area' itself. */
 941         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 942                 /* must be same inode */
 943                 if (mpnt->vm_inode != inode) {
 944                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 945                         break;  
 946                 }
 947                 /* offsets must be mutually page-aligned */
 948                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 949                         continue;
 950                 /* the other area must actually cover the wanted page.. */
 951                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 952                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 953                         continue;
 954                 /* .. NOW we can actually try to use the same physical page */
 955                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 956                         continue;
 957                 /* free newpage if we never used it.. */
 958                 if (give_page || !newpage)
 959                         return 1;
 960                 free_page(newpage);
 961                 return 1;
 962         }
 963         return 0;
 964 }
 965 
 966 /*
 967  * fill in an empty page-table if none exists.
 968  */
 969 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 970 {
 971         pgd_t *pgd;
 972         pmd_t *pmd;
 973         pte_t *pte;
 974 
 975         pgd = pgd_offset(tsk, address);
 976         pmd = pmd_alloc(pgd, address);
 977         if (!pmd) {
 978                 oom(tsk);
 979                 return NULL;
 980         }
 981         pte = pte_alloc(pmd, address);
 982         if (!pte) {
 983                 oom(tsk);
 984                 return NULL;
 985         }
 986         return pte;
 987 }
 988 
 989 static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address,
 990         pte_t * page_table, pte_t entry, int write_access)
 991 {
 992         pte_t page;
 993 
 994         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 995                 swap_in(vma, page_table, pte_val(entry), write_access);
 996                 return;
 997         }
 998         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 999         if (pte_val(*page_table) != pte_val(entry)) {
1000                 free_page(pte_page(page));
1001                 return;
1002         }
1003         if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
1004                 page = pte_wrprotect(page);
1005         ++vma->vm_task->mm->rss;
1006         ++vma->vm_task->mm->maj_flt;
1007         *page_table = page;
1008         return;
1009 }
1010 
1011 /*
1012  * do_no_page() tries to create a new page mapping. It aggressively
1013  * tries to share with existing pages, but makes a separate copy if
1014  * the "write_access" parameter is true in order to avoid the next
1015  * page fault.
1016  */
1017 void do_no_page(struct vm_area_struct * vma, unsigned long address,
1018         int write_access)
1019 {
1020         pte_t * page_table;
1021         pte_t entry;
1022         unsigned long page;
1023 
1024         page_table = get_empty_pgtable(vma->vm_task,address);
1025         if (!page_table)
1026                 return;
1027         entry = *page_table;
1028         if (pte_present(entry))
1029                 return;
1030         if (!pte_none(entry)) {
1031                 do_swap_page(vma, address, page_table, entry, write_access);
1032                 return;
1033         }
1034         address &= PAGE_MASK;
1035         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1036                 ++vma->vm_task->mm->rss;
1037                 ++vma->vm_task->mm->min_flt;
1038                 get_empty_page(vma, page_table);
1039                 return;
1040         }
1041         page = __get_free_page(GFP_KERNEL);
1042         if (share_page(vma, address, write_access, page)) {
1043                 ++vma->vm_task->mm->min_flt;
1044                 ++vma->vm_task->mm->rss;
1045                 return;
1046         }
1047         if (!page) {
1048                 oom(current);
1049                 put_page(page_table, BAD_PAGE);
1050                 return;
1051         }
1052         ++vma->vm_task->mm->maj_flt;
1053         ++vma->vm_task->mm->rss;
1054         /*
1055          * The fourth argument is "no_share", which tells the low-level code
1056          * to copy, not share the page even if sharing is possible.  It's
1057          * essentially an early COW detection 
1058          */
1059         page = vma->vm_ops->nopage(vma, address, page,
1060                 write_access && !(vma->vm_flags & VM_SHARED));
1061         if (share_page(vma, address, write_access, 0)) {
1062                 free_page(page);
1063                 return;
1064         }
1065         /*
1066          * This silly early PAGE_DIRTY setting removes a race
1067          * due to the bad i386 page protection. But it's valid
1068          * for other architectures too.
1069          *
1070          * Note that if write_access is true, we either now have
1071  * an exclusive copy of the page, or this is a shared mapping,
1072          * so we can make it writable and dirty to avoid having to
1073          * handle that later.
1074          */
1075         entry = mk_pte(page, vma->vm_page_prot);
1076         if (write_access) {
1077                 entry = pte_mkwrite(pte_mkdirty(entry));
1078         } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
1079                 entry = pte_wrprotect(entry);
1080         put_page(page_table, entry);
1081 }
1082 
1083 /*
1084  * The above separate functions for the no-page and wp-page
1085  * cases will go away (they mostly do the same thing anyway),
1086  * and we'll instead use only a general "handle_mm_fault()".
1087  *
1088  * These routines also need to handle stuff like marking pages dirty
1089  * and/or accessed for architectures that don't do it in hardware (most
1090  * RISC architectures).  The early dirtying is also good on the i386.
1091  *
1092  * There is also a hook called "update_mmu_cache()" that architectures
1093  * with external mmu caches can use to update those (ie the Sparc or
1094  * PowerPC hashed page tables that act as extended TLBs).
1095  */
1096 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1097         int write_access, pte_t * pte)
1098 {
1099         if (!pte_present(*pte)) {
1100                 do_no_page(vma, address, write_access);
1101                 return;
1102         }
1103         *pte = pte_mkyoung(*pte);
1104         if (!write_access)
1105                 return;
1106         if (pte_write(*pte)) {
1107                 *pte = pte_mkdirty(*pte);
1108                 return;
1109         }
1110         do_wp_page(vma, address, write_access);
1111 }
1112 
1113 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1114         int write_access)
1115 {
1116         pgd_t *pgd;
1117         pmd_t *pmd;
1118         pte_t *pte;
1119 
1120         pgd = pgd_offset(vma->vm_task, address);
1121         pmd = pmd_alloc(pgd, address);
1122         if (!pmd)
1123                 goto no_memory;
1124         pte = pte_alloc(pmd, address);
1125         if (!pte)
1126                 goto no_memory;
1127         handle_pte_fault(vma, address, write_access, pte);
1128         update_mmu_cache(vma, address, *pte);
1129         return;
1130 no_memory:
1131         oom(vma->vm_task);
1132 }
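
/*
 * A minimal sketch, kept compiled out, of the expected caller of
 * handle_mm_fault(): the architecture's page-fault handler finds the vma,
 * checks the access rights, and leaves demand loading, swap-in and COW to
 * the routine above.  Error-code decoding and stack growth are omitted, so
 * treat this as an outline rather than the real i386 handler.
 */
#if 0
static void page_fault_outline(unsigned long address, int write_access)
{
        struct vm_area_struct * vma;

        vma = find_vma(current, address);
        if (!vma || vma->vm_start > address)
                goto bad_area;          /* no mapping (stack growth omitted) */
        if (write_access) {
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
        } else if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
                goto bad_area;
        handle_mm_fault(vma, address, write_access);
        return;
bad_area:
        send_sig(SIGSEGV, current, 1);
}
#endif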
