root/mm/memory.c


DEFINITIONS

This source file includes the following definitions:
  1. oom
  2. free_one_pte
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. clone_page_tables
  8. copy_one_pte
  9. copy_one_pmd
  10. copy_one_pgd
  11. copy_page_tables
  12. forget_pte
  13. unmap_pte_range
  14. unmap_pmd_range
  15. unmap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/config.h>
  37 #include <linux/signal.h>
  38 #include <linux/sched.h>
  39 #include <linux/head.h>
  40 #include <linux/kernel.h>
  41 #include <linux/errno.h>
  42 #include <linux/string.h>
  43 #include <linux/types.h>
  44 #include <linux/ptrace.h>
  45 #include <linux/mman.h>
  46 #include <linux/mm.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 
  52 unsigned long high_memory = 0;
  53 
  54 /*
  55  * The free_area_list arrays point to the queue heads of the free areas
  56  * of different sizes
  57  */
  58 int nr_swap_pages = 0;
  59 int nr_free_pages = 0;
  60 struct mem_list free_area_list[NR_MEM_LISTS];
  61 unsigned char * free_area_map[NR_MEM_LISTS];
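
/*
 * A minimal sketch of the sizing convention assumed above: list i is taken
 * to hold free blocks of 2^i pages, so an allocation of "pages" pages would
 * be served from the first sufficiently large order. The helper name
 * free_area_order() is an illustrative assumption, not part of this file.
 */
#if 0
static inline int free_area_order(unsigned long pages)
{
        int order = 0;

        while ((1UL << order) < pages && order < NR_MEM_LISTS - 1)
                order++;
        return order;   /* index into free_area_list[] / free_area_map[] */
}
#endif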
  62 
  63 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  64 
  65 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  66 
  67 mem_map_t * mem_map = NULL;
  68 
  69 /*
  70  * oom() prints a message (so that the user knows why the process died),
  71  * and gives the process an untrappable SIGKILL.
  72  */
  73 void oom(struct task_struct * task)
  74 {
  75         printk("\nOut of memory for %s.\n", current->comm);
  76         task->sigaction[SIGKILL-1].sa_handler = NULL;
  77         task->blocked &= ~(1<<(SIGKILL-1));
  78         send_sig(SIGKILL,task,1);
  79 }
  80 
  81 static inline void free_one_pte(pte_t * page_table)
  82 {
  83         pte_t page = *page_table;
  84 
  85         if (pte_none(page))
  86                 return;
  87         pte_clear(page_table);
  88         if (!pte_present(page)) {
  89                 swap_free(pte_val(page));
  90                 return;
  91         }
  92         free_page(pte_page(page));
  93         return;
  94 }
  95 
  96 static inline void free_one_pmd(pmd_t * dir)
  97 {
  98         int j;
  99         pte_t * pte;
 100 
 101         if (pmd_none(*dir))
 102                 return;
 103         if (pmd_bad(*dir)) {
 104                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 105                 pmd_clear(dir);
 106                 return;
 107         }
 108         pte = pte_offset(dir, 0);
 109         pmd_clear(dir);
 110         if (pte_inuse(pte)) {
 111                 pte_free(pte);
 112                 return;
 113         }
 114         for (j = 0; j < PTRS_PER_PTE ; j++)
 115                 free_one_pte(pte+j);
 116         pte_free(pte);
 117 }
 118 
 119 static inline void free_one_pgd(pgd_t * dir)
 120 {
 121         pmd_t * pmd;
 122 
 123         if (pgd_none(*dir))
 124                 return;
 125         if (pgd_bad(*dir)) {
 126                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 127                 pgd_clear(dir);
 128                 return;
 129         }
 130         pmd = pmd_offset(dir, 0);
 131         pgd_clear(dir);
 132         if (!pmd_inuse(pmd)) {
 133                 int j;
 134                 for (j = 0; j < PTRS_PER_PMD ; j++)
 135                         free_one_pmd(pmd+j);
 136         }
 137         pmd_free(pmd);
 138 }
 139         
 140 
 141 /*
 142  * This function clears all user-level page tables of a process - this
 143  * is needed by execve(), so that old pages aren't in the way. Note that
 144  * unlike 'free_page_tables()', this function still leaves a valid
 145  * page-table-tree in memory: it just removes the user pages. The two
 146  * functions are similar, but there is a fundamental difference.
 147  */
 148 void clear_page_tables(struct task_struct * tsk)
 149 {
 150         int i;
 151         pgd_t * page_dir;
 152 
 153         if (!tsk)
 154                 return;
 155         if (tsk == task[0])
 156                 panic("task[0] (swapper) doesn't support exec()\n");
 157         page_dir = pgd_offset(tsk->mm, 0);
 158         if (!page_dir || page_dir == swapper_pg_dir) {
 159                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 160                 return;
 161         }
 162         if (pgd_inuse(page_dir)) {
 163                 pgd_t * new_pg;
 164 
 165                 if (!(new_pg = pgd_alloc())) {
 166                         oom(tsk);
 167                         return;
 168                 }
 169                 for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 170                         new_pg[i] = page_dir[i];
 171                 SET_PAGE_DIR(tsk, new_pg);
 172                 tsk->mm->pgd = new_pg;
 173                 pgd_free(page_dir);
 174                 return;
 175         }
 176         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 177                 free_one_pgd(page_dir + i);
 178         invalidate();
 179         return;
 180 }
 181 
 182 /*
 183  * This function frees up all page tables of a process when it exits.
 184  */
 185 void free_page_tables(struct task_struct * tsk)
 186 {
 187         int i;
 188         pgd_t * page_dir;
 189 
 190         if (!tsk)
 191                 return;
 192         if (tsk == task[0]) {
 193                 printk("task[0] (swapper) killed: unable to recover\n");
 194                 panic("Trying to free up swapper memory space");
 195         }
 196         page_dir = pgd_offset(tsk->mm, 0);
 197         if (!page_dir || page_dir == swapper_pg_dir) {
 198                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 199                 return;
 200         }
 201         SET_PAGE_DIR(tsk, swapper_pg_dir);
 202         if (pgd_inuse(page_dir)) {
 203                 pgd_free(page_dir);
 204                 return;
 205         }
 206         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 207         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 208                 free_one_pgd(page_dir + i);
 209         pgd_free(page_dir);
 210         invalidate();
 211 }
 212 
 213 /*
 214  * clone_page_tables() clones the page table for a process - both
 215  * processes will have the exact same pages in memory. There are
 216  * probably races in the memory management with cloning, but we'll
 217  * see..
 218  */
 219 int clone_page_tables(struct task_struct * tsk)
 220 {
 221         pgd_t * pg_dir;
 222 
 223         pg_dir = pgd_offset(current->mm, 0);
 224         pgd_reuse(pg_dir);
 225         SET_PAGE_DIR(tsk, pg_dir);
 226         tsk->mm->pgd = pg_dir;
 227         return 0;
 228 }
 229 
 230 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 231 {
 232         pte_t pte = *old_pte;
 233 
 234         if (pte_none(pte))
 235                 return;
 236         if (!pte_present(pte)) {
 237                 swap_duplicate(pte_val(pte));
 238                 set_pte(new_pte, pte);
 239                 return;
 240         }
 241         if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
 242                 set_pte(new_pte, pte);
 243                 return;
 244         }
 245         if (pte_cow(pte))
 246                 pte = pte_wrprotect(pte);
 247         if (delete_from_swap_cache(pte_page(pte)))
 248                 pte = pte_mkdirty(pte);
 249         set_pte(new_pte, pte_mkold(pte));
 250         set_pte(old_pte, pte);
 251         mem_map[MAP_NR(pte_page(pte))]++;
 252 }
 253 
 254 static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd)
 255 {
 256         int j;
 257         pte_t *old_pte, *new_pte;
 258 
 259         if (pmd_none(*old_pmd))
 260                 return 0;
 261         if (pmd_bad(*old_pmd)) {
 262                 printk("copy_one_pmd: bad page table (%08lx): probable memory corruption\n", pmd_val(*old_pmd));
 263                 pmd_clear(old_pmd);
 264                 return 0;
 265         }
 266         old_pte = pte_offset(old_pmd, 0);
 267         if (pte_inuse(old_pte)) {
 268                 pte_reuse(old_pte);
 269                 *new_pmd = *old_pmd;
 270                 return 0;
 271         }
 272         new_pte = pte_alloc(new_pmd, 0);
 273         if (!new_pte)
 274                 return -ENOMEM;
 275         for (j = 0 ; j < PTRS_PER_PTE ; j++) {
 276                 copy_one_pte(old_pte, new_pte);
 277                 old_pte++;
 278                 new_pte++;
 279         }
 280         return 0;
 281 }
 282 
 283 static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd)
 284 {
 285         int j;
 286         pmd_t *old_pmd, *new_pmd;
 287 
 288         if (pgd_none(*old_pgd))
 289                 return 0;
 290         if (pgd_bad(*old_pgd)) {
 291                 printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd));
 292                 pgd_clear(old_pgd);
 293                 return 0;
 294         }
 295         old_pmd = pmd_offset(old_pgd, 0);
 296         if (pmd_inuse(old_pmd)) {
 297                 pmd_reuse(old_pmd);
 298                 *new_pgd = *old_pgd;
 299                 return 0;
 300         }
 301         new_pmd = pmd_alloc(new_pgd, 0);
 302         if (!new_pmd)
 303                 return -ENOMEM;
 304         for (j = 0 ; j < PTRS_PER_PMD ; j++) {
 305                 int error = copy_one_pmd(old_pmd, new_pmd);
 306                 if (error)
 307                         return error;
 308                 old_pmd++;
 309                 new_pmd++;
 310         }
 311         return 0;
 312 }
 313 
 314 /*
 315  * copy_page_tables() just copies the whole process memory range:
 316  * note the special handling of RESERVED (ie kernel) pages, which
 317  * means that they are always shared by all processes.
 318  */
 319 int copy_page_tables(struct task_struct * tsk)
 320 {
 321         int i;
 322         pgd_t *old_pgd;
 323         pgd_t *new_pgd;
 324 
 325         new_pgd = pgd_alloc();
 326         if (!new_pgd)
 327                 return -ENOMEM;
 328         SET_PAGE_DIR(tsk, new_pgd);
 329         tsk->mm->pgd = new_pgd;
 330         old_pgd = pgd_offset(current->mm, 0);
 331         for (i = 0 ; i < PTRS_PER_PGD ; i++) {
 332                 int errno = copy_one_pgd(old_pgd, new_pgd);
 333                 if (errno) {
 334                         free_page_tables(tsk);
 335                         invalidate();
 336                         return errno;
 337                 }
 338                 old_pgd++;
 339                 new_pgd++;
 340         }
 341         invalidate();
 342         return 0;
 343 }
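
/*
 * A hedged sketch of how the two fork-time paths in this file might be
 * chosen by process-creation code: clone_page_tables() when the new task
 * is to share the address space outright, copy_page_tables() for the
 * usual copy-on-write duplicate. The function name sketch_copy_mm() and
 * the share_vm flag are illustrative assumptions, not the real fork code.
 */
#if 0
static int sketch_copy_mm(int share_vm, struct task_struct * tsk)
{
        if (share_vm)
                return clone_page_tables(tsk);  /* both tasks reuse one pgd */
        return copy_page_tables(tsk);           /* per-entry COW copy */
}
#endif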
 344 
 345 static inline void forget_pte(pte_t page)
 346 {
 347         if (pte_none(page))
 348                 return;
 349         if (pte_present(page)) {
 350                 free_page(pte_page(page));
 351                 if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED)
 352                         return;
 353                 if (current->mm->rss <= 0)
 354                         return;
 355                 current->mm->rss--;
 356                 return;
 357         }
 358         swap_free(pte_val(page));
 359 }
 360 
 361 static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 362 {
 363         pte_t * pte;
 364         unsigned long end;
 365 
 366         if (pmd_none(*pmd))
 367                 return;
 368         if (pmd_bad(*pmd)) {
 369                 printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 370                 pmd_clear(pmd);
 371                 return;
 372         }
 373         pte = pte_offset(pmd, address);
 374         address &= ~PMD_MASK;
 375         end = address + size;
 376         if (end >= PMD_SIZE)
 377                 end = PMD_SIZE;
 378         do {
 379                 pte_t page = *pte;
 380                 pte_clear(pte);
 381                 forget_pte(page);
 382                 address += PAGE_SIZE;
 383                 pte++;
 384         } while (address < end);
 385 }
 386 
 387 static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 388 {
 389         pmd_t * pmd;
 390         unsigned long end;
 391 
 392         if (pgd_none(*dir))
 393                 return;
 394         if (pgd_bad(*dir)) {
 395                 printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 396                 pgd_clear(dir);
 397                 return;
 398         }
 399         pmd = pmd_offset(dir, address);
 400         address &= ~PGDIR_MASK;
 401         end = address + size;
 402         if (end > PGDIR_SIZE)
 403                 end = PGDIR_SIZE;
 404         do {
 405                 unmap_pte_range(pmd, address, end - address);
 406                 address = (address + PMD_SIZE) & PMD_MASK; 
 407                 pmd++;
 408         } while (address < end);
 409 }
 410 
 411 /*
 412  * a more complete version of free_page_tables which performs with page
 413  * granularity.
 414  */
 415 int unmap_page_range(unsigned long address, unsigned long size)
 416 {
 417         pgd_t * dir;
 418         unsigned long end = address + size;
 419 
 420         dir = pgd_offset(current->mm, address);
 421         while (address < end) {
 422                 unmap_pmd_range(dir, address, end - address);
 423                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 424                 dir++;
 425         }
 426         invalidate();
 427         return 0;
 428 }
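
/*
 * A minimal usage sketch, assuming a caller in the munmap style that has
 * already found the region: the address is checked for page alignment,
 * the length rounded up to whole pages, and the whole range is torn down
 * in one call (unmap_page_range() itself invalidates the TLB). The
 * function name is an illustrative assumption.
 */
#if 0
static int sketch_unmap(unsigned long addr, unsigned long len)
{
        if (addr & ~PAGE_MASK)
                return -EINVAL;
        len = (len + ~PAGE_MASK) & PAGE_MASK;   /* round up to whole pages */
        return unmap_page_range(addr, len);
}
#endif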
 429 
 430 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 431 {
 432         unsigned long end;
 433 
 434         address &= ~PMD_MASK;
 435         end = address + size;
 436         if (end > PMD_SIZE)
 437                 end = PMD_SIZE;
 438         do {
 439                 pte_t oldpage = *pte;
 440                 set_pte(pte, zero_pte);
 441                 forget_pte(oldpage);
 442                 address += PAGE_SIZE;
 443                 pte++;
 444         } while (address < end);
 445 }
 446 
 447 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 448 {
 449         unsigned long end;
 450 
 451         address &= ~PGDIR_MASK;
 452         end = address + size;
 453         if (end > PGDIR_SIZE)
 454                 end = PGDIR_SIZE;
 455         do {
 456                 pte_t * pte = pte_alloc(pmd, address);
 457                 if (!pte)
 458                         return -ENOMEM;
 459                 zeromap_pte_range(pte, address, end - address, zero_pte);
 460                 address = (address + PMD_SIZE) & PMD_MASK;
 461                 pmd++;
 462         } while (address < end);
 463         return 0;
 464 }
 465 
 466 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 467 {
 468         int error = 0;
 469         pgd_t * dir;
 470         unsigned long end = address + size;
 471         pte_t zero_pte;
 472 
 473         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 474         dir = pgd_offset(current->mm, address);
 475         while (address < end) {
 476                 pmd_t *pmd = pmd_alloc(dir, address);
 477                 error = -ENOMEM;
 478                 if (!pmd)
 479                         break;
 480                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 481                 if (error)
 482                         break;
 483                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 484                 dir++;
 485         }
 486         invalidate();
 487         return error;
 488 }
 489 
 490 /*
 491  * Maps a range of physical memory into the requested pages. The old
 492  * mappings are removed. Any references to nonexistent pages result
 493  * in null mappings (currently treated as "copy-on-access")
 494  */
 495 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 496         unsigned long offset, pgprot_t prot)
 497 {
 498         unsigned long end;
 499 
 500         address &= ~PMD_MASK;
 501         end = address + size;
 502         if (end > PMD_SIZE)
 503                 end = PMD_SIZE;
 504         do {
 505                 pte_t oldpage = *pte;
 506                 pte_clear(pte);
 507                 if (offset >= high_memory || (mem_map[MAP_NR(offset)] & MAP_PAGE_RESERVED))
 508                         set_pte(pte, mk_pte(offset, prot));
 509                 forget_pte(oldpage);
 510                 address += PAGE_SIZE;
 511                 offset += PAGE_SIZE;
 512                 pte++;
 513         } while (address < end);
 514 }
 515 
 516 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 517         unsigned long offset, pgprot_t prot)
 518 {
 519         unsigned long end;
 520 
 521         address &= ~PGDIR_MASK;
 522         end = address + size;
 523         if (end > PGDIR_SIZE)
 524                 end = PGDIR_SIZE;
 525         offset -= address;
 526         do {
 527                 pte_t * pte = pte_alloc(pmd, address);
 528                 if (!pte)
 529                         return -ENOMEM;
 530                 remap_pte_range(pte, address, end - address, address + offset, prot);
 531                 address = (address + PMD_SIZE) & PMD_MASK;
 532                 pmd++;
 533         } while (address < end);
 534         return 0;
 535 }
 536 
 537 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 538 {
 539         int error = 0;
 540         pgd_t * dir;
 541         unsigned long end = from + size;
 542 
 543         offset -= from;
 544         dir = pgd_offset(current->mm, from);
 545         while (from < end) {
 546                 pmd_t *pmd = pmd_alloc(dir, from);
 547                 error = -ENOMEM;
 548                 if (!pmd)
 549                         break;
 550                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 551                 if (error)
 552                         break;
 553                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 554                 dir++;
 555         }
 556         invalidate();
 557         return error;
 558 }
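
/*
 * A hedged sketch of the classic caller of remap_page_range(): a device
 * driver's mmap() operation mapping a physical buffer into user space.
 * The handler name, the phys_base variable and the exact file-operation
 * signature are illustrative assumptions for this era of the kernel.
 */
#if 0
static unsigned long phys_base;         /* physical address of the buffer */

static int sketch_dev_mmap(struct inode * inode, struct file * file,
        struct vm_area_struct * vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        if (remap_page_range(vma->vm_start, phys_base + vma->vm_offset,
                             size, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}
#endif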
 559 
 560 /*
 561  * sanity-check function..
 562  */
 563 static void put_page(pte_t * page_table, pte_t pte)
 564 {
 565         if (!pte_none(*page_table)) {
 566                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 567                 free_page(pte_page(pte));
 568                 return;
 569         }
 570 /* no need for invalidate */
 571         *page_table = pte;
 572 }
 573 
 574 /*
 575  * This routine is used to map in a page into an address space: needed by
 576  * execve() for the initial stack and environment pages.
 577  */
 578 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 579 {
 580         pgd_t * pgd;
 581         pmd_t * pmd;
 582         pte_t * pte;
 583 
 584         if (page >= high_memory)
 585                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 586         if (mem_map[MAP_NR(page)] != 1)
 587                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 588         pgd = pgd_offset(tsk->mm,address);
 589         pmd = pmd_alloc(pgd, address);
 590         if (!pmd) {
 591                 free_page(page);
 592                 oom(tsk);
 593                 return 0;
 594         }
 595         pte = pte_alloc(pmd, address);
 596         if (!pte) {
 597                 free_page(page);
 598                 oom(tsk);
 599                 return 0;
 600         }
 601         if (!pte_none(*pte)) {
 602                 printk("put_dirty_page: page already exists\n");
 603                 pte_clear(pte);
 604                 invalidate();
 605         }
 606         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 607 /* no need for invalidate */
 608         return page;
 609 }
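
/*
 * A minimal sketch of the execve-time use described above: each page that
 * was filled with argument and environment strings is installed at its
 * final address below the top of the new user stack. The loop shape and
 * the names stack_page[] and stack_top are illustrative assumptions about
 * the caller, not the actual exec code.
 */
#if 0
static void sketch_setup_arg_pages(struct task_struct * tsk,
        unsigned long * stack_page, int npages, unsigned long stack_top)
{
        int i;

        for (i = 0; i < npages; i++)
                if (stack_page[i])
                        put_dirty_page(tsk, stack_page[i],
                                       stack_top + i * PAGE_SIZE);
}
#endif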
 610 
 611 /*
 612  * This routine handles present pages, when users try to write
 613  * to a shared page. It is done by copying the page to a new address
 614  * and decrementing the shared-page counter for the old page.
 615  *
 616  * Goto-purists beware: the only reason for goto's here is that it results
 617  * in better assembly code.. The "default" path will see no jumps at all.
 618  *
 619  * Note that this routine assumes that the protection checks have been
 620  * done by the caller (the low-level page fault routine in most cases).
 621  * Thus we can safely just mark it writable once we've done any necessary
 622  * COW.
 623  *
 624  * We also mark the page dirty at this point even though the page will
 625  * change only once the write actually happens. This avoids a few races,
 626  * and potentially makes it more efficient.
 627  */
 628 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 629         unsigned long address, int write_access)
 630 {
 631         pgd_t *page_dir;
 632         pmd_t *page_middle;
 633         pte_t *page_table, pte;
 634         unsigned long old_page, new_page;
 635 
 636         new_page = __get_free_page(GFP_KERNEL);
 637         page_dir = pgd_offset(vma->vm_mm, address);
 638         if (pgd_none(*page_dir))
 639                 goto end_wp_page;
 640         if (pgd_bad(*page_dir))
 641                 goto bad_wp_pagedir;
 642         page_middle = pmd_offset(page_dir, address);
 643         if (pmd_none(*page_middle))
 644                 goto end_wp_page;
 645         if (pmd_bad(*page_middle))
 646                 goto bad_wp_pagemiddle;
 647         page_table = pte_offset(page_middle, address);
 648         pte = *page_table;
 649         if (!pte_present(pte))
 650                 goto end_wp_page;
 651         if (pte_write(pte))
 652                 goto end_wp_page;
 653         old_page = pte_page(pte);
 654         if (old_page >= high_memory)
 655                 goto bad_wp_page;
 656         vma->vm_mm->min_flt++;
 657         /*
 658          * Do we need to copy?
 659          */
 660         if (mem_map[MAP_NR(old_page)] != 1) {
 661                 if (new_page) {
 662                         if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
 663                                 ++vma->vm_mm->rss;
 664                         copy_page(old_page,new_page);
 665                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 666                         free_page(old_page);
 667                         invalidate();
 668                         return;
 669                 }
 670                 set_pte(page_table, BAD_PAGE);
 671                 free_page(old_page);
 672                 oom(tsk);
 673                 invalidate();
 674                 return;
 675         }
 676         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 677         invalidate();
 678         if (new_page)
 679                 free_page(new_page);
 680         return;
 681 bad_wp_page:
 682         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 683         send_sig(SIGKILL, tsk, 1);
 684         goto end_wp_page;
 685 bad_wp_pagemiddle:
 686         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 687         send_sig(SIGKILL, tsk, 1);
 688         goto end_wp_page;
 689 bad_wp_pagedir:
 690         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 691         send_sig(SIGKILL, tsk, 1);
 692 end_wp_page:
 693         if (new_page)
 694                 free_page(new_page);
 695         return;
 696 }
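
/*
 * An illustration, from the user side, of when the path above is taken:
 * after fork() parent and child map the same frames write-protected, and
 * the first write from either process faults into do_wp_page(), which
 * gives the writer a private copy. This is plain user-space code shown
 * only to make the trigger concrete; it is not kernel code.
 */
#if 0
#include <unistd.h>

int shared_after_fork = 1;      /* sits in a COW data page after fork() */

int cow_demo(void)
{
        if (fork() == 0) {
                shared_after_fork = 2;  /* write fault -> do_wp_page() copy */
                _exit(0);
        }
        return shared_after_fork;       /* parent still sees 1 */
}
#endif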
 697 
 698 /*
 699  * Ugly, ugly, but the goto's result in better assembly..
 700  */
 701 int verify_area(int type, const void * addr, unsigned long size)
 702 {
 703         struct vm_area_struct * vma;
 704         unsigned long start = (unsigned long) addr;
 705 
 706         /* If the current user space is mapped to kernel space (for the
 707          * case where we use a fake user buffer with get_fs/set_fs()) we
 708          * don't expect to find the address in the user vm map.
 709          */
 710         if (get_fs() == get_ds())
 711                 return 0;
 712 
 713         vma = find_vma(current, start);
 714         if (!vma)
 715                 goto bad_area;
 716         if (vma->vm_start <= start)
 717                 goto good_area;
 718         if (!(vma->vm_flags & VM_GROWSDOWN))
 719                 goto bad_area;
 720         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 721                 goto bad_area;
 722 
 723 good_area:
 724         if (type == VERIFY_WRITE)
 725                 goto check_write;
 726         for (;;) {
 727                 struct vm_area_struct * next;
 728                 if (!(vma->vm_flags & VM_READ))
 729                         goto bad_area;
 730                 if (vma->vm_end - start >= size)
 731                         return 0;
 732                 next = vma->vm_next;
 733                 if (!next || vma->vm_end != next->vm_start)
 734                         goto bad_area;
 735                 vma = next;
 736         }
 737 
 738 check_write:
 739         if (!(vma->vm_flags & VM_WRITE))
 740                 goto bad_area;
 741         if (!wp_works_ok)
 742                 goto check_wp_fault_by_hand;
 743         for (;;) {
 744                 if (vma->vm_end - start >= size)
 745                         break;
 746                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 747                         goto bad_area;
 748                 vma = vma->vm_next;
 749                 if (!(vma->vm_flags & VM_WRITE))
 750                         goto bad_area;
 751         }
 752         return 0;
 753 
 754 check_wp_fault_by_hand:
 755         size--;
 756         size += start & ~PAGE_MASK;
 757         size >>= PAGE_SHIFT;
 758         start &= PAGE_MASK;
 759 
 760         for (;;) {
 761                 do_wp_page(current, vma, start, 1);
 762                 if (!size)
 763                         break;
 764                 size--;
 765                 start += PAGE_SIZE;
 766                 if (start < vma->vm_end)
 767                         continue;
 768                 vma = vma->vm_next;
 769                 if (!vma || vma->vm_start != start)
 770                         goto bad_area;
 771                 if (!(vma->vm_flags & VM_WRITE))
 772                         goto bad_area;
 773         }
 774         return 0;
 775 
 776 bad_area:
 777         return -EFAULT;
 778 }
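
/*
 * A hedged usage sketch: a read()-style handler of this era would be
 * expected to verify the user buffer with verify_area() before copying
 * data out through the segment helpers from <asm/segment.h>. The handler
 * name and the kernel_buf source are illustrative assumptions.
 */
#if 0
static int sketch_read(struct inode * inode, struct file * file,
        char * buf, int count)
{
        extern char kernel_buf[];
        int error;

        error = verify_area(VERIFY_WRITE, buf, count);
        if (error)
                return error;
        memcpy_tofs(buf, kernel_buf, count);    /* copy to user space */
        return count;
}
#endif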
 779 
 780 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 781 {
 782         unsigned long tmp;
 783 
 784         if (!(tmp = get_free_page(GFP_KERNEL))) {
 785                 oom(tsk);
 786                 put_page(page_table, BAD_PAGE);
 787                 return;
 788         }
 789         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 790 }
 791 
 792 /*
 793  * try_to_share() checks the page at address "address" in the task "p",
 794  * to see if it exists, and if it is clean. If so, share it with the current
 795  * task.
 796  *
 797  * NOTE! This assumes we have checked that p != current, and that they
 798  * share the same inode and can generally otherwise be shared.
 799  */
 800 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 801         unsigned long from_address, struct vm_area_struct * from_area,
 802         unsigned long newpage)
 803 {
 804         pgd_t * from_dir, * to_dir;
 805         pmd_t * from_middle, * to_middle;
 806         pte_t * from_table, * to_table;
 807         pte_t from, to;
 808 
 809         from_dir = pgd_offset(from_area->vm_mm,from_address);
 810 /* is there a page-directory at from? */
 811         if (pgd_none(*from_dir))
 812                 return 0;
 813         if (pgd_bad(*from_dir)) {
 814                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 815                 pgd_clear(from_dir);
 816                 return 0;
 817         }
 818         from_middle = pmd_offset(from_dir, from_address);
 819 /* is there a mid-directory at from? */
 820         if (pmd_none(*from_middle))
 821                 return 0;
 822         if (pmd_bad(*from_middle)) {
 823                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 824                 pmd_clear(from_middle);
 825                 return 0;
 826         }
 827         from_table = pte_offset(from_middle, from_address);
 828         from = *from_table;
 829 /* is the page present? */
 830         if (!pte_present(from))
 831                 return 0;
 832 /* if it is dirty it must be from a shared mapping to be shared */
 833         if (pte_dirty(from)) {
 834                 if (!(from_area->vm_flags & VM_SHARED))
 835                         return 0;
 836         }
 837 /* is the page reasonable at all? */
 838         if (pte_page(from) >= high_memory)
 839                 return 0;
 840         if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
 841                 return 0;
 842 /* is the destination ok? */
 843         to_dir = pgd_offset(to_area->vm_mm,to_address);
 844 /* is there a page-directory at to? */
 845         if (pgd_none(*to_dir))
 846                 return 0;
 847         if (pgd_bad(*to_dir)) {
 848                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 849                 return 0;
 850         }
 851         to_middle = pmd_offset(to_dir, to_address);
 852 /* is there a mid-directory at to? */
 853         if (pmd_none(*to_middle))
 854                 return 0;
 855         if (pmd_bad(*to_middle)) {
 856                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 857                 return 0;
 858         }
 859         to_table = pte_offset(to_middle, to_address);
 860         to = *to_table;
 861         if (!pte_none(to))
 862                 return 0;
 863 /* do we copy? */
 864         if (newpage) {
 865                 /* if it's in the swap cache, it's dirty by implication */
 866                 /* so we can't use it if it's not from a shared mapping */
 867                 if (in_swap_cache(pte_page(from))) {
 868                         if (!(from_area->vm_flags & VM_SHARED))
 869                                 return 0;
 870                 }
 871                 copy_page(pte_page(from), newpage);
 872                 set_pte(to_table, mk_pte(newpage, to_area->vm_page_prot));
 873                 return 1;
 874         }
 875 /*
 876  * do a final swap-cache test before sharing them: if it's in the swap
 877  * cache, we have to remove it now, as we get two pointers to the same
 878  * physical page and the cache can't handle it. Mark the original dirty.
 879  *
 880  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 881  * with a dirty "from", the from-mapping is a shared map, so we can trust
 882  * the page contents to be up-to-date
 883  */
 884         if (in_swap_cache(pte_page(from))) {
 885                 if (!(from_area->vm_flags & VM_SHARED))
 886                         return 0;
 887                 set_pte(from_table, pte_mkdirty(from));
 888                 delete_from_swap_cache(pte_page(from));
 889         }
 890         mem_map[MAP_NR(pte_page(from))]++;
 891         set_pte(to_table, mk_pte(pte_page(from), to_area->vm_page_prot));
 892 /* Check if we need to do anything at all to the 'from' field */
 893         if (!pte_write(from))
 894                 return 1;
 895         if (from_area->vm_flags & VM_SHARED)
 896                 return 1;
 897 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 898         set_pte(from_table, pte_wrprotect(from));
 899         invalidate();
 900         return 1;
 901 }
 902 
 903 /*
 904  * share_page() tries to find a process that could share a page with
 905  * the current one.
 906  *
 907  * We first check if it is at all feasible by checking inode->i_count.
 908  * It should be >1 if there are other tasks sharing this inode.
 909  */
 910 static int share_page(struct vm_area_struct * area, unsigned long address,
 911         int write_access, unsigned long newpage)
 912 {
 913         struct inode * inode;
 914         unsigned long offset;
 915         unsigned long from_address;
 916         unsigned long give_page;
 917         struct vm_area_struct * mpnt;
 918 
 919         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 920                 return 0;
 921         /* do we need to copy or can we just share? */
 922         give_page = 0;
 923         if (write_access && !(area->vm_flags & VM_SHARED)) {
 924                 if (!newpage)
 925                         return 0;
 926                 give_page = newpage;
 927         }
 928         offset = address - area->vm_start + area->vm_offset;
 929         /* See if there is something in the VM we can share pages with. */
 930         /* Traverse the entire circular i_mmap list, except `area' itself. */
 931         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 932                 /* must be same inode */
 933                 if (mpnt->vm_inode != inode) {
 934                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 935                         break;  
 936                 }
 937                 /* offsets must be mutually page-aligned */
 938                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 939                         continue;
 940                 /* the other area must actually cover the wanted page.. */
 941                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 942                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 943                         continue;
 944                 /* .. NOW we can actually try to use the same physical page */
 945                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 946                         continue;
 947                 /* free newpage if we never used it.. */
 948                 if (give_page || !newpage)
 949                         return 1;
 950                 free_page(newpage);
 951                 return 1;
 952         }
 953         return 0;
 954 }
 955 
 956 /*
 957  * fill in an empty page-table if none exists.
 958  */
 959 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 960 {
 961         pgd_t *pgd;
 962         pmd_t *pmd;
 963         pte_t *pte;
 964 
 965         pgd = pgd_offset(tsk->mm, address);
 966         pmd = pmd_alloc(pgd, address);
 967         if (!pmd) {
 968                 oom(tsk);
 969                 return NULL;
 970         }
 971         pte = pte_alloc(pmd, address);
 972         if (!pte) {
 973                 oom(tsk);
 974                 return NULL;
 975         }
 976         return pte;
 977 }
 978 
 979 static inline void do_swap_page(struct task_struct * tsk, 
 980         struct vm_area_struct * vma, unsigned long address,
 981         pte_t * page_table, pte_t entry, int write_access)
 982 {
 983         pte_t page;
 984 
 985         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 986                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 987                 return;
 988         }
 989         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 990         if (pte_val(*page_table) != pte_val(entry)) {
 991                 free_page(pte_page(page));
 992                 return;
 993         }
 994         if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
 995                 page = pte_wrprotect(page);
 996         ++vma->vm_mm->rss;
 997         ++vma->vm_mm->maj_flt;
 998         set_pte(page_table, page);
 999         return;
1000 }
1001 
1002 /*
1003  * do_no_page() tries to create a new page mapping. It aggressively
1004  * tries to share with existing pages, but makes a separate copy if
1005  * the "write_access" parameter is true in order to avoid the next
1006  * page fault.
1007  */
1008 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
1009         unsigned long address, int write_access)
1010 {
1011         pte_t * page_table;
1012         pte_t entry;
1013         unsigned long page;
1014 
1015         page_table = get_empty_pgtable(tsk, address);
1016         if (!page_table)
1017                 return;
1018         entry = *page_table;
1019         if (pte_present(entry))
1020                 return;
1021         if (!pte_none(entry)) {
1022                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
1023                 return;
1024         }
1025         address &= PAGE_MASK;
1026         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1027                 ++vma->vm_mm->rss;
1028                 ++vma->vm_mm->min_flt;
1029                 get_empty_page(tsk, vma, page_table);
1030                 return;
1031         }
1032         page = __get_free_page(GFP_KERNEL);
1033         if (share_page(vma, address, write_access, page)) {
1034                 ++vma->vm_mm->min_flt;
1035                 ++vma->vm_mm->rss;
1036                 return;
1037         }
1038         if (!page) {
1039                 oom(tsk);
1040                 put_page(page_table, BAD_PAGE);
1041                 return;
1042         }
1043         ++vma->vm_mm->maj_flt;
1044         ++vma->vm_mm->rss;
1045         /*
1046          * The fourth argument is "no_share", which tells the low-level code
1047          * to copy, not share the page even if sharing is possible.  It's
1048          * essentially an early COW detection 
1049          */
1050         page = vma->vm_ops->nopage(vma, address, page,
1051                 write_access && !(vma->vm_flags & VM_SHARED));
1052         if (share_page(vma, address, write_access, 0)) {
1053                 free_page(page);
1054                 return;
1055         }
1056         /*
1057          * This silly early PAGE_DIRTY setting removes a race
1058          * due to the bad i386 page protection. But it's valid
1059          * for other architectures too.
1060          *
1061          * Note that if write_access is true, we either now have
1062  * an exclusive copy of the page, or this is a shared mapping,
1063          * so we can make it writable and dirty to avoid having to
1064          * handle that later.
1065          */
1066         entry = mk_pte(page, vma->vm_page_prot);
1067         if (write_access) {
1068                 entry = pte_mkwrite(pte_mkdirty(entry));
1069         } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
1070                 entry = pte_wrprotect(entry);
1071         put_page(page_table, entry);
1072 }
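
/*
 * A minimal sketch of the vm_ops->nopage hook that do_no_page() calls
 * above, using the signature implied by the call site: the hook is given
 * a freshly allocated page, fills it (a real implementation would read
 * the data for vma->vm_offset + (address - vma->vm_start)), and returns
 * the physical address to map. The zero-fill body is an illustrative
 * assumption.
 */
#if 0
static unsigned long sketch_nopage(struct vm_area_struct * vma,
        unsigned long address, unsigned long page, int no_share)
{
        memset((void *) page, 0, PAGE_SIZE);    /* stand-in for real data */
        return page;
}
#endif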
1073 
1074 /*
1075  * The above separate functions for the no-page and wp-page
1076  * cases will go away (they mostly do the same thing anyway),
1077  * and we'll instead use only a general "handle_mm_fault()".
1078  *
1079  * These routines also need to handle stuff like marking pages dirty
1080  * and/or accessed for architectures that don't do it in hardware (most
1081  * RISC architectures).  The early dirtying is also good on the i386.
1082  *
1083  * There is also a hook called "update_mmu_cache()" that architectures
1084  * with external mmu caches can use to update those (ie the Sparc or
1085  * PowerPC hashed page tables that act as extended TLBs).
1086  */
1087 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1088         int write_access, pte_t * pte)
1089 {
1090         if (!pte_present(*pte)) {
1091                 do_no_page(current, vma, address, write_access);
1092                 return;
1093         }
1094         set_pte(pte, pte_mkyoung(*pte));
1095         if (!write_access)
1096                 return;
1097         if (pte_write(*pte)) {
1098                 set_pte(pte, pte_mkdirty(*pte));
1099                 return;
1100         }
1101         do_wp_page(current, vma, address, write_access);
1102 }
1103 
1104 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1105         int write_access)
1106 {
1107         pgd_t *pgd;
1108         pmd_t *pmd;
1109         pte_t *pte;
1110 
1111         pgd = pgd_offset(vma->vm_mm, address);
1112         pmd = pmd_alloc(pgd, address);
1113         if (!pmd)
1114                 goto no_memory;
1115         pte = pte_alloc(pmd, address);
1116         if (!pte)
1117                 goto no_memory;
1118         handle_pte_fault(vma, address, write_access, pte);
1119         update_mmu_cache(vma, address, *pte);
1120         return;
1121 no_memory:
1122         oom(current);
1123 }
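
/*
 * A hedged sketch of the intended caller of handle_mm_fault(): an
 * architecture's page-fault handler that has decoded the faulting address
 * and the write flag and located the vma. The handler name and the
 * reduced error handling are illustrative assumptions about the arch
 * code, not part of this file.
 */
#if 0
static void sketch_do_page_fault(unsigned long address, int write)
{
        struct vm_area_struct * vma;

        vma = find_vma(current, address);
        if (!vma || (vma->vm_start > address && !(vma->vm_flags & VM_GROWSDOWN)))
                return;         /* a real handler would raise SIGSEGV */
        if (write && !(vma->vm_flags & VM_WRITE))
                return;         /* likewise for a protection error */
        handle_mm_fault(vma, address, write);
}
#endif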
