root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. oom
  2. free_one_pte
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. clone_page_tables
  8. copy_one_pte
  9. copy_one_pmd
  10. copy_one_pgd
  11. copy_page_tables
  12. forget_pte
  13. unmap_pte_range
  14. unmap_pmd_range
  15. unmap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault
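
Most of these routines walk the same three-level page-table tree (page
directory, page middle directory, page table). As a minimal sketch of that
walk, assuming only the headers this file already includes and a hypothetical
helper name:

	static inline pte_t * walk_page_tables(struct task_struct * tsk, unsigned long address)
	{
		pgd_t * pgd = pgd_offset(tsk, address);	/* top-level entry for 'address' */
		pmd_t * pmd = pmd_alloc(pgd, address);	/* middle level, allocated on demand */

		if (!pmd)
			return NULL;			/* out of memory */
		return pte_alloc(pmd, address);		/* lowest level, allocated on demand */
	}

get_empty_pgtable() and handle_mm_fault() below follow exactly this pattern;
the unmap/zeromap/remap helpers walk the same levels using the *_offset() and
*_alloc() variants.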

   1 #define THREE_LEVEL
   2 /*
   3  *  linux/mm/memory.c
   4  *
   5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6  */
   7 
   8 /*
   9  * demand-loading started 01.12.91 - seems it is high on the list of
  10  * things wanted, and it should be easy to implement. - Linus
  11  */
  12 
  13 /*
  14  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  15  * pages started 02.12.91, seems to work. - Linus.
  16  *
  17  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  18  * would have taken more than the 6M I have free, but it worked well as
  19  * far as I could see.
  20  *
  21  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  22  */
  23 
  24 /*
  25  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  26  * thought has to go into this. Oh, well..
  27  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  28  *              Found it. Everything seems to work now.
  29  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  30  */
  31 
  32 /*
  33  * 05.04.94  -  Multi-page memory management added for v1.1.
  34  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  35  */
  36 
  37 #include <linux/config.h>
  38 #include <linux/signal.h>
  39 #include <linux/sched.h>
  40 #include <linux/head.h>
  41 #include <linux/kernel.h>
  42 #include <linux/errno.h>
  43 #include <linux/string.h>
  44 #include <linux/types.h>
  45 #include <linux/ptrace.h>
  46 #include <linux/mman.h>
  47 #include <linux/mm.h>
  48 
  49 #include <asm/system.h>
  50 #include <asm/segment.h>
  51 #include <asm/pgtable.h>
  52 
  53 unsigned long high_memory = 0;
  54 
  55 /*
  56  * The free_area_list arrays point to the queue heads of the free areas
  57  * of different sizes
  58  */
  59 int nr_swap_pages = 0;
  60 int nr_free_pages = 0;
  61 struct mem_list free_area_list[NR_MEM_LISTS];
  62 unsigned char * free_area_map[NR_MEM_LISTS];
  63 
  64 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  65 
  66 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  67 
  68 mem_map_t * mem_map = NULL;
  69 
  70 /*
  71  * oom() prints a message (so that the user knows why the process died),
  72  * and gives the process an untrappable SIGKILL.
  73  */
  74 void oom(struct task_struct * task)
  75 {
  76         printk("\nOut of memory for %s.\n", current->comm);
  77         task->sigaction[SIGKILL-1].sa_handler = NULL;
  78         task->blocked &= ~(1<<(SIGKILL-1));
  79         send_sig(SIGKILL,task,1);
  80 }
  81 
  82 static inline void free_one_pte(pte_t * page_table)
  83 {
  84         pte_t page = *page_table;
  85 
  86         if (pte_none(page))
  87                 return;
  88         pte_clear(page_table);
  89         if (!pte_present(page)) {
  90                 swap_free(pte_val(page));
  91                 return;
  92         }
  93         free_page(pte_page(page));
  94         return;
  95 }
  96 
  97 static inline void free_one_pmd(pmd_t * dir)
  98 {
  99         int j;
 100         pte_t * pte;
 101 
 102         if (pmd_none(*dir))
 103                 return;
 104         if (pmd_bad(*dir)) {
 105                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 106                 pmd_clear(dir);
 107                 return;
 108         }
 109         pte = pte_offset(dir, 0);
 110         pmd_clear(dir);
 111         if (pte_inuse(pte)) {
 112                 pte_free(pte);
 113                 return;
 114         }
 115         for (j = 0; j < PTRS_PER_PTE ; j++)
 116                 free_one_pte(pte+j);
 117         pte_free(pte);
 118 }
 119 
 120 static inline void free_one_pgd(pgd_t * dir)
 121 {
 122         int j;
 123         pmd_t * pmd;
 124 
 125         if (pgd_none(*dir))
 126                 return;
 127         if (pgd_bad(*dir)) {
 128                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 129                 pgd_clear(dir);
 130                 return;
 131         }
 132         pmd = pmd_offset(dir, 0);
 133         pgd_clear(dir);
 134         if (pmd_inuse(pmd)) {
 135                 pmd_free(pmd);
 136                 return;
 137         }
 138         for (j = 0; j < PTRS_PER_PMD ; j++)
 139                 free_one_pmd(pmd+j);
 140         pmd_free(pmd);
 141 }
 142         
 143 
 144 /*
 145  * This function clears all user-level page tables of a process - this
 146  * is needed by execve(), so that old pages aren't in the way. Note that
 147  * unlike 'free_page_tables()', this function still leaves a valid
 148  * page-table-tree in memory: it just removes the user pages. The two
 149  * functions are similar, but there is a fundamental difference.
 150  */
 151 void clear_page_tables(struct task_struct * tsk)
 152 {
 153         int i;
 154         pgd_t * page_dir;
 155 
 156         if (!tsk)
 157                 return;
 158         if (tsk == task[0])
 159                 panic("task[0] (swapper) doesn't support exec()\n");
 160         page_dir = pgd_offset(tsk, 0);
 161         if (!page_dir || page_dir == swapper_pg_dir) {
 162                 printk("Trying to clear kernel page-directory: not good\n");
 163                 return;
 164         }
 165         if (pgd_inuse(page_dir)) {
 166                 pgd_t * new_pg;
 167 
 168                 if (!(new_pg = pgd_alloc())) {
 169                         oom(tsk);
 170                         return;
 171                 }
 172                 for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 173                         new_pg[i] = page_dir[i];
 174                 SET_PAGE_DIR(tsk, new_pg);
 175                 pgd_free(page_dir);
 176                 return;
 177         }
 178         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 179                 free_one_pgd(page_dir + i);
 180         invalidate();
 181         return;
 182 }
 183 
 184 /*
 185  * This function frees up all page tables of a process when it exits.
 186  */
 187 void free_page_tables(struct task_struct * tsk)
 188 {
 189         int i;
 190         pgd_t * page_dir;
 191 
 192         if (!tsk)
 193                 return;
 194         if (tsk == task[0]) {
 195                 printk("task[0] (swapper) killed: unable to recover\n");
 196                 panic("Trying to free up swapper memory space");
 197         }
 198         page_dir = pgd_offset(tsk, 0);
 199         if (!page_dir || page_dir == swapper_pg_dir) {
 200                 printk("Trying to free kernel page-directory: not good\n");
 201                 return;
 202         }
 203         SET_PAGE_DIR(tsk, swapper_pg_dir);
 204         if (pgd_inuse(page_dir)) {
 205                 pgd_free(page_dir);
 206                 return;
 207         }
 208         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 209                 free_one_pgd(page_dir + i);
 210         pgd_free(page_dir);
 211         invalidate();
 212 }
 213 
 214 /*
 215  * clone_page_tables() clones the page table for a process - both
 216  * processes will have the exact same pages in memory. There are
 217  * probably races in the memory management with cloning, but we'll
 218  * see..
 219  */
 220 int clone_page_tables(struct task_struct * tsk)
 221 {
 222         pgd_t * pg_dir;
 223 
 224         pg_dir = pgd_offset(current, 0);
 225         pgd_reuse(pg_dir);
 226         SET_PAGE_DIR(tsk, pg_dir);
 227         return 0;
 228 }
 229 
 230 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 231 {
 232         pte_t pte = *old_pte;
 233 
 234         if (pte_none(pte))
 235                 return;
 236         if (!pte_present(pte)) {
 237                 swap_duplicate(pte_val(pte));
 238                 *new_pte = pte;
 239                 return;
 240         }
 241         if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
 242                 *new_pte = pte;
 243                 return;
 244         }
 245         if (pte_cow(pte))
 246                 pte = pte_wrprotect(pte);
 247         if (delete_from_swap_cache(pte_page(pte)))
 248                 pte = pte_mkdirty(pte);
 249         *new_pte = pte_mkold(pte);
 250         *old_pte = pte;
 251         mem_map[MAP_NR(pte_page(pte))]++;
 252 }
 253 
 254 static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd)
 255 {
 256         int j;
 257         pte_t *old_pte, *new_pte;
 258 
 259         if (pmd_none(*old_pmd))
 260                 return 0;
 261         if (pmd_bad(*old_pmd)) {
 262                 printk("copy_one_pmd: bad page table: probable memory corruption\n");
 263                 pmd_clear(old_pmd);
 264                 return 0;
 265         }
 266         old_pte = pte_offset(old_pmd, 0);
 267         if (pte_inuse(old_pte)) {
 268                 pte_reuse(old_pte);
 269                 *new_pmd = *old_pmd;
 270                 return 0;
 271         }
 272         new_pte = pte_alloc(new_pmd, 0);
 273         if (!new_pte)
 274                 return -ENOMEM;
 275         for (j = 0 ; j < PTRS_PER_PTE ; j++) {
 276                 copy_one_pte(old_pte, new_pte);
 277                 old_pte++;
 278                 new_pte++;
 279         }
 280         return 0;
 281 }
 282 
 283 static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd)
 284 {
 285         int j;
 286         pmd_t *old_pmd, *new_pmd;
 287 
 288         if (pgd_none(*old_pgd))
 289                 return 0;
 290         if (pgd_bad(*old_pgd)) {
 291                 printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd));
 292                 pgd_clear(old_pgd);
 293                 return 0;
 294         }
 295         old_pmd = pmd_offset(old_pgd, 0);
 296         if (pmd_inuse(old_pmd)) {
 297                 pmd_reuse(old_pmd);
 298                 *new_pgd = *old_pgd;
 299                 return 0;
 300         }
 301         new_pmd = pmd_alloc(new_pgd, 0);
 302         if (!new_pmd)
 303                 return -ENOMEM;
 304         for (j = 0 ; j < PTRS_PER_PMD ; j++) {
 305                 int error = copy_one_pmd(old_pmd, new_pmd);
 306                 if (error)
 307                         return error;
 308                 old_pmd++;
 309                 new_pmd++;
 310         }
 311         return 0;
 312 }
 313 
 314 /*
 315  * copy_page_tables() just copies the whole process memory range:
 316  * note the special handling of RESERVED (ie kernel) pages, which
 317  * means that they are always shared by all processes.
 318  */
 319 int copy_page_tables(struct task_struct * tsk)
 320 {
 321         int i;
 322         pgd_t *old_pgd;
 323         pgd_t *new_pgd;
 324 
 325         new_pgd = pgd_alloc();
 326         if (!new_pgd)
 327                 return -ENOMEM;
 328         SET_PAGE_DIR(tsk, new_pgd);
 329         old_pgd = pgd_offset(current, 0);
 330         for (i = 0 ; i < PTRS_PER_PGD ; i++) {
 331                 int errno = copy_one_pgd(old_pgd, new_pgd);
 332                 if (errno) {
 333                         free_page_tables(tsk);
 334                         invalidate();
 335                         return errno;
 336                 }
 337                 old_pgd++;
 338                 new_pgd++;
 339         }
 340         invalidate();
 341         return 0;
 342 }
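
Taken together, clone_page_tables() and copy_page_tables() are the two ways a
new task can inherit its parent's address space: share the parent's page
directory outright, or build a private copy in which writable pages are
write-protected by copy_one_pte() so that do_wp_page() can copy them lazily on
the first write. A minimal sketch of a fork-style caller (the variable names
are illustrative, not taken from this file):

	/* 'child' is the freshly created task_struct */
	int error = share_vm ? clone_page_tables(child) : copy_page_tables(child);
	if (error)
		return error;	/* copy_page_tables() returns -ENOMEM on failure */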
 343 
 344 static inline void forget_pte(pte_t page)
 345 {
 346         if (pte_none(page))
 347                 return;
 348         if (pte_present(page)) {
 349                 free_page(pte_page(page));
 350                 if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED)
 351                         return;
 352                 if (current->mm->rss <= 0)
 353                         return;
 354                 current->mm->rss--;
 355                 return;
 356         }
 357         swap_free(pte_val(page));
 358 }
 359 
 360 static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 361 {
 362         pte_t * pte;
 363         unsigned long end;
 364 
 365         if (pmd_none(*pmd))
 366                 return;
 367         if (pmd_bad(*pmd)) {
 368                 printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 369                 pmd_clear(pmd);
 370                 return;
 371         }
 372         pte = pte_offset(pmd, address);
 373         address &= ~PMD_MASK;
 374         end = address + size;
 375         if (end >= PMD_SIZE)
 376                 end = PMD_SIZE;
 377         do {
 378                 pte_t page = *pte;
 379                 pte_clear(pte);
 380                 forget_pte(page);
 381                 address += PAGE_SIZE;
 382                 pte++;
 383         } while (address < end);
 384 }
 385 
 386 static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 387 {
 388         pmd_t * pmd;
 389         unsigned long end;
 390 
 391         if (pgd_none(*dir))
 392                 return;
 393         if (pgd_bad(*dir)) {
 394                 printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 395                 pgd_clear(dir);
 396                 return;
 397         }
 398         pmd = pmd_offset(dir, address);
 399         address &= ~PGDIR_MASK;
 400         end = address + size;
 401         if (end > PGDIR_SIZE)
 402                 end = PGDIR_SIZE;
 403         do {
 404                 unmap_pte_range(pmd, address, end - address);
 405                 address = (address + PMD_SIZE) & PMD_MASK; 
 406                 pmd++;
 407         } while (address < end);
 408 }
 409 
 410 /*
 411  * a more complete version of free_page_tables which performs with page
 412  * granularity.
 413  */
 414 int unmap_page_range(unsigned long address, unsigned long size)
 415 {
 416         pgd_t * dir;
 417         unsigned long end = address + size;
 418 
 419         dir = pgd_offset(current, address);
 420         while (address < end) {
 421                 unmap_pmd_range(dir, address, end - address);
 422                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 423                 dir++;
 424         }
 425         invalidate();
 426         return 0;
 427 }
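
unmap_page_range() is the byte-range teardown primitive that an munmap-style
caller uses when only part of an address space should go away, rather than the
whole page-table tree. A minimal sketch, assuming 'addr' and 'len' are already
page-aligned:

	/* drop every mapping in [addr, addr+len) for the current process */
	unmap_page_range(addr, len);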
 428 
 429 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 430 {
 431         unsigned long end;
 432 
 433         address &= ~PMD_MASK;
 434         end = address + size;
 435         if (end > PMD_SIZE)
 436                 end = PMD_SIZE;
 437         do {
 438                 pte_t oldpage = *pte;
 439                 *pte = zero_pte;
 440                 forget_pte(oldpage);
 441                 address += PAGE_SIZE;
 442                 pte++;
 443         } while (address < end);
 444 }
 445 
 446 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 447 {
 448         unsigned long end;
 449 
 450         address &= ~PGDIR_MASK;
 451         end = address + size;
 452         if (end > PGDIR_SIZE)
 453                 end = PGDIR_SIZE;
 454         do {
 455                 pte_t * pte = pte_alloc(pmd, address);
 456                 if (!pte)
 457                         return -ENOMEM;
 458                 zeromap_pte_range(pte, address, end - address, zero_pte);
 459                 address = (address + PMD_SIZE) & PMD_MASK;
 460                 pmd++;
 461         } while (address < end);
 462         return 0;
 463 }
 464 
 465 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 466 {
 467         int error = 0;
 468         pgd_t * dir;
 469         unsigned long end = address + size;
 470         pte_t zero_pte;
 471 
 472         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 473         dir = pgd_offset(current, address);
 474         while (address < end) {
 475                 pmd_t *pmd = pmd_alloc(dir, address);
 476                 error = -ENOMEM;
 477                 if (!pmd)
 478                         break;
 479                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 480                 if (error)
 481                         break;
 482                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 483                 dir++;
 484         }
 485         invalidate();
 486         return error;
 487 }
 488 
 489 /*
 490  * maps a range of physical memory into the requested pages. the old
  491  * mappings are removed. any references to nonexistent pages result
 492  * in null mappings (currently treated as "copy-on-access")
 493  */
 494 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 495         unsigned long offset, pgprot_t prot)
 496 {
 497         unsigned long end;
 498 
 499         address &= ~PMD_MASK;
 500         end = address + size;
 501         if (end > PMD_SIZE)
 502                 end = PMD_SIZE;
 503         do {
 504                 pte_t oldpage = *pte;
 505                 pte_clear(pte);
 506                 if (offset >= high_memory || (mem_map[MAP_NR(offset)] & MAP_PAGE_RESERVED))
 507                         *pte = mk_pte(offset, prot);
 508                 else if (mem_map[MAP_NR(offset)]) {
 509                         mem_map[MAP_NR(offset)]++;
 510                         *pte = mk_pte(offset, prot);
 511                 }
 512                 forget_pte(oldpage);
 513                 address += PAGE_SIZE;
 514                 offset += PAGE_SIZE;
 515                 pte++;
 516         } while (address < end);
 517 }
 518 
 519 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 520         unsigned long offset, pgprot_t prot)
 521 {
 522         unsigned long end;
 523 
 524         address &= ~PGDIR_MASK;
 525         end = address + size;
 526         if (end > PGDIR_SIZE)
 527                 end = PGDIR_SIZE;
 528         offset -= address;
 529         do {
 530                 pte_t * pte = pte_alloc(pmd, address);
 531                 if (!pte)
 532                         return -ENOMEM;
 533                 remap_pte_range(pte, address, end - address, address + offset, prot);
 534                 address = (address + PMD_SIZE) & PMD_MASK;
 535                 pmd++;
 536         } while (address < end);
 537         return 0;
 538 }
 539 
 540 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 541 {
 542         int error = 0;
 543         pgd_t * dir;
 544         unsigned long end = from + size;
 545 
 546         offset -= from;
 547         dir = pgd_offset(current, from);
 548         while (from < end) {
 549                 pmd_t *pmd = pmd_alloc(dir, from);
 550                 error = -ENOMEM;
 551                 if (!pmd)
 552                         break;
 553                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 554                 if (error)
 555                         break;
 556                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 557                 dir++;
 558         }
 559         invalidate();
 560         return error;
 561 }
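
remap_page_range() is the building block a device driver's mmap() handler uses
to map device or physical memory straight into user space. A minimal sketch,
loosely modeled on a character-device mmap handler of this era (the function
name is illustrative and error handling is reduced to the essentials):

	static int sample_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
	{
		/* map the physical range starting at vm_offset into the
		 * caller's requested virtual range, with the VM's protections */
		if (remap_page_range(vma->vm_start, vma->vm_offset,
				     vma->vm_end - vma->vm_start, vma->vm_page_prot))
			return -EAGAIN;
		return 0;
	}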
 562 
 563 /*
 564  * sanity-check function..
 565  */
 566 static void put_page(pte_t * page_table, pte_t pte)
 567 {
 568         if (!pte_none(*page_table)) {
 569                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 570                 free_page(pte_page(pte));
 571                 return;
 572         }
 573 /* no need for invalidate */
 574         *page_table = pte;
 575 }
 576 
 577 /*
 578  * This routine is used to map in a page into an address space: needed by
 579  * execve() for the initial stack and environment pages.
 580  */
 581 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 582 {
 583         pgd_t * pgd;
 584         pmd_t * pmd;
 585         pte_t * pte;
 586 
 587         if (page >= high_memory)
 588                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 589         if (mem_map[MAP_NR(page)] != 1)
 590                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 591         pgd = pgd_offset(tsk,address);
 592         pmd = pmd_alloc(pgd, address);
 593         if (!pmd) {
 594                 free_page(page);
 595                 oom(tsk);
 596                 return 0;
 597         }
 598         pte = pte_alloc(pmd, address);
 599         if (!pte) {
 600                 free_page(page);
 601                 oom(tsk);
 602                 return 0;
 603         }
 604         if (!pte_none(*pte)) {
 605                 printk("put_dirty_page: page already exists\n");
 606                 pte_clear(pte);
 607                 invalidate();
 608         }
 609         *pte = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)));
 610 /* no need for invalidate */
 611         return page;
 612 }
 613 
 614 /*
 615  * This routine handles present pages, when users try to write
 616  * to a shared page. It is done by copying the page to a new address
 617  * and decrementing the shared-page counter for the old page.
 618  *
 619  * Goto-purists beware: the only reason for goto's here is that it results
 620  * in better assembly code.. The "default" path will see no jumps at all.
 621  *
 622  * Note that this routine assumes that the protection checks have been
 623  * done by the caller (the low-level page fault routine in most cases).
 624  * Thus we can safely just mark it writable once we've done any necessary
 625  * COW.
 626  *
 627  * We also mark the page dirty at this point even though the page will
 628  * change only once the write actually happens. This avoids a few races,
 629  * and potentially makes it more efficient.
 630  */
 631 void do_wp_page(struct vm_area_struct * vma, unsigned long address,
 632         int write_access)
 633 {
 634         pgd_t *page_dir;
 635         pmd_t *page_middle;
 636         pte_t *page_table, pte;
 637         unsigned long old_page, new_page;
 638 
 639         new_page = __get_free_page(GFP_KERNEL);
 640         page_dir = pgd_offset(vma->vm_task,address);
 641         if (pgd_none(*page_dir))
 642                 goto end_wp_page;
 643         if (pgd_bad(*page_dir))
 644                 goto bad_wp_pagedir;
 645         page_middle = pmd_offset(page_dir, address);
 646         if (pmd_none(*page_middle))
 647                 goto end_wp_page;
 648         if (pmd_bad(*page_middle))
 649                 goto bad_wp_pagemiddle;
 650         page_table = pte_offset(page_middle, address);
 651         pte = *page_table;
 652         if (!pte_present(pte))
 653                 goto end_wp_page;
 654         if (pte_write(pte))
 655                 goto end_wp_page;
 656         old_page = pte_page(pte);
 657         if (old_page >= high_memory)
 658                 goto bad_wp_page;
 659         vma->vm_task->mm->min_flt++;
 660         /*
 661          * Do we need to copy?
 662          */
 663         if (mem_map[MAP_NR(old_page)] != 1) {
 664                 if (new_page) {
 665                         if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
 666                                 ++vma->vm_task->mm->rss;
 667                         copy_page(old_page,new_page);
 668                         *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
 669                         free_page(old_page);
 670                         invalidate();
 671                         return;
 672                 }
 673                 *page_table = BAD_PAGE;
 674                 free_page(old_page);
 675                 oom(vma->vm_task);
 676                 invalidate();
 677                 return;
 678         }
 679         *page_table = pte_mkdirty(pte_mkwrite(pte));
 680         invalidate();
 681         if (new_page)
 682                 free_page(new_page);
 683         return;
 684 bad_wp_page:
 685         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 686         send_sig(SIGKILL, vma->vm_task, 1);
 687         goto end_wp_page;
 688 bad_wp_pagemiddle:
 689         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 690         send_sig(SIGKILL, vma->vm_task, 1);
 691         goto end_wp_page;
 692 bad_wp_pagedir:
 693         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 694         send_sig(SIGKILL, vma->vm_task, 1);
 695 end_wp_page:
 696         if (new_page)
 697                 free_page(new_page);
 698         return;
 699 }
 700 
 701 /*
 702  * Ugly, ugly, but the goto's result in better assembly..
 703  */
 704 int verify_area(int type, const void * addr, unsigned long size)
 705 {
 706         struct vm_area_struct * vma;
 707         unsigned long start = (unsigned long) addr;
 708 
 709         /* If the current user space is mapped to kernel space (for the
 710          * case where we use a fake user buffer with get_fs/set_fs()) we
 711          * don't expect to find the address in the user vm map.
 712          */
 713         if (get_fs() == get_ds())
 714                 return 0;
 715 
 716         vma = find_vma(current, start);
 717         if (!vma)
 718                 goto bad_area;
 719         if (vma->vm_start <= start)
 720                 goto good_area;
 721         if (!(vma->vm_flags & VM_GROWSDOWN))
 722                 goto bad_area;
 723         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 724                 goto bad_area;
 725 
 726 good_area:
 727         if (type == VERIFY_WRITE)
 728                 goto check_write;
 729         for (;;) {
 730                 struct vm_area_struct * next;
 731                 if (!(vma->vm_flags & VM_READ))
 732                         goto bad_area;
 733                 if (vma->vm_end - start >= size)
 734                         return 0;
 735                 next = vma->vm_next;
 736                 if (!next || vma->vm_end != next->vm_start)
 737                         goto bad_area;
 738                 vma = next;
 739         }
 740 
 741 check_write:
 742         if (!(vma->vm_flags & VM_WRITE))
 743                 goto bad_area;
 744         if (!wp_works_ok)
 745                 goto check_wp_fault_by_hand;
 746         for (;;) {
 747                 if (vma->vm_end - start >= size)
 748                         break;
 749                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 750                         goto bad_area;
 751                 vma = vma->vm_next;
 752                 if (!(vma->vm_flags & VM_WRITE))
 753                         goto bad_area;
 754         }
 755         return 0;
 756 
 757 check_wp_fault_by_hand:
 758         size--;
 759         size += start & ~PAGE_MASK;
 760         size >>= PAGE_SHIFT;
 761         start &= PAGE_MASK;
 762 
 763         for (;;) {
 764                 do_wp_page(vma, start, 1);
 765                 if (!size)
 766                         break;
 767                 size--;
 768                 start += PAGE_SIZE;
 769                 if (start < vma->vm_end)
 770                         continue;
 771                 vma = vma->vm_next;
 772                 if (!vma || vma->vm_start != start)
 773                         goto bad_area;
 774                 if (!(vma->vm_flags & VM_WRITE))
  775                         goto bad_area;
 776         }
 777         return 0;
 778 
 779 bad_area:
 780         return -EFAULT;
 781 }
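
verify_area() is what system-call code calls before touching a user-space
buffer with the segment-copy helpers from <asm/segment.h>. A minimal sketch of
a read-style call path (the buffer and length names are illustrative):

	int error = verify_area(VERIFY_WRITE, buf, count);
	if (error)
		return error;			/* -EFAULT for a bad user range */
	memcpy_tofs(buf, kernel_data, count);	/* copy kernel data out to user space */
	return count;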
 782 
 783 static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
 784 {
 785         unsigned long tmp;
 786 
 787         if (!(tmp = get_free_page(GFP_KERNEL))) {
 788                 oom(vma->vm_task);
 789                 put_page(page_table, BAD_PAGE);
 790                 return;
 791         }
 792         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 793 }
 794 
 795 /*
 796  * try_to_share() checks the page at address "address" in the task "p",
 797  * to see if it exists, and if it is clean. If so, share it with the current
 798  * task.
 799  *
 800  * NOTE! This assumes we have checked that p != current, and that they
 801  * share the same inode and can generally otherwise be shared.
 802  */
 803 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 804         unsigned long from_address, struct vm_area_struct * from_area,
 805         unsigned long newpage)
 806 {
 807         pgd_t * from_dir, * to_dir;
 808         pmd_t * from_middle, * to_middle;
 809         pte_t * from_table, * to_table;
 810         pte_t from, to;
 811 
 812         from_dir = pgd_offset(from_area->vm_task,from_address);
 813 /* is there a page-directory at from? */
 814         if (pgd_none(*from_dir))
 815                 return 0;
 816         if (pgd_bad(*from_dir)) {
 817                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 818                 pgd_clear(from_dir);
 819                 return 0;
 820         }
 821         from_middle = pmd_offset(from_dir, from_address);
 822 /* is there a mid-directory at from? */
 823         if (pmd_none(*from_middle))
 824                 return 0;
 825         if (pmd_bad(*from_middle)) {
 826                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 827                 pmd_clear(from_middle);
 828                 return 0;
 829         }
 830         from_table = pte_offset(from_middle, from_address);
 831         from = *from_table;
 832 /* is the page present? */
 833         if (!pte_present(from))
 834                 return 0;
 835 /* if it is dirty it must be from a shared mapping to be shared */
 836         if (pte_dirty(from)) {
 837                 if (!(from_area->vm_flags & VM_SHARED))
 838                         return 0;
 839                 if (pte_write(from)) {
 840                         printk("nonwritable, but dirty, shared page\n");
 841                         return 0;
 842                 }
 843         }
 844 /* is the page reasonable at all? */
 845         if (pte_page(from) >= high_memory)
 846                 return 0;
 847         if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
 848                 return 0;
 849 /* is the destination ok? */
 850         to_dir = pgd_offset(to_area->vm_task,to_address);
 851 /* is there a page-directory at to? */
 852         if (pgd_none(*to_dir))
 853                 return 0;
 854         if (pgd_bad(*to_dir)) {
 855                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 856                 return 0;
 857         }
 858         to_middle = pmd_offset(to_dir, to_address);
 859 /* is there a mid-directory at to? */
 860         if (pmd_none(*to_middle))
 861                 return 0;
 862         if (pmd_bad(*to_middle)) {
 863                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 864                 return 0;
 865         }
 866         to_table = pte_offset(to_middle, to_address);
 867         to = *to_table;
 868         if (!pte_none(to))
 869                 return 0;
 870 /* do we copy? */
 871         if (newpage) {
 872                 /* if it's in the swap cache, it's dirty by implication */
 873                 /* so we can't use it if it's not from a shared mapping */
 874                 if (in_swap_cache(pte_page(from))) {
 875                         if (!(from_area->vm_flags & VM_SHARED))
 876                                 return 0;
 877                         if (!pte_write(from)) {
 878                                 printk("nonwritable, but dirty, shared page\n");
 879                                 return 0;
 880                         }
 881                 }
 882                 copy_page(pte_page(from), newpage);
 883                 *to_table = mk_pte(newpage, to_area->vm_page_prot);
 884                 return 1;
 885         }
 886 /*
 887  * do a final swap-cache test before sharing them: if it's in the swap
 888  * cache, we have to remove it now, as we get two pointers to the same
 889  * physical page and the cache can't handle it. Mark the original dirty.
 890  *
 891  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 892  * with a dirty "from", the from-mapping is a shared map, so we can trust
 893  * the page contents to be up-to-date
 894  */
 895         if (in_swap_cache(pte_page(from))) {
 896                 if (!(from_area->vm_flags & VM_SHARED))
 897                         return 0;
 898                 *from_table = pte_mkdirty(from);
 899                 delete_from_swap_cache(pte_page(from));
 900         }
 901         mem_map[MAP_NR(pte_page(from))]++;
 902         *to_table = mk_pte(pte_page(from), to_area->vm_page_prot);
 903 /* Check if we need to do anything at all to the 'from' field */
 904         if (!pte_write(from))
 905                 return 1;
 906         if (from_area->vm_flags & VM_SHARED)
 907                 return 1;
 908 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 909         *from_table = pte_wrprotect(from);
 910         invalidate();
 911         return 1;
 912 }
 913 
 914 /*
 915  * share_page() tries to find a process that could share a page with
 916  * the current one.
 917  *
 918  * We first check if it is at all feasible by checking inode->i_count.
 919  * It should be >1 if there are other tasks sharing this inode.
 920  */
 921 static int share_page(struct vm_area_struct * area, unsigned long address,
 922         int write_access, unsigned long newpage)
 923 {
 924         struct inode * inode;
 925         unsigned long offset;
 926         unsigned long from_address;
 927         unsigned long give_page;
 928         struct vm_area_struct * mpnt;
 929 
 930         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 931                 return 0;
 932         /* do we need to copy or can we just share? */
 933         give_page = 0;
 934         if (write_access && !(area->vm_flags & VM_SHARED)) {
 935                 if (!newpage)
 936                         return 0;
 937                 give_page = newpage;
 938         }
 939         offset = address - area->vm_start + area->vm_offset;
 940         /* See if there is something in the VM we can share pages with. */
 941         /* Traverse the entire circular i_mmap list, except `area' itself. */
 942         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 943                 /* must be same inode */
 944                 if (mpnt->vm_inode != inode) {
 945                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 946                         break;  
 947                 }
 948                 /* offsets must be mutually page-aligned */
 949                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 950                         continue;
 951                 /* the other area must actually cover the wanted page.. */
 952                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 953                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 954                         continue;
 955                 /* .. NOW we can actually try to use the same physical page */
 956                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 957                         continue;
 958                 /* free newpage if we never used it.. */
 959                 if (give_page || !newpage)
 960                         return 1;
 961                 free_page(newpage);
 962                 return 1;
 963         }
 964         return 0;
 965 }
 966 
 967 /*
 968  * fill in an empty page-table if none exists.
 969  */
 970 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 971 {
 972         pgd_t *pgd;
 973         pmd_t *pmd;
 974         pte_t *pte;
 975 
 976         pgd = pgd_offset(tsk, address);
 977         pmd = pmd_alloc(pgd, address);
 978         if (!pmd) {
 979                 oom(tsk);
 980                 return NULL;
 981         }
 982         pte = pte_alloc(pmd, address);
 983         if (!pte) {
 984                 oom(tsk);
 985                 return NULL;
 986         }
 987         return pte;
 988 }
 989 
 990 static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address,
 991         pte_t * page_table, pte_t entry, int write_access)
 992 {
 993         pte_t page;
 994 
 995         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 996                 swap_in(vma, page_table, pte_val(entry), write_access);
 997                 return;
 998         }
 999         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
1000         if (pte_val(*page_table) != pte_val(entry)) {
1001                 free_page(pte_page(page));
1002                 return;
1003         }
1004         if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
1005                 page = pte_wrprotect(page);
1006         ++vma->vm_task->mm->rss;
1007         ++vma->vm_task->mm->maj_flt;
1008         *page_table = page;
1009         return;
1010 }
1011 
1012 /*
1013  * do_no_page() tries to create a new page mapping. It aggressively
1014  * tries to share with existing pages, but makes a separate copy if
1015  * the "write_access" parameter is true in order to avoid the next
1016  * page fault.
1017  */
1018 void do_no_page(struct vm_area_struct * vma, unsigned long address,
1019         int write_access)
1020 {
1021         pte_t * page_table;
1022         pte_t entry;
1023         unsigned long page;
1024 
1025         page_table = get_empty_pgtable(vma->vm_task,address);
1026         if (!page_table)
1027                 return;
1028         entry = *page_table;
1029         if (pte_present(entry))
1030                 return;
1031         if (!pte_none(entry)) {
1032                 do_swap_page(vma, address, page_table, entry, write_access);
1033                 return;
1034         }
1035         address &= PAGE_MASK;
1036         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1037                 ++vma->vm_task->mm->rss;
1038                 ++vma->vm_task->mm->min_flt;
1039                 get_empty_page(vma, page_table);
1040                 return;
1041         }
1042         page = get_free_page(GFP_KERNEL);
1043         if (share_page(vma, address, write_access, page)) {
1044                 ++vma->vm_task->mm->min_flt;
1045                 ++vma->vm_task->mm->rss;
1046                 return;
1047         }
1048         if (!page) {
1049                 oom(current);
1050                 put_page(page_table, BAD_PAGE);
1051                 return;
1052         }
1053         ++vma->vm_task->mm->maj_flt;
1054         ++vma->vm_task->mm->rss;
1055         /*
1056          * The fourth argument is "no_share", which tells the low-level code
1057          * to copy, not share the page even if sharing is possible.  It's
1058          * essentially an early COW detection 
1059          */
1060         page = vma->vm_ops->nopage(vma, address, page,
1061                 write_access && !(vma->vm_flags & VM_SHARED));
1062         if (share_page(vma, address, write_access, 0)) {
1063                 free_page(page);
1064                 return;
1065         }
1066         /*
1067          * This silly early PAGE_DIRTY setting removes a race
1068          * due to the bad i386 page protection. But it's valid
1069          * for other architectures too.
1070          *
1071          * Note that if write_access is true, we either now have
 1072  * an exclusive copy of the page, or this is a shared mapping,
1073          * so we can make it writable and dirty to avoid having to
1074          * handle that later.
1075          */
1076         entry = mk_pte(page, vma->vm_page_prot);
1077         if (write_access) {
1078                 entry = pte_mkwrite(pte_mkdirty(entry));
1079         } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
1080                 entry = pte_wrprotect(entry);
1081         put_page(page_table, entry);
1082 }
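
The vma->vm_ops->nopage() hook that do_no_page() relies on is handed a
pre-allocated free page and must return the page to be mapped. A minimal
sketch of such a handler, matching the call above (the fill helper is
hypothetical):

	static unsigned long sample_nopage(struct vm_area_struct * vma, unsigned long address,
		unsigned long page, int no_share)
	{
		unsigned long offset = address - vma->vm_start + vma->vm_offset;

		/* 'no_share' asks for a private copy even when the backing page
		 * could otherwise be shared */
		fill_page_from_backing_store(page, offset);	/* hypothetical helper */
		return page;
	}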
1083 
1084 /*
1085  * The above separate functions for the no-page and wp-page
1086  * cases will go away (they mostly do the same thing anyway),
1087  * and we'll instead use only a general "handle_mm_fault()".
1088  *
1089  * These routines also need to handle stuff like marking pages dirty
1090  * and/or accessed for architectures that don't do it in hardware (most
1091  * RISC architectures).  The early dirtying is also good on the i386.
1092  *
1093  * There is also a hook called "update_mmu_cache()" that architectures
1094  * with external mmu caches can use to update those (ie the Sparc or
1095  * PowerPC hashed page tables that act as extended TLBs).
1096  */
1097 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1098         int write_access, pte_t * pte)
1099 {
1100         if (!pte_present(*pte)) {
1101                 do_no_page(vma, address, write_access);
1102                 return;
1103         }
1104         *pte = pte_mkyoung(*pte);
1105         if (!write_access)
1106                 return;
1107         if (pte_write(*pte)) {
1108                 *pte = pte_mkdirty(*pte);
1109                 return;
1110         }
1111         do_wp_page(vma, address, write_access);
1112 }
1113 
1114 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1115         int write_access)
1116 {
1117         pgd_t *pgd;
1118         pmd_t *pmd;
1119         pte_t *pte;
1120 
1121         pgd = pgd_offset(vma->vm_task, address);
1122         pmd = pmd_alloc(pgd, address);
1123         if (!pmd)
1124                 goto no_memory;
1125         pte = pte_alloc(pmd, address);
1126         if (!pte)
1127                 goto no_memory;
1128         handle_pte_fault(vma, address, write_access, pte);
1129         update_mmu_cache(vma, address, *pte);
1130         return;
1131 no_memory:
1132         oom(vma->vm_task);
1133 }
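
handle_mm_fault() is the architecture-independent entry point: an
architecture's page-fault handler is expected to find the vma, check the
access rights, and then hand over. A minimal sketch of that calling sequence
(the surrounding checks and labels are illustrative):

	vma = find_vma(current, address);
	if (!vma || vma->vm_start > address)
		goto bad_area;			/* no mapping covers the fault */
	if (write && !(vma->vm_flags & VM_WRITE))
		goto bad_area;			/* write to a read-only mapping */
	handle_mm_fault(vma, address, write);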
