root/mm/memory.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 #include <linux/swap.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 #include <asm/string.h>
  52 
     /*
      * Boundary that page addresses are compared against in this file: the
      * checks in forget_pte(), copy_one_pte() and remap_pte_range() skip the
      * mem_map lookup for anything at or above it.  Starts out as 0 here;
      * presumably assigned during memory initialisation - TODO confirm.
      */
  53 unsigned long high_memory = 0;
  54 
  55 /*
  56  * We special-case the C-O-W ZERO_PAGE, because it's such
  57  * a common occurrence (no need to read the page to know
  58  * that it's zero - better for the cache and memory subsystem).
  59  */
  60 static inline void copy_page(unsigned long from, unsigned long to)
     /* [previous][next][first][last][top][bottom][index][help] */
  61 {
  62         if (from == ZERO_PAGE) {
  63                 memset((void *) to, 0, PAGE_SIZE);
  64                 return;
  65         }
  66         memcpy((void *) to, (void *) from, PAGE_SIZE);
  67 }
  68 
     /*
      * Number of page-directory entries spanned by TASK_SIZE.  Used below as
      * the count of user entries at the start of a page directory.
      */
  69 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  70 
     /*
      * Per-page bookkeeping array, indexed with MAP_NR(address) in this file.
      * The code here reads and updates each entry's "count" and tests entries
      * with PageReserved().  NULL until assigned elsewhere.
      */
  71 mem_map_t * mem_map = NULL;
  72 
  73 /*
  74  * oom() prints a message (so that the user knows why the process died),
  75  * and gives the process an untrappable SIGKILL.
  76  */
  77 void oom(struct task_struct * task)
     /* [previous][next][first][last][top][bottom][index][help] */
  78 {
  79         printk("\nOut of memory for %s.\n", current->comm);
  80         task->sig->action[SIGKILL-1].sa_handler = NULL;
  81         task->blocked &= ~(1<<(SIGKILL-1));
  82         send_sig(SIGKILL,task,1);
  83 }
  84 
  85 /*
  86  * Note: this doesn't free the actual pages themselves. That
  87  * has been handled earlier when unmapping all the memory regions.
  88  */
  89 static inline void free_one_pmd(pmd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
  90 {
  91         pte_t * pte;
  92 
  93         if (pmd_none(*dir))
  94                 return;
  95         if (pmd_bad(*dir)) {
  96                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  97                 pmd_clear(dir);
  98                 return;
  99         }
 100         pte = pte_offset(dir, 0);
 101         pmd_clear(dir);
 102         pte_free(pte);
 103 }
 104 
 105 static inline void free_one_pgd(pgd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
 106 {
 107         int j;
 108         pmd_t * pmd;
 109 
 110         if (pgd_none(*dir))
 111                 return;
 112         if (pgd_bad(*dir)) {
 113                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 114                 pgd_clear(dir);
 115                 return;
 116         }
 117         pmd = pmd_offset(dir, 0);
 118         pgd_clear(dir);
 119         for (j = 0; j < PTRS_PER_PMD ; j++)
 120                 free_one_pmd(pmd+j);
 121         pmd_free(pmd);
 122 }
 123         
 124 /*
 125  * This function clears all user-level page tables of a process - this
 126  * is needed by execve(), so that old pages aren't in the way.
 127  */
 128 void clear_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 129 {
 130         int i;
 131         pgd_t * page_dir;
 132 
 133         page_dir = tsk->mm->pgd;
 134         if (!page_dir || page_dir == swapper_pg_dir) {
 135                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 136                 return;
 137         }
 138         flush_cache_mm(tsk->mm);
 139         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 140                 free_one_pgd(page_dir + i);
 141         flush_tlb_mm(tsk->mm);
 142 }
 143 
 144 /*
 145  * This function frees up all page tables of a process when it exits. It
 146  * is the same as "clear_page_tables()", except it also changes the process'
 147  * page table directory to the kernel page tables and then frees the old
 148  * page table directory.
 149  */
 150 void free_page_tables(struct mm_struct * mm)
     /* [previous][next][first][last][top][bottom][index][help] */
 151 {
 152         int i;
 153         pgd_t * page_dir;
 154 
 155         page_dir = mm->pgd;
 156         if (!page_dir || page_dir == swapper_pg_dir) {
 157                 printk("Trying to free kernel page-directory: not good\n");
 158                 return;
 159         }
 160         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 161                 free_one_pgd(page_dir + i);
 162         pgd_free(page_dir);
 163 }
 164 
 165 int new_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 166 {
 167         pgd_t * page_dir, * new_pg;
 168 
 169         if (!(new_pg = pgd_alloc()))
 170                 return -ENOMEM;
 171         page_dir = pgd_offset(&init_mm, 0);
 172         flush_cache_mm(tsk->mm);
 173         memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
 174                (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
 175         flush_tlb_mm(tsk->mm);
 176         SET_PAGE_DIR(tsk, new_pg);
 177         tsk->mm->pgd = new_pg;
 178         return 0;
 179 }
 180 
     /*
      * Copy a single page-table entry from "old_pte" to "new_pte".  With "cow"
      * non-zero, an ordinary page ends up write-protected in both tables.
      */
 181 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
 182 {
 183         pte_t pte = *old_pte;
 184         unsigned long page_nr;
 185 
             /* empty entry: nothing to copy */
 186         if (pte_none(pte))
 187                 return;
             /*
              * Not present: the raw entry value is handed to swap_duplicate()
              * (presumably a swap entry gaining a reference - confirm in the
              * swap code) and the entry is copied unchanged.
              */
 188         if (!pte_present(pte)) {
 189                 swap_duplicate(pte_val(pte));
 190                 set_pte(new_pte, pte);
 191                 return;
 192         }
 193         page_nr = MAP_NR(pte_page(pte));
             /*
              * Pages beyond high_memory and reserved pages are copied verbatim:
              * no write-protection and no change to mem_map counts.
              */
 194         if (page_nr >= MAP_NR(high_memory) || PageReserved(mem_map+page_nr)) {
 195                 set_pte(new_pte, pte);
 196                 return;
 197         }
 198         if (cow)
 199                 pte = pte_wrprotect(pte);
             /* a non-zero return from delete_from_swap_cache() marks the pte dirty */
 200         if (delete_from_swap_cache(page_nr))
 201                 pte = pte_mkdirty(pte);
             /* the new entry is additionally marked old ... */
 202         set_pte(new_pte, pte_mkold(pte));
             /* ... and the source entry is rewritten too, so it picks up the
              * write-protect/dirty changes made above */
 203         set_pte(old_pte, pte);
             /* the page now has one more mapping */
 204         mem_map[page_nr].count++;
 205 }
 206 
 207 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
 208 {
 209         pte_t * src_pte, * dst_pte;
 210         unsigned long end;
 211 
 212         if (pmd_none(*src_pmd))
 213                 return 0;
 214         if (pmd_bad(*src_pmd)) {
 215                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 216                 pmd_clear(src_pmd);
 217                 return 0;
 218         }
 219         src_pte = pte_offset(src_pmd, address);
 220         if (pmd_none(*dst_pmd)) {
 221                 if (!pte_alloc(dst_pmd, 0))
 222                         return -ENOMEM;
 223         }
 224         dst_pte = pte_offset(dst_pmd, address);
 225         address &= ~PMD_MASK;
 226         end = address + size;
 227         if (end >= PMD_SIZE)
 228                 end = PMD_SIZE;
 229         do {
 230                 /* I would like to switch arguments here, to make it
 231                  * consistent with copy_xxx_range and memcpy syntax.
 232                  */
 233                 copy_one_pte(src_pte++, dst_pte++, cow);
 234                 address += PAGE_SIZE;
 235         } while (address < end);
 236         return 0;
 237 }
 238 
 239 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
 240 {
 241         pmd_t * src_pmd, * dst_pmd;
 242         unsigned long end;
 243         int error = 0;
 244 
 245         if (pgd_none(*src_pgd))
 246                 return 0;
 247         if (pgd_bad(*src_pgd)) {
 248                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 249                 pgd_clear(src_pgd);
 250                 return 0;
 251         }
 252         src_pmd = pmd_offset(src_pgd, address);
 253         if (pgd_none(*dst_pgd)) {
 254                 if (!pmd_alloc(dst_pgd, 0))
 255                         return -ENOMEM;
 256         }
 257         dst_pmd = pmd_offset(dst_pgd, address);
 258         address &= ~PGDIR_MASK;
 259         end = address + size;
 260         if (end > PGDIR_SIZE)
 261                 end = PGDIR_SIZE;
 262         do {
 263                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 264                 if (error)
 265                         break;
 266                 address = (address + PMD_SIZE) & PMD_MASK; 
 267         } while (address < end);
 268         return error;
 269 }
 270 
 271 /*
 272  * copy one vm_area from one task to the other. Assumes the page tables
 273  * already present in the new task to be cleared in the whole range
 274  * covered by this vma.
 275  */
 276 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
     /* [previous][next][first][last][top][bottom][index][help] */
 277                         struct vm_area_struct *vma)
 278 {
 279         pgd_t * src_pgd, * dst_pgd;
 280         unsigned long address = vma->vm_start;
 281         unsigned long end = vma->vm_end;
 282         int error = 0, cow;
 283 
 284         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 285         src_pgd = pgd_offset(src, address);
 286         dst_pgd = pgd_offset(dst, address);
 287         flush_cache_range(src, vma->vm_start, vma->vm_end);
 288         flush_cache_range(dst, vma->vm_start, vma->vm_end);
 289         while (address < end) {
 290                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 291                 if (error)
 292                         break;
 293                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 294         }
 295         /* Note that the src ptes get c-o-w treatment, so they change too. */
 296         flush_tlb_range(src, vma->vm_start, vma->vm_end);
 297         flush_tlb_range(dst, vma->vm_start, vma->vm_end);
 298         return error;
 299 }
 300 
 301 static inline void forget_pte(pte_t page)
     /* [previous][next][first][last][top][bottom][index][help] */
 302 {
 303         if (pte_none(page))
 304                 return;
 305         if (pte_present(page)) {
 306                 unsigned long addr = pte_page(page);
 307                 if (addr >= high_memory || PageReserved(mem_map+MAP_NR(addr)))
 308                         return;
 309                 free_page(addr);
 310                 if (current->mm->rss <= 0)
 311                         return;
 312                 current->mm->rss--;
 313                 return;
 314         }
 315         swap_free(pte_val(page));
 316 }
 317 
 318 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 319 {
 320         pte_t * pte;
 321         unsigned long end;
 322 
 323         if (pmd_none(*pmd))
 324                 return;
 325         if (pmd_bad(*pmd)) {
 326                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 327                 pmd_clear(pmd);
 328                 return;
 329         }
 330         pte = pte_offset(pmd, address);
 331         address &= ~PMD_MASK;
 332         end = address + size;
 333         if (end >= PMD_SIZE)
 334                 end = PMD_SIZE;
 335         do {
 336                 pte_t page = *pte;
 337                 pte_clear(pte);
 338                 forget_pte(page);
 339                 address += PAGE_SIZE;
 340                 pte++;
 341         } while (address < end);
 342 }
 343 
 344 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 345 {
 346         pmd_t * pmd;
 347         unsigned long end;
 348 
 349         if (pgd_none(*dir))
 350                 return;
 351         if (pgd_bad(*dir)) {
 352                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 353                 pgd_clear(dir);
 354                 return;
 355         }
 356         pmd = pmd_offset(dir, address);
 357         address &= ~PGDIR_MASK;
 358         end = address + size;
 359         if (end > PGDIR_SIZE)
 360                 end = PGDIR_SIZE;
 361         do {
 362                 zap_pte_range(pmd, address, end - address);
 363                 address = (address + PMD_SIZE) & PMD_MASK; 
 364                 pmd++;
 365         } while (address < end);
 366 }
 367 
 368 /*
 369  * remove user pages in a given range.
 370  */
 371 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 372 {
 373         pgd_t * dir;
 374         unsigned long end = address + size;
 375 
 376         dir = pgd_offset(mm, address);
 377         flush_cache_range(mm, end - size, end);
 378         while (address < end) {
 379                 zap_pmd_range(dir, address, end - address);
 380                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 381                 dir++;
 382         }
 383         flush_tlb_range(mm, end - size, end);
 384         return 0;
 385 }
 386 
 387 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 388 {
 389         unsigned long end;
 390 
 391         address &= ~PMD_MASK;
 392         end = address + size;
 393         if (end > PMD_SIZE)
 394                 end = PMD_SIZE;
 395         do {
 396                 pte_t oldpage = *pte;
 397                 set_pte(pte, zero_pte);
 398                 forget_pte(oldpage);
 399                 address += PAGE_SIZE;
 400                 pte++;
 401         } while (address < end);
 402 }
 403 
 404 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 405 {
 406         unsigned long end;
 407 
 408         address &= ~PGDIR_MASK;
 409         end = address + size;
 410         if (end > PGDIR_SIZE)
 411                 end = PGDIR_SIZE;
 412         do {
 413                 pte_t * pte = pte_alloc(pmd, address);
 414                 if (!pte)
 415                         return -ENOMEM;
 416                 zeromap_pte_range(pte, address, end - address, zero_pte);
 417                 address = (address + PMD_SIZE) & PMD_MASK;
 418                 pmd++;
 419         } while (address < end);
 420         return 0;
 421 }
 422 
 423 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
 424 {
 425         int error = 0;
 426         pgd_t * dir;
 427         unsigned long beg = address;
 428         unsigned long end = address + size;
 429         pte_t zero_pte;
 430 
 431         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 432         dir = pgd_offset(current->mm, address);
 433         flush_cache_range(current->mm, beg, end);
 434         while (address < end) {
 435                 pmd_t *pmd = pmd_alloc(dir, address);
 436                 error = -ENOMEM;
 437                 if (!pmd)
 438                         break;
 439                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 440                 if (error)
 441                         break;
 442                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 443                 dir++;
 444         }
 445         flush_tlb_range(current->mm, beg, end);
 446         return error;
 447 }
 448 
 449 /*
 450  * maps a range of physical memory into the requested pages. the old
 451  * mappings are removed. any references to nonexistent pages results
 452  * in null mappings (currently treated as "copy-on-access")
 453  */
 454 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 455         unsigned long offset, pgprot_t prot)
 456 {
 457         unsigned long end;
 458 
 459         address &= ~PMD_MASK;
 460         end = address + size;
 461         if (end > PMD_SIZE)
 462                 end = PMD_SIZE;
 463         do {
 464                 pte_t oldpage = *pte;
 465                 pte_clear(pte);
 466                 if (offset >= high_memory || PageReserved(mem_map+MAP_NR(offset)))
 467                         set_pte(pte, mk_pte(offset, prot));
 468                 forget_pte(oldpage);
 469                 address += PAGE_SIZE;
 470                 offset += PAGE_SIZE;
 471                 pte++;
 472         } while (address < end);
 473 }
 474 
 475 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 476         unsigned long offset, pgprot_t prot)
 477 {
 478         unsigned long end;
 479 
 480         address &= ~PGDIR_MASK;
 481         end = address + size;
 482         if (end > PGDIR_SIZE)
 483                 end = PGDIR_SIZE;
 484         offset -= address;
 485         do {
 486                 pte_t * pte = pte_alloc(pmd, address);
 487                 if (!pte)
 488                         return -ENOMEM;
 489                 remap_pte_range(pte, address, end - address, address + offset, prot);
 490                 address = (address + PMD_SIZE) & PMD_MASK;
 491                 pmd++;
 492         } while (address < end);
 493         return 0;
 494 }
 495 
 496 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
 497 {
 498         int error = 0;
 499         pgd_t * dir;
 500         unsigned long beg = from;
 501         unsigned long end = from + size;
 502 
 503         offset -= from;
 504         dir = pgd_offset(current->mm, from);
 505         flush_cache_range(current->mm, beg, end);
 506         while (from < end) {
 507                 pmd_t *pmd = pmd_alloc(dir, from);
 508                 error = -ENOMEM;
 509                 if (!pmd)
 510                         break;
 511                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 512                 if (error)
 513                         break;
 514                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 515                 dir++;
 516         }
 517         flush_tlb_range(current->mm, beg, end);
 518         return error;
 519 }
 520 
 521 /*
 522  * sanity-check function..
 523  */
 524 static void put_page(pte_t * page_table, pte_t pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 525 {
 526         if (!pte_none(*page_table)) {
 527                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 528                 free_page(pte_page(pte));
 529                 return;
 530         }
 531 /* no need for flush_tlb */
 532         set_pte(page_table, pte);
 533 }
 534 
 535 /*
 536  * This routine is used to map in a page into an address space: needed by
 537  * execve() for the initial stack and environment pages.
 538  */
 539 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 540 {
 541         pgd_t * pgd;
 542         pmd_t * pmd;
 543         pte_t * pte;
 544 
 545         if (page >= high_memory)
 546                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 547         if (mem_map[MAP_NR(page)].count != 1)
 548                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 549         pgd = pgd_offset(tsk->mm,address);
 550         pmd = pmd_alloc(pgd, address);
 551         if (!pmd) {
 552                 free_page(page);
 553                 oom(tsk);
 554                 return 0;
 555         }
 556         pte = pte_alloc(pmd, address);
 557         if (!pte) {
 558                 free_page(page);
 559                 oom(tsk);
 560                 return 0;
 561         }
 562         if (!pte_none(*pte)) {
 563                 printk("put_dirty_page: page already exists\n");
 564                 free_page(page);
 565                 return 0;
 566         }
 567         flush_page_to_ram(page);
 568         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 569 /* no need for invalidate */
 570         return page;
 571 }
 572 
 573 /*
 574  * This routine handles present pages, when users try to write
 575  * to a shared page. It is done by copying the page to a new address
 576  * and decrementing the shared-page counter for the old page.
 577  *
 578  * Goto-purists beware: the only reason for goto's here is that it results
 579  * in better assembly code.. The "default" path will see no jumps at all.
 580  *
 581  * Note that this routine assumes that the protection checks have been
 582  * done by the caller (the low-level page fault routine in most cases).
 583  * Thus we can safely just mark it writable once we've done any necessary
 584  * COW.
 585  *
 586  * We also mark the page dirty at this point even though the page will
 587  * change only once the write actually happens. This avoids a few races,
 588  * and potentially makes it more efficient.
 589  */
     /*
      * Resolve a write to a present, write-protected page at "address" in
      * "vma" on behalf of "tsk".  A spare page is allocated up front; it is
      * consumed only when the old page turns out to be shared and is freed
      * on every other path.  "write_access" is not referenced in the body.
      */
 590 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 591         unsigned long address, int write_access)
 592 {
 593         pgd_t *page_dir;
 594         pmd_t *page_middle;
 595         pte_t *page_table, pte;
 596         unsigned long old_page, new_page;
 597 
             /* may be 0 on failure; that is only checked once a copy is needed */
 598         new_page = __get_free_page(GFP_KERNEL);
             /* walk pgd -> pmd -> pte; a missing level just ends the fault */
 599         page_dir = pgd_offset(vma->vm_mm, address);
 600         if (pgd_none(*page_dir))
 601                 goto end_wp_page;
 602         if (pgd_bad(*page_dir))
 603                 goto bad_wp_pagedir;
 604         page_middle = pmd_offset(page_dir, address);
 605         if (pmd_none(*page_middle))
 606                 goto end_wp_page;
 607         if (pmd_bad(*page_middle))
 608                 goto bad_wp_pagemiddle;
 609         page_table = pte_offset(page_middle, address);
 610         pte = *page_table;
             /* nothing to do if the page is absent or already writable */
 611         if (!pte_present(pte))
 612                 goto end_wp_page;
 613         if (pte_write(pte))
 614                 goto end_wp_page;
 615         old_page = pte_page(pte);
 616         if (old_page >= high_memory)
 617                 goto bad_wp_page;
 618         tsk->min_flt++;
 619         /*
 620          * Do we need to copy?
 621          */
             /* a count other than 1 means the page is not exclusively ours */
 622         if (mem_map[MAP_NR(old_page)].count != 1) {
 623                 if (new_page) {
                             /*
                              * rss goes up when the page being replaced is a
                              * reserved one; forget_pte() never decrements rss
                              * for reserved pages, so presumably the old mapping
                              * was not counted - TODO confirm.
                              */
 624                         if (PageReserved(mem_map + MAP_NR(old_page)))
 625                                 ++vma->vm_mm->rss;
 626                         copy_page(old_page,new_page);
 627                         flush_page_to_ram(old_page);
 628                         flush_page_to_ram(new_page);
 629                         flush_cache_page(vma, address);
                             /* map the private copy, writable and already dirty */
 630                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
                             /* this mapping no longer refers to the old page */
 631                         free_page(old_page);
 632                         flush_tlb_page(vma, address);
 633                         return;
 634                 }
                     /* no spare page: map BAD_PAGE instead and kill the task */
 635                 flush_cache_page(vma, address);
 636                 set_pte(page_table, BAD_PAGE);
 637                 flush_tlb_page(vma, address);
 638                 free_page(old_page);
 639                 oom(tsk);
 640                 return;
 641         }
             /* sole user: make the existing page writable and dirty in place */
 642         flush_cache_page(vma, address);
 643         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 644         flush_tlb_page(vma, address);
 645         if (new_page)
 646                 free_page(new_page);
 647         return;
             /* error exits: report, SIGKILL the task, then release the spare page */
 648 bad_wp_page:
 649         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 650         send_sig(SIGKILL, tsk, 1);
 651         goto end_wp_page;
 652 bad_wp_pagemiddle:
 653         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 654         send_sig(SIGKILL, tsk, 1);
 655         goto end_wp_page;
 656 bad_wp_pagedir:
 657         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 658         send_sig(SIGKILL, tsk, 1);
 659 end_wp_page:
 660         if (new_page)
 661                 free_page(new_page);
 662         return;
 663 }
 664 
 665 /*
 666  * Ugly, ugly, but the goto's result in better assembly..
 667  */
     /*
      * Check that the current process may access "size" bytes at "addr".
      * "type" equal to VERIFY_WRITE requests write access; anything else is
      * checked for read access.  Returns 0 when the whole range is covered
      * by suitably permitted, contiguous vm areas, -EFAULT otherwise.
      */
 668 int verify_area(int type, const void * addr, unsigned long size)
 669 {
 670         struct vm_area_struct * vma;
 671         unsigned long start = (unsigned long) addr;
 672 
 673         /* If the current user space is mapped to kernel space (for the
 674          * case where we use a fake user buffer with get_fs/set_fs()) we
 675          * don't expect to find the address in the user vm map.
 676          */
 677         if (!size || get_fs() == KERNEL_DS)
 678                 return 0;
 679 
 680         vma = find_vma(current, start);
 681         if (!vma)
 682                 goto bad_area;
             /* the area found begins above "start": only acceptable if it is a
              * grows-down area that can be extended (see check_stack) */
 683         if (vma->vm_start > start)
 684                 goto check_stack;
 685 
 686 good_area:
 687         if (type == VERIFY_WRITE)
 688                 goto check_write;
             /* read check: every area must be readable; step through adjacent
              * areas until the end of the range is covered */
 689         for (;;) {
 690                 struct vm_area_struct * next;
 691                 if (!(vma->vm_flags & VM_READ))
 692                         goto bad_area;
 693                 if (vma->vm_end - start >= size)
 694                         return 0;
 695                 next = vma->vm_next;
                     /* a hole between areas makes the range invalid */
 696                 if (!next || vma->vm_end != next->vm_start)
 697                         goto bad_area;
 698                 vma = next;
 699         }
 700 
 701 check_write:
 702         if (!(vma->vm_flags & VM_WRITE))
 703                 goto bad_area;
             /* when wp_works_ok is zero, each page is write-faulted by hand */
 704         if (!wp_works_ok)
 705                 goto check_wp_fault_by_hand;
             /* same walk as the read case, with VM_WRITE required instead */
 706         for (;;) {
 707                 if (vma->vm_end - start >= size)
 708                         break;
 709                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 710                         goto bad_area;
 711                 vma = vma->vm_next;
 712                 if (!(vma->vm_flags & VM_WRITE))
 713                         goto bad_area;
 714         }
 715         return 0;
 716 
 717 check_wp_fault_by_hand:
             /* turn "size" into the number of pages after the first one that
              * the range touches, and round "start" down to a page boundary */
 718         size--;
 719         size += start & ~PAGE_MASK;
 720         size >>= PAGE_SHIFT;
 721         start &= PAGE_MASK;
 722 
 723         for (;;) {
                     /* resolve any pending copy-on-write for this page */
 724                 do_wp_page(current, vma, start, 1);
 725                 if (!size)
 726                         break;
 727                 size--;
 728                 start += PAGE_SIZE;
 729                 if (start < vma->vm_end)
 730                         continue;
                     /* ran off the end of this area: the next one must follow
                      * on directly and be writable */
 731                 vma = vma->vm_next;
 732                 if (!vma || vma->vm_start != start)
 733                         goto bad_area;
 734                 if (!(vma->vm_flags & VM_WRITE))
 735                         goto bad_area;;
 736         }
 737         return 0;
 738 
 739 check_stack:
 740         if (!(vma->vm_flags & VM_GROWSDOWN))
 741                 goto bad_area;
             /* a zero return from expand_stack() lets the checks proceed;
              * anything else falls through to bad_area */
 742         if (expand_stack(vma, start) == 0)
 743                 goto good_area;
 744 
 745 bad_area:
 746         return -EFAULT;
 747 }
 748 
 749 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 750         pte_t * page_table, int write_access)
 751 {
 752         pte_t pte;
 753 
 754         pte = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
 755         if (write_access) {
 756                 unsigned long page = get_free_page(GFP_KERNEL);
 757                 pte = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 758                 vma->vm_mm->rss++;
 759                 tsk->min_flt++;
 760                 if (!page) {
 761                         oom(tsk);
 762                         pte = BAD_PAGE;
 763                 }
 764                 flush_page_to_ram(page);
 765         }
 766         put_page(page_table, pte);
 767 }
 768 
 769 /*
 770  * This function zeroes out partial mmap'ed pages at truncation time..
 771  */
 772 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 773 {
 774         pgd_t *page_dir;
 775         pmd_t *page_middle;
 776         pte_t *page_table, pte;
 777 
 778         page_dir = pgd_offset(vma->vm_mm, address);
 779         if (pgd_none(*page_dir))
 780                 return;
 781         if (pgd_bad(*page_dir)) {
 782                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 783                 pgd_clear(page_dir);
 784                 return;
 785         }
 786         page_middle = pmd_offset(page_dir, address);
 787         if (pmd_none(*page_middle))
 788                 return;
 789         if (pmd_bad(*page_middle)) {
 790                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 791                 pmd_clear(page_middle);
 792                 return;
 793         }
 794         page_table = pte_offset(page_middle, address);
 795         pte = *page_table;
 796         if (!pte_present(pte))
 797                 return;
 798         flush_cache_page(vma, address);
 799         address &= ~PAGE_MASK;
 800         address += pte_page(pte);
 801         if (address >= high_memory)
 802                 return;
 803         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 804         flush_page_to_ram(pte_page(pte));
 805 }
 806 
 807 /*
 808  * Handle all mappings that got truncated by a "truncate()"
 809  * system call.
 810  *
 811  * NOTE! We have to be ready to update the memory sharing
 812  * between the file and the memory map for a potential last
 813  * incomplete page.  Ugly, but necessary.
 814  */
 815 void vmtruncate(struct inode * inode, unsigned long offset)
     /* [previous][next][first][last][top][bottom][index][help] */
 816 {
 817         struct vm_area_struct * mpnt;
 818 
 819         truncate_inode_pages(inode, offset);
 820         if (!inode->i_mmap)
 821                 return;
 822         mpnt = inode->i_mmap;
 823         do {
 824                 unsigned long start = mpnt->vm_start;
 825                 unsigned long len = mpnt->vm_end - start;
 826                 unsigned long diff;
 827 
 828                 /* mapping wholly truncated? */
 829                 if (mpnt->vm_offset >= offset) {
 830                         zap_page_range(mpnt->vm_mm, start, len);
 831                         continue;
 832                 }
 833                 /* mapping wholly unaffected? */
 834                 diff = offset - mpnt->vm_offset;
 835                 if (diff >= len)
 836                         continue;
 837                 /* Ok, partially affected.. */
 838                 start += diff;
 839                 len = (len - diff) & PAGE_MASK;
 840                 if (start & ~PAGE_MASK) {
 841                         partial_clear(mpnt, start);
 842                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 843                 }
 844                 zap_page_range(mpnt->vm_mm, start, len);
 845         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 846 }
 847 
 848 /*
 849  * fill in an empty page-table if none exists.
 850  */
 851 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 852 {
 853         pgd_t *pgd;
 854         pmd_t *pmd;
 855         pte_t *pte;
 856 
 857         pgd = pgd_offset(tsk->mm, address);
 858         pmd = pmd_alloc(pgd, address);
 859         if (!pmd) {
 860                 oom(tsk);
 861                 return NULL;
 862         }
 863         pte = pte_alloc(pmd, address);
 864         if (!pte) {
 865                 oom(tsk);
 866                 return NULL;
 867         }
 868         return pte;
 869 }
 870 
 871 static inline void do_swap_page(struct task_struct * tsk, 
     /* [previous][next][first][last][top][bottom][index][help] */
 872         struct vm_area_struct * vma, unsigned long address,
 873         pte_t * page_table, pte_t entry, int write_access)
 874 {
 875         pte_t page;
 876 
 877         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 878                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 879                 flush_page_to_ram(pte_page(*page_table));
 880                 return;
 881         }
 882         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 883         if (pte_val(*page_table) != pte_val(entry)) {
 884                 free_page(pte_page(page));
 885                 return;
 886         }
 887         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 888                 page = pte_wrprotect(page);
 889         ++vma->vm_mm->rss;
 890         ++tsk->maj_flt;
 891         flush_page_to_ram(pte_page(page));
 892         set_pte(page_table, page);
 893         return;
 894 }
 895 
 896 /*
 897  * do_no_page() tries to create a new page mapping. It aggressively
 898  * tries to share with existing pages, but makes a separate copy if
 899  * the "write_access" parameter is true in order to avoid the next
 900  * page fault.
 901  */
 902 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 903         unsigned long address, int write_access)
 904 {
 905         pte_t * page_table;
 906         pte_t entry;
 907         unsigned long page;
 908 
 909         page_table = get_empty_pgtable(tsk, address);
 910         if (!page_table)
 911                 return;
 912         entry = *page_table;
 913         if (pte_present(entry))
 914                 return;
 915         if (!pte_none(entry)) {
 916                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 917                 return;
 918         }
 919         address &= PAGE_MASK;
 920         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 921                 flush_cache_page(vma, address);
 922                 get_empty_page(tsk, vma, page_table, write_access);
 923                 return;
 924         }
 925         ++tsk->maj_flt;
 926         ++vma->vm_mm->rss;
 927         /*
 928          * The third argument is "no_share", which tells the low-level code
 929          * to copy, not share the page even if sharing is possible.  It's
 930          * essentially an early COW detection 
 931          */
 932         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
 933         if (!page) {
 934                 force_sig(SIGBUS, current);
 935                 flush_cache_page(vma, address);
 936                 put_page(page_table, BAD_PAGE);
 937                 flush_tlb_page(vma, address);
 938                 return;
 939         }
 940         /*
 941          * This silly early PAGE_DIRTY setting removes a race
 942          * due to the bad i386 page protection. But it's valid
 943          * for other architectures too.
 944          *
 945          * Note that if write_access is true, we either now have
 946          * a exclusive copy of the page, or this is a shared mapping,
 947          * so we can make it writable and dirty to avoid having to
 948          * handle that later.
 949          */
 950         flush_page_to_ram(page);
 951         entry = mk_pte(page, vma->vm_page_prot);
 952         if (write_access) {
 953                 entry = pte_mkwrite(pte_mkdirty(entry));
 954         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 955                 entry = pte_wrprotect(entry);
 956         flush_cache_page(vma, address);
 957         put_page(page_table, entry);
 958         flush_tlb_page(vma, address);
 959 }
 960 
 961 /*
 962  * The above separate functions for the no-page and wp-page
 963  * cases will go away (they mostly do the same thing anyway),
 964  * and we'll instead use only a general "handle_mm_fault()".
 965  *
 966  * These routines also need to handle stuff like marking pages dirty
 967  * and/or accessed for architectures that don't do it in hardware (most
 968  * RISC architectures).  The early dirtying is also good on the i386.
 969  *
 970  * There is also a hook called "update_mmu_cache()" that architectures
 971  * with external mmu caches can use to update those (ie the Sparc or
 972  * PowerPC hashed page tables that act as extended TLBs).
 973  */
 974 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 975         int write_access, pte_t * pte)
 976 {
 977         if (!pte_present(*pte)) {
 978                 do_no_page(current, vma, address, write_access);
 979                 return;
 980         }
 981         set_pte(pte, pte_mkyoung(*pte));
 982         flush_tlb_page(vma, address);
 983         if (!write_access)
 984                 return;
 985         if (pte_write(*pte)) {
 986                 set_pte(pte, pte_mkdirty(*pte));
 987                 flush_tlb_page(vma, address);
 988                 return;
 989         }
 990         do_wp_page(current, vma, address, write_access);
 991 }
 992 
 993 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 994         int write_access)
 995 {
 996         pgd_t *pgd;
 997         pmd_t *pmd;
 998         pte_t *pte;
 999 
1000         pgd = pgd_offset(vma->vm_mm, address);
1001         pmd = pmd_alloc(pgd, address);
1002         if (!pmd)
1003                 goto no_memory;
1004         pte = pte_alloc(pmd, address);
1005         if (!pte)
1006                 goto no_memory;
1007         handle_pte_fault(vma, address, write_access, pte);
1008         update_mmu_cache(vma, address, *pte);
1009         return;
1010 no_memory:
1011         oom(current);
1012 }

/* [previous][next][first][last][top][bottom][index][help] */