root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 #include <linux/swap.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 #include <asm/string.h>
  52 
  53 unsigned long high_memory = 0;
  54 
  55 /*
  56  * The free_area_list arrays point to the queue heads of the free areas
  57  * of different sizes
  58  */
  59 int nr_swap_pages = 0;
  60 int nr_free_pages = 0;
  61 struct mem_list free_area_list[NR_MEM_LISTS];
  62 unsigned int * free_area_map[NR_MEM_LISTS];
  63 
  64 /*
  65  * We special-case the C-O-W ZERO_PAGE, because it's such
  66  * a common occurrence (no need to read the page to know
  67  * that it's zero - better for the cache and memory subsystem).
  68  */
  69 static inline void copy_page(unsigned long from, unsigned long to)
  70 {
  71         if (from == ZERO_PAGE) {
  72                 memset((void *) to, 0, PAGE_SIZE);
  73                 return;
  74         }
  75         memcpy((void *) to, (void *) from, PAGE_SIZE);
  76 }
  77 
  78 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  79 
  80 mem_map_t * mem_map = NULL;
  81 
  82 /*
  83  * oom() prints a message (so that the user knows why the process died),
  84  * and gives the process an untrappable SIGKILL.
  85  */
  86 void oom(struct task_struct * task)
  87 {
  88         printk("\nOut of memory for %s.\n", current->comm);
  89         task->sig->action[SIGKILL-1].sa_handler = NULL;
  90         task->blocked &= ~(1<<(SIGKILL-1));
  91         send_sig(SIGKILL,task,1);
  92 }
  93 
  94 /*
  95  * Note: this doesn't free the actual pages themselves. That
  96  * has been handled earlier when unmapping all the memory regions.
  97  */
  98 static inline void free_one_pmd(pmd_t * dir)
  99 {
 100         pte_t * pte;
 101 
 102         if (pmd_none(*dir))
 103                 return;
 104         if (pmd_bad(*dir)) {
 105                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 106                 pmd_clear(dir);
 107                 return;
 108         }
 109         pte = pte_offset(dir, 0);
 110         pmd_clear(dir);
 111         pte_free(pte);
 112 }
 113 
 114 static inline void free_one_pgd(pgd_t * dir)
 115 {
 116         pmd_t * pmd;
 117 
 118         if (pgd_none(*dir))
 119                 return;
 120         if (pgd_bad(*dir)) {
 121                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 122                 pgd_clear(dir);
 123                 return;
 124         }
 125         pmd = pmd_offset(dir, 0);
 126         pgd_clear(dir);
 127         if (!pmd_inuse(pmd)) {
 128                 int j;
 129                 for (j = 0; j < PTRS_PER_PMD ; j++)
 130                         free_one_pmd(pmd+j);
 131         }
 132         pmd_free(pmd);
 133 }
 134         
 135 /*
 136  * This function clears all user-level page tables of a process - this
 137  * is needed by execve(), so that old pages aren't in the way.
 138  */
 139 void clear_page_tables(struct task_struct * tsk)
 140 {
 141         int i;
 142         pgd_t * page_dir;
 143 
 144         page_dir = tsk->mm->pgd;
 145         if (!page_dir || page_dir == swapper_pg_dir) {
 146                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 147                 return;
 148         }
 149         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 150                 free_one_pgd(page_dir + i);
 151         invalidate_mm(tsk->mm);
 152 }
 153 
 154 /*
 155  * This function frees up all page tables of a process when it exits. It
 156  * is the same as "clear_page_tables()", except it also changes the process'
 157  * page table directory to the kernel page tables and then frees the old
 158  * page table directory.
 159  */
 160 void free_page_tables(struct task_struct * tsk)
 161 {
 162         int i;
 163         pgd_t * page_dir;
 164 
 165         page_dir = tsk->mm->pgd;
 166         if (!page_dir || page_dir == swapper_pg_dir) {
 167                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 168                 return;
 169         }
 170         invalidate_mm(tsk->mm);
 171         SET_PAGE_DIR(tsk, swapper_pg_dir);
 172         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 173         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 174                 free_one_pgd(page_dir + i);
 175         pgd_free(page_dir);
 176 }
 177 
 178 int new_page_tables(struct task_struct * tsk)
 179 {
 180         pgd_t * page_dir, * new_pg;
 181 
 182         if (!(new_pg = pgd_alloc()))
 183                 return -ENOMEM;
 184         page_dir = pgd_offset(&init_mm, 0);
 185         memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
 186                (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
 187         invalidate_mm(tsk->mm);
 188         SET_PAGE_DIR(tsk, new_pg);
 189         tsk->mm->pgd = new_pg;
 190         return 0;
 191 }
 192 
 193 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
 194 {
 195         pte_t pte = *old_pte;
 196 
 197         if (pte_none(pte))
 198                 return;
 199         if (!pte_present(pte)) {
 200                 swap_duplicate(pte_val(pte));
 201                 set_pte(new_pte, pte);
 202                 return;
 203         }
 204         if (pte_page(pte) > high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 205                 set_pte(new_pte, pte);
 206                 return;
 207         }
 208         if (cow)
 209                 pte = pte_wrprotect(pte);
 210         if (delete_from_swap_cache(pte_page(pte)))
 211                 pte = pte_mkdirty(pte);
 212         set_pte(new_pte, pte_mkold(pte));
 213         set_pte(old_pte, pte);
 214         mem_map[MAP_NR(pte_page(pte))].count++;
 215 }
 216 
 217 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
 218 {
 219         pte_t * src_pte, * dst_pte;
 220         unsigned long end;
 221 
 222         if (pmd_none(*src_pmd))
 223                 return 0;
 224         if (pmd_bad(*src_pmd)) {
 225                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 226                 pmd_clear(src_pmd);
 227                 return 0;
 228         }
 229         src_pte = pte_offset(src_pmd, address);
 230         if (pmd_none(*dst_pmd)) {
 231                 if (!pte_alloc(dst_pmd, 0))
 232                         return -ENOMEM;
 233         }
 234         dst_pte = pte_offset(dst_pmd, address);
 235         address &= ~PMD_MASK;
 236         end = address + size;
 237         if (end >= PMD_SIZE)
 238                 end = PMD_SIZE;
 239         do {
 240                 /* I would like to switch arguments here, to make it
 241                  * consistent with copy_xxx_range and memcpy syntax.
 242                  */
 243                 copy_one_pte(src_pte++, dst_pte++, cow);
 244                 address += PAGE_SIZE;
 245         } while (address < end);
 246         return 0;
 247 }
 248 
 249 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
 250 {
 251         pmd_t * src_pmd, * dst_pmd;
 252         unsigned long end;
 253         int error = 0;
 254 
 255         if (pgd_none(*src_pgd))
 256                 return 0;
 257         if (pgd_bad(*src_pgd)) {
 258                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 259                 pgd_clear(src_pgd);
 260                 return 0;
 261         }
 262         src_pmd = pmd_offset(src_pgd, address);
 263         if (pgd_none(*dst_pgd)) {
 264                 if (!pmd_alloc(dst_pgd, 0))
 265                         return -ENOMEM;
 266         }
 267         dst_pmd = pmd_offset(dst_pgd, address);
 268         address &= ~PGDIR_MASK;
 269         end = address + size;
 270         if (end > PGDIR_SIZE)
 271                 end = PGDIR_SIZE;
 272         do {
 273                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 274                 if (error)
 275                         break;
 276                 address = (address + PMD_SIZE) & PMD_MASK; 
 277         } while (address < end);
 278         return error;
 279 }
 280 
 281 /*
 282  * Copy one vm_area from one task to the other. Assumes that the page
 283  * tables already present in the new task have been cleared over the
 284  * whole range covered by this vma.
 285  */
 286 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 287                         struct vm_area_struct *vma)
 288 {
 289         pgd_t * src_pgd, * dst_pgd;
 290         unsigned long address = vma->vm_start;
 291         unsigned long end = vma->vm_end;
 292         int error = 0, cow;
 293 
 294         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 295         src_pgd = pgd_offset(src, address);
 296         dst_pgd = pgd_offset(dst, address);
 297         while (address < end) {
 298                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 299                 if (error)
 300                         break;
 301                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 302         }
 303         /* Note that the src ptes get c-o-w treatment, so they change too. */
 304         invalidate_range(src, vma->vm_start, vma->vm_end);
 305         invalidate_range(dst, vma->vm_start, vma->vm_end);
 306         return error;
 307 }
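/*
 * Illustrative sketch, not part of memory.c: a fork-time caller (dup_mmap()
 * in the real tree; treat the name and the error handling here as
 * assumptions) would walk the parent's vma list and copy each range with
 * copy_page_range(), stopping at the first failure.
 */
#if 0
static int copy_all_vmas(struct mm_struct *dst, struct mm_struct *src)
{
        struct vm_area_struct * vma;

        for (vma = src->mmap; vma; vma = vma->vm_next) {
                int error = copy_page_range(dst, src, vma);
                if (error)
                        return error;   /* caller must undo the partial copy */
        }
        return 0;
}
#endif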
 308 
 309 static inline void forget_pte(pte_t page)
 310 {
 311         if (pte_none(page))
 312                 return;
 313         if (pte_present(page)) {
 314                 unsigned long addr = pte_page(page);
 315                 if (addr >= high_memory || mem_map[MAP_NR(addr)].reserved)
 316                         return;
 317                 free_page(addr);
 318                 if (current->mm->rss <= 0)
 319                         return;
 320                 current->mm->rss--;
 321                 return;
 322         }
 323         swap_free(pte_val(page));
 324 }
 325 
 326 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 327 {
 328         pte_t * pte;
 329         unsigned long end;
 330 
 331         if (pmd_none(*pmd))
 332                 return;
 333         if (pmd_bad(*pmd)) {
 334                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 335                 pmd_clear(pmd);
 336                 return;
 337         }
 338         pte = pte_offset(pmd, address);
 339         address &= ~PMD_MASK;
 340         end = address + size;
 341         if (end >= PMD_SIZE)
 342                 end = PMD_SIZE;
 343         do {
 344                 pte_t page = *pte;
 345                 pte_clear(pte);
 346                 forget_pte(page);
 347                 address += PAGE_SIZE;
 348                 pte++;
 349         } while (address < end);
 350 }
 351 
 352 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 353 {
 354         pmd_t * pmd;
 355         unsigned long end;
 356 
 357         if (pgd_none(*dir))
 358                 return;
 359         if (pgd_bad(*dir)) {
 360                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 361                 pgd_clear(dir);
 362                 return;
 363         }
 364         pmd = pmd_offset(dir, address);
 365         address &= ~PGDIR_MASK;
 366         end = address + size;
 367         if (end > PGDIR_SIZE)
 368                 end = PGDIR_SIZE;
 369         do {
 370                 zap_pte_range(pmd, address, end - address);
 371                 address = (address + PMD_SIZE) & PMD_MASK; 
 372                 pmd++;
 373         } while (address < end);
 374 }
 375 
 376 /*
 377  * remove user pages in a given range.
 378  */
 379 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 380 {
 381         pgd_t * dir;
 382         unsigned long end = address + size;
 383 
 384         dir = pgd_offset(mm, address);
 385         while (address < end) {
 386                 zap_pmd_range(dir, address, end - address);
 387                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 388                 dir++;
 389         }
 390         invalidate_range(mm, end - size, end);
 391         return 0;
 392 }
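/*
 * Illustrative sketch, not part of memory.c: an unmap path such as
 * do_munmap() (name assumed here) would round the range out to page
 * boundaries before tearing the mappings down; PAGE_ALIGN comes from
 * <asm/page.h> on kernels of this era.
 */
#if 0
static void unmap_region(struct mm_struct * mm, unsigned long addr, unsigned long len)
{
        unsigned long start = addr & PAGE_MASK;
        unsigned long end = PAGE_ALIGN(addr + len);

        zap_page_range(mm, start, end - start);
}
#endif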
 393 
 394 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 395 {
 396         unsigned long end;
 397 
 398         address &= ~PMD_MASK;
 399         end = address + size;
 400         if (end > PMD_SIZE)
 401                 end = PMD_SIZE;
 402         do {
 403                 pte_t oldpage = *pte;
 404                 set_pte(pte, zero_pte);
 405                 forget_pte(oldpage);
 406                 address += PAGE_SIZE;
 407                 pte++;
 408         } while (address < end);
 409 }
 410 
 411 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 412 {
 413         unsigned long end;
 414 
 415         address &= ~PGDIR_MASK;
 416         end = address + size;
 417         if (end > PGDIR_SIZE)
 418                 end = PGDIR_SIZE;
 419         do {
 420                 pte_t * pte = pte_alloc(pmd, address);
 421                 if (!pte)
 422                         return -ENOMEM;
 423                 zeromap_pte_range(pte, address, end - address, zero_pte);
 424                 address = (address + PMD_SIZE) & PMD_MASK;
 425                 pmd++;
 426         } while (address < end);
 427         return 0;
 428 }
 429 
 430 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 431 {
 432         int error = 0;
 433         pgd_t * dir;
 434         unsigned long beg = address;
 435         unsigned long end = address + size;
 436         pte_t zero_pte;
 437 
 438         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 439         dir = pgd_offset(current->mm, address);
 440         while (address < end) {
 441                 pmd_t *pmd = pmd_alloc(dir, address);
 442                 error = -ENOMEM;
 443                 if (!pmd)
 444                         break;
 445                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 446                 if (error)
 447                         break;
 448                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 449                 dir++;
 450         }
 451         invalidate_range(current->mm, beg, end);
 452         return error;
 453 }
 454 
 455 /*
 456  * Maps a range of physical memory into the requested pages. The old
 457  * mappings are removed. Any references to nonexistent pages result
 458  * in null mappings (currently treated as "copy-on-access").
 459  */
 460 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 461         unsigned long offset, pgprot_t prot)
 462 {
 463         unsigned long end;
 464 
 465         address &= ~PMD_MASK;
 466         end = address + size;
 467         if (end > PMD_SIZE)
 468                 end = PMD_SIZE;
 469         do {
 470                 pte_t oldpage = *pte;
 471                 pte_clear(pte);
 472                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 473                         set_pte(pte, mk_pte(offset, prot));
 474                 forget_pte(oldpage);
 475                 address += PAGE_SIZE;
 476                 offset += PAGE_SIZE;
 477                 pte++;
 478         } while (address < end);
 479 }
 480 
 481 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 482         unsigned long offset, pgprot_t prot)
 483 {
 484         unsigned long end;
 485 
 486         address &= ~PGDIR_MASK;
 487         end = address + size;
 488         if (end > PGDIR_SIZE)
 489                 end = PGDIR_SIZE;
 490         offset -= address;
 491         do {
 492                 pte_t * pte = pte_alloc(pmd, address);
 493                 if (!pte)
 494                         return -ENOMEM;
 495                 remap_pte_range(pte, address, end - address, address + offset, prot);
 496                 address = (address + PMD_SIZE) & PMD_MASK;
 497                 pmd++;
 498         } while (address < end);
 499         return 0;
 500 }
 501 
 502 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 503 {
 504         int error = 0;
 505         pgd_t * dir;
 506         unsigned long beg = from;
 507         unsigned long end = from + size;
 508 
 509         offset -= from;
 510         dir = pgd_offset(current->mm, from);
 511         while (from < end) {
 512                 pmd_t *pmd = pmd_alloc(dir, from);
 513                 error = -ENOMEM;
 514                 if (!pmd)
 515                         break;
 516                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 517                 if (error)
 518                         break;
 519                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 520                 dir++;
 521         }
 522         invalidate_range(current->mm, beg, from);
 523         return error;
 524 }
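/*
 * Illustrative sketch, not part of memory.c: the classic user of
 * remap_page_range() is a character driver's mmap() operation, which maps
 * a physical region straight into the caller's vma. The driver name,
 * FOO_PHYS_BASE and the -EAGAIN policy below are assumptions made for the
 * example only.
 */
#if 0
#define FOO_PHYS_BASE 0x000a0000        /* hypothetical device address */

static int foo_mmap(struct inode * inode, struct file * file,
        struct vm_area_struct * vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        if (remap_page_range(vma->vm_start, FOO_PHYS_BASE + vma->vm_offset,
                             size, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}
#endif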
 525 
 526 /*
 527  * sanity-check function..
 528  */
 529 static void put_page(pte_t * page_table, pte_t pte)
 530 {
 531         if (!pte_none(*page_table)) {
 532                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 533                 free_page(pte_page(pte));
 534                 return;
 535         }
 536 /* no need for invalidate */
 537         set_pte(page_table, pte);
 538 }
 539 
 540 /*
 541  * This routine is used to map in a page into an address space: needed by
 542  * execve() for the initial stack and environment pages.
 543  */
 544 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 545 {
 546         pgd_t * pgd;
 547         pmd_t * pmd;
 548         pte_t * pte;
 549 
 550         if (page >= high_memory)
 551                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 552         if (mem_map[MAP_NR(page)].count != 1)
 553                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 554         pgd = pgd_offset(tsk->mm,address);
 555         pmd = pmd_alloc(pgd, address);
 556         if (!pmd) {
 557                 free_page(page);
 558                 oom(tsk);
 559                 return 0;
 560         }
 561         pte = pte_alloc(pmd, address);
 562         if (!pte) {
 563                 free_page(page);
 564                 oom(tsk);
 565                 return 0;
 566         }
 567         if (!pte_none(*pte)) {
 568                 printk("put_dirty_page: page already exists\n");
 569                 free_page(page);
 570                 return 0;
 571         }
 572         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 573 /* no need for invalidate */
 574         return page;
 575 }
 576 
 577 /*
 578  * This routine handles present pages, when users try to write
 579  * to a shared page. It is done by copying the page to a new address
 580  * and decrementing the shared-page counter for the old page.
 581  *
 582  * Goto-purists beware: the only reason for goto's here is that it results
 583  * in better assembly code.. The "default" path will see no jumps at all.
 584  *
 585  * Note that this routine assumes that the protection checks have been
 586  * done by the caller (the low-level page fault routine in most cases).
 587  * Thus we can safely just mark it writable once we've done any necessary
 588  * COW.
 589  *
 590  * We also mark the page dirty at this point even though the page will
 591  * change only once the write actually happens. This avoids a few races,
 592  * and potentially makes it more efficient.
 593  */
 594 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 595         unsigned long address, int write_access)
 596 {
 597         pgd_t *page_dir;
 598         pmd_t *page_middle;
 599         pte_t *page_table, pte;
 600         unsigned long old_page, new_page;
 601 
 602         new_page = __get_free_page(GFP_KERNEL);
 603         page_dir = pgd_offset(vma->vm_mm, address);
 604         if (pgd_none(*page_dir))
 605                 goto end_wp_page;
 606         if (pgd_bad(*page_dir))
 607                 goto bad_wp_pagedir;
 608         page_middle = pmd_offset(page_dir, address);
 609         if (pmd_none(*page_middle))
 610                 goto end_wp_page;
 611         if (pmd_bad(*page_middle))
 612                 goto bad_wp_pagemiddle;
 613         page_table = pte_offset(page_middle, address);
 614         pte = *page_table;
 615         if (!pte_present(pte))
 616                 goto end_wp_page;
 617         if (pte_write(pte))
 618                 goto end_wp_page;
 619         old_page = pte_page(pte);
 620         if (old_page >= high_memory)
 621                 goto bad_wp_page;
 622         tsk->min_flt++;
 623         /*
 624          * Do we need to copy?
 625          */
 626         if (mem_map[MAP_NR(old_page)].count != 1) {
 627                 if (new_page) {
 628                         if (mem_map[MAP_NR(old_page)].reserved)
 629                                 ++vma->vm_mm->rss;
 630                         copy_page(old_page,new_page);
 631                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 632                         free_page(old_page);
 633                         invalidate_page(vma, address);
 634                         return;
 635                 }
 636                 set_pte(page_table, BAD_PAGE);
 637                 free_page(old_page);
 638                 oom(tsk);
 639                 invalidate_page(vma, address);
 640                 return;
 641         }
 642         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 643         invalidate_page(vma, address);
 644         if (new_page)
 645                 free_page(new_page);
 646         return;
 647 bad_wp_page:
 648         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 649         send_sig(SIGKILL, tsk, 1);
 650         goto end_wp_page;
 651 bad_wp_pagemiddle:
 652         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 653         send_sig(SIGKILL, tsk, 1);
 654         goto end_wp_page;
 655 bad_wp_pagedir:
 656         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 657         send_sig(SIGKILL, tsk, 1);
 658 end_wp_page:
 659         if (new_page)
 660                 free_page(new_page);
 661         return;
 662 }
 663 
 664 /*
 665  * Ugly, ugly, but the goto's result in better assembly..
 666  */
 667 int verify_area(int type, const void * addr, unsigned long size)
 668 {
 669         struct vm_area_struct * vma;
 670         unsigned long start = (unsigned long) addr;
 671 
 672         /* If the current user space is mapped to kernel space (for the
 673          * case where we use a fake user buffer with get_fs/set_fs()) we
 674          * don't expect to find the address in the user vm map.
 675          */
 676         if (!size || get_fs() == get_ds())
 677                 return 0;
 678 
 679         vma = find_vma(current, start);
 680         if (!vma)
 681                 goto bad_area;
 682         if (vma->vm_start <= start)
 683                 goto good_area;
 684         if (!(vma->vm_flags & VM_GROWSDOWN))
 685                 goto bad_area;
 686         if (expand_stack(vma, start))
 687                 goto bad_area;
 688 
 689 good_area:
 690         if (type == VERIFY_WRITE)
 691                 goto check_write;
 692         for (;;) {
 693                 struct vm_area_struct * next;
 694                 if (!(vma->vm_flags & VM_READ))
 695                         goto bad_area;
 696                 if (vma->vm_end - start >= size)
 697                         return 0;
 698                 next = vma->vm_next;
 699                 if (!next || vma->vm_end != next->vm_start)
 700                         goto bad_area;
 701                 vma = next;
 702         }
 703 
 704 check_write:
 705         if (!(vma->vm_flags & VM_WRITE))
 706                 goto bad_area;
 707         if (!wp_works_ok)
 708                 goto check_wp_fault_by_hand;
 709         for (;;) {
 710                 if (vma->vm_end - start >= size)
 711                         break;
 712                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 713                         goto bad_area;
 714                 vma = vma->vm_next;
 715                 if (!(vma->vm_flags & VM_WRITE))
 716                         goto bad_area;
 717         }
 718         return 0;
 719 
 720 check_wp_fault_by_hand:
 721         size--;
 722         size += start & ~PAGE_MASK;
 723         size >>= PAGE_SHIFT;
 724         start &= PAGE_MASK;
 725 
 726         for (;;) {
 727                 do_wp_page(current, vma, start, 1);
 728                 if (!size)
 729                         break;
 730                 size--;
 731                 start += PAGE_SIZE;
 732                 if (start < vma->vm_end)
 733                         continue;
 734                 vma = vma->vm_next;
 735                 if (!vma || vma->vm_start != start)
 736                         goto bad_area;
 737                 if (!(vma->vm_flags & VM_WRITE))
  738                 goto bad_area;
 739         }
 740         return 0;
 741 
 742 bad_area:
 743         return -EFAULT;
 744 }
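/*
 * Illustrative sketch, not part of memory.c: system calls and drivers of
 * this vintage check user buffers with verify_area() before touching them
 * through the <asm/segment.h> helpers. The function below is made up; only
 * verify_area() and memcpy_tofs() are real.
 */
#if 0
static int copy_status_to_user(char * ubuf, const char * kbuf, unsigned long len)
{
        int error = verify_area(VERIFY_WRITE, ubuf, len);
        if (error)
                return error;           /* -EFAULT on a bad user range */
        memcpy_tofs(ubuf, kbuf, len);   /* copy kernel -> user space */
        return 0;
}
#endif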
 745 
 746 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 747 {
 748         unsigned long tmp;
 749 
 750         if (!(tmp = get_free_page(GFP_KERNEL))) {
 751                 oom(tsk);
 752                 put_page(page_table, BAD_PAGE);
 753                 return;
 754         }
 755         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 756 }
 757 
 758 /*
 759  * This function zeroes out partial mmap'ed pages at truncation time..
 760  */
 761 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
 762 {
 763         pgd_t *page_dir;
 764         pmd_t *page_middle;
 765         pte_t *page_table, pte;
 766 
 767         page_dir = pgd_offset(vma->vm_mm, address);
 768         if (pgd_none(*page_dir))
 769                 return;
 770         if (pgd_bad(*page_dir)) {
 771                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 772                 pgd_clear(page_dir);
 773                 return;
 774         }
 775         page_middle = pmd_offset(page_dir, address);
 776         if (pmd_none(*page_middle))
 777                 return;
 778         if (pmd_bad(*page_middle)) {
  779                 printk("bad page middle entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 780                 pmd_clear(page_middle);
 781                 return;
 782         }
 783         page_table = pte_offset(page_middle, address);
 784         pte = *page_table;
 785         if (!pte_present(pte))
 786                 return;
 787         address &= ~PAGE_MASK;
 788         address += pte_page(pte);
 789         if (address >= high_memory)
 790                 return;
 791         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 792 }
 793 
 794 /*
 795  * Handle all mappings that got truncated by a "truncate()"
 796  * system call.
 797  *
 798  * NOTE! We have to be ready to update the memory sharing
 799  * between the file and the memory map for a potential last
 800  * incomplete page.  Ugly, but necessary.
 801  */
 802 void vmtruncate(struct inode * inode, unsigned long offset)
 803 {
 804         struct vm_area_struct * mpnt;
 805 
 806         truncate_inode_pages(inode, offset);
 807         if (!inode->i_mmap)
 808                 return;
 809         mpnt = inode->i_mmap;
 810         do {
 811                 unsigned long start = mpnt->vm_start;
 812                 unsigned long len = mpnt->vm_end - start;
 813                 unsigned long diff;
 814 
 815                 /* mapping wholly truncated? */
 816                 if (mpnt->vm_offset >= offset) {
 817                         zap_page_range(mpnt->vm_mm, start, len);
 818                         continue;
 819                 }
 820                 /* mapping wholly unaffected? */
 821                 diff = offset - mpnt->vm_offset;
 822                 if (diff >= len)
 823                         continue;
 824                 /* Ok, partially affected.. */
 825                 start += diff;
 826                 len = (len - diff) & PAGE_MASK;
 827                 if (start & ~PAGE_MASK) {
 828                         partial_clear(mpnt, start);
 829                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 830                 }
 831                 zap_page_range(mpnt->vm_mm, start, len);
 832         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 833 }
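/*
 * Illustrative sketch, not part of memory.c: a filesystem shrinking an
 * inode (from its truncate/notify_change path; the helper name is made up)
 * would call vmtruncate() so that mappings beyond the new size are zapped
 * before the blocks themselves are freed on disk.
 */
#if 0
static void shrink_inode(struct inode * inode, unsigned long new_size)
{
        inode->i_size = new_size;
        vmtruncate(inode, new_size);
        /* filesystem-specific block freeing would follow here */
}
#endif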
 834 
 835 /*
 836  * fill in an empty page-table if none exists.
 837  */
 838 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 839 {
 840         pgd_t *pgd;
 841         pmd_t *pmd;
 842         pte_t *pte;
 843 
 844         pgd = pgd_offset(tsk->mm, address);
 845         pmd = pmd_alloc(pgd, address);
 846         if (!pmd) {
 847                 oom(tsk);
 848                 return NULL;
 849         }
 850         pte = pte_alloc(pmd, address);
 851         if (!pte) {
 852                 oom(tsk);
 853                 return NULL;
 854         }
 855         return pte;
 856 }
 857 
 858 static inline void do_swap_page(struct task_struct * tsk, 
 859         struct vm_area_struct * vma, unsigned long address,
 860         pte_t * page_table, pte_t entry, int write_access)
 861 {
 862         pte_t page;
 863 
 864         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 865                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 866                 return;
 867         }
 868         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 869         if (pte_val(*page_table) != pte_val(entry)) {
 870                 free_page(pte_page(page));
 871                 return;
 872         }
 873         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 874                 page = pte_wrprotect(page);
 875         ++vma->vm_mm->rss;
 876         ++tsk->maj_flt;
 877         set_pte(page_table, page);
 878         return;
 879 }
 880 
 881 /*
 882  * do_no_page() tries to create a new page mapping. It aggressively
 883  * tries to share with existing pages, but makes a separate copy if
 884  * the "write_access" parameter is true in order to avoid the next
 885  * page fault.
 886  */
 887 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 888         unsigned long address, int write_access)
 889 {
 890         pte_t * page_table;
 891         pte_t entry;
 892         unsigned long page;
 893 
 894         page_table = get_empty_pgtable(tsk, address);
 895         if (!page_table)
 896                 return;
 897         entry = *page_table;
 898         if (pte_present(entry))
 899                 return;
 900         if (!pte_none(entry)) {
 901                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 902                 return;
 903         }
 904         address &= PAGE_MASK;
 905         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 906                 ++vma->vm_mm->rss;
 907                 ++tsk->min_flt;
 908                 get_empty_page(tsk, vma, page_table);
 909                 return;
 910         }
 911         ++tsk->maj_flt;
 912         ++vma->vm_mm->rss;
 913         /*
 914          * The third argument is "no_share", which tells the low-level code
 915          * to copy, not share the page even if sharing is possible.  It's
  916          * essentially an early COW detection.
 917          */
 918         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
 919         if (!page) {
 920                 send_sig(SIGBUS, current, 1);
 921                 put_page(page_table, BAD_PAGE);
 922                 return;
 923         }
 924         /*
 925          * This silly early PAGE_DIRTY setting removes a race
 926          * due to the bad i386 page protection. But it's valid
 927          * for other architectures too.
 928          *
 929          * Note that if write_access is true, we either now have
  930  * an exclusive copy of the page, or this is a shared mapping,
 931          * so we can make it writable and dirty to avoid having to
 932          * handle that later.
 933          */
 934         entry = mk_pte(page, vma->vm_page_prot);
 935         if (write_access) {
 936                 entry = pte_mkwrite(pte_mkdirty(entry));
 937         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 938                 entry = pte_wrprotect(entry);
 939         put_page(page_table, entry);
 940 }
 941 
 942 /*
 943  * The above separate functions for the no-page and wp-page
 944  * cases will go away (they mostly do the same thing anyway),
 945  * and we'll instead use only a general "handle_mm_fault()".
 946  *
 947  * These routines also need to handle stuff like marking pages dirty
 948  * and/or accessed for architectures that don't do it in hardware (most
 949  * RISC architectures).  The early dirtying is also good on the i386.
 950  *
 951  * There is also a hook called "update_mmu_cache()" that architectures
 952  * with external mmu caches can use to update those (ie the Sparc or
 953  * PowerPC hashed page tables that act as extended TLBs).
 954  */
 955 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
 956         int write_access, pte_t * pte)
 957 {
 958         if (!pte_present(*pte)) {
 959                 do_no_page(current, vma, address, write_access);
 960                 return;
 961         }
 962         set_pte(pte, pte_mkyoung(*pte));
 963         if (!write_access)
 964                 return;
 965         if (pte_write(*pte)) {
 966                 set_pte(pte, pte_mkdirty(*pte));
 967                 return;
 968         }
 969         do_wp_page(current, vma, address, write_access);
 970 }
 971 
 972 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
 973         int write_access)
 974 {
 975         pgd_t *pgd;
 976         pmd_t *pmd;
 977         pte_t *pte;
 978 
 979         pgd = pgd_offset(vma->vm_mm, address);
 980         pmd = pmd_alloc(pgd, address);
 981         if (!pmd)
 982                 goto no_memory;
 983         pte = pte_alloc(pmd, address);
 984         if (!pte)
 985                 goto no_memory;
 986         handle_pte_fault(vma, address, write_access, pte);
 987         update_mmu_cache(vma, address, *pte);
 988         return;
 989 no_memory:
 990         oom(current);
 991 }
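/*
 * Illustrative sketch, not part of memory.c: an architecture's page-fault
 * handler (e.g. arch/i386/mm/fault.c) finds the vma, checks access rights,
 * and then hands the generic work to handle_mm_fault(). The helper below
 * and its bare-bones error path are assumptions; real handlers also expand
 * the stack and deliver signals.
 */
#if 0
static void fault_sketch(unsigned long address, int write_access)
{
        struct vm_area_struct * vma = find_vma(current, address);

        if (!vma || vma->vm_start > address)
                return;         /* real code would signal the process */
        handle_mm_fault(vma, address, write_access);
}
#endif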
