root/mm/memory.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 #include <linux/swap.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 
  52 unsigned long high_memory = 0;
     /*
      * high_memory: the page-table routines in this file test page addresses
      * against this bound, usually before indexing mem_map[] via MAP_NR().
      */
  53 
  54 /*
  55  * The free_area_list arrays point to the queue heads of the free areas
  56  * of different sizes
      *
      * NOTE(review): nr_swap_pages and nr_free_pages look like running
      * counts of free swap pages and free memory pages; nothing in the part
      * of the file visible here updates them - confirm against their users.
  57  */
  58 int nr_swap_pages = 0;
  59 int nr_free_pages = 0;
  60 struct mem_list free_area_list[NR_MEM_LISTS];
  61 unsigned int * free_area_map[NR_MEM_LISTS];
  62 
  63 /*
  64  * We special-case the C-O-W ZERO_PAGE, because it's such
  65  * a common occurrence (no need to read the page to know
  66  * that it's zero - better for the cache and memory subsystem).
      *
      * copy_page() fills the PAGE_SIZE bytes at 'to' with the contents of
      * the page at 'from'.  Both arguments are addresses that are cast
      * straight to pointers, so they must be directly dereferenceable.
  67  */
  68 static inline void copy_page(unsigned long from, unsigned long to)
     /* [previous][next][first][last][top][bottom][index][help] */
  69 {
  70         if (from == ZERO_PAGE) {
  71                 memset((void *) to, 0, PAGE_SIZE);      /* clear instead of copying */
  72                 return;
  73         }
  74         memcpy((void *) to, (void *) from, PAGE_SIZE);
  75 }
  76 
  77 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
     /*
      * USER_PTRS_PER_PGD (above) is the number of page-directory entries
      * needed to span TASK_SIZE; the routines below use it as the boundary
      * between the user part and the rest of a page directory.
      */
  78 
     /*
      * Per-page bookkeeping array, indexed with MAP_NR(address).  This file
      * reads its 'reserved' member and reads/increments its 'count' member.
      * It starts out NULL and is not assigned in the part of the file shown
      * here.
      */
  79 mem_map_t * mem_map = NULL;
  80 
  81 /*
  82  * oom() prints a message (so that the user knows why the process died),
  83  * and gives the process an untrappable SIGKILL.
      *
      * The message names 'task' - the process that is actually killed -
      * which need not be 'current'.  Before the signal is sent, the SIGKILL
      * handler slot is reset to NULL and SIGKILL is removed from the
      * task's blocked mask.
  84  */
  85 void oom(struct task_struct * task)
     /* [previous][next][first][last][top][bottom][index][help] */
  86 {
  87         printk("\nOut of memory for %s.\n", task->comm);
  88         task->sig->action[SIGKILL-1].sa_handler = NULL;
  89         task->blocked &= ~(1<<(SIGKILL-1));
  90         send_sig(SIGKILL,task,1);
  91 }
  92 
  93 /*
  94  * Note: this doesn't free the actual pages themselves. That
  95  * has been handled earlier when unmapping all the memory regions.
      *
      * free_one_pmd() empties the page-middle entry *dir and hands the pte
      * table it referred to to pte_free().  An empty entry is left alone; a
      * bad entry is reported and cleared, and nothing is freed for it.
  96  */
  97 static inline void free_one_pmd(pmd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
  98 {
  99         pte_t * pte;
 100 
 101         if (pmd_none(*dir))
 102                 return;
 103         if (pmd_bad(*dir)) {
 104                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 105                 pmd_clear(dir);
 106                 return;
 107         }
 108         pte = pte_offset(dir, 0);       /* look the table up before the entry is wiped */
 109         pmd_clear(dir);
 110         pte_free(pte);
 111 }
 112 
 113 static inline void free_one_pgd(pgd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Empty the page-directory entry *dir and release the pmd table it
      * referred to.  An empty entry is left alone; a bad entry is reported
      * and cleared, and nothing is freed for it.  Unless pmd_inuse() says
      * the pmd table is still in use, each of its PTRS_PER_PMD entries is
      * released with free_one_pmd() first.  pmd_free() is called on the
      * table in either case.
      */
 114 {
 115         pmd_t * pmd;
 116 
 117         if (pgd_none(*dir))
 118                 return;
 119         if (pgd_bad(*dir)) {
 120                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 121                 pgd_clear(dir);
 122                 return;
 123         }
 124         pmd = pmd_offset(dir, 0);       /* look the table up before the entry is wiped */
 125         pgd_clear(dir);
 126         if (!pmd_inuse(pmd)) {
 127                 int j;
 128                 for (j = 0; j < PTRS_PER_PMD ; j++)
 129                         free_one_pmd(pmd+j);
 130         }
 131         pmd_free(pmd);
 132 }
 133         
 134 /*
 135  * This function clears all user-level page tables of a process - this
 136  * is needed by execve(), so that old pages aren't in the way.
      *
      * Only the first USER_PTRS_PER_PGD directory entries are released; the
      * page directory itself stays in place and invalidate_mm() is called
      * once the entries are gone.  If the task has no page directory, or is
      * running on swapper_pg_dir, a message is printed and nothing is done.
 137  */
 138 void clear_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 139 {
 140         int i;
 141         pgd_t * page_dir;
 142 
 143         page_dir = tsk->mm->pgd;
 144         if (!page_dir || page_dir == swapper_pg_dir) {
 145                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 146                 return;
 147         }
 148         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 149                 free_one_pgd(page_dir + i);
 150         invalidate_mm(tsk->mm);
 151 }
 152 
 153 /*
 154  * This function frees up all page tables of a process when it exits. It
 155  * is the same as "clear_page_tables()", except it also changes the process'
 156  * page table directory to the kernel page tables and then frees the old
 157  * page table directory.
      *
      * The task is switched to swapper_pg_dir (both via SET_PAGE_DIR() and in
      * tsk->mm->pgd) before the old directory is torn down.  If the task
      * has no page directory, or already uses swapper_pg_dir, a message is
      * printed and nothing is done.
      *
      * NOTE(review): unlike clear_page_tables(), the loop below covers all
      * PTRS_PER_PGD entries rather than only the first USER_PTRS_PER_PGD -
      * confirm that passing the remaining entries to free_one_pgd() is
      * intended.
 158  */
 159 void free_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 160 {
 161         int i;
 162         pgd_t * page_dir;
 163 
 164         page_dir = tsk->mm->pgd;
 165         if (!page_dir || page_dir == swapper_pg_dir) {
 166                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 167                 return;
 168         }
 169         invalidate_mm(tsk->mm);
 170         SET_PAGE_DIR(tsk, swapper_pg_dir);
 171         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 172         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 173                 free_one_pgd(page_dir + i);
 174         pgd_free(page_dir);
 175 }
 176 
 177 int new_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Give 'tsk' a freshly allocated page directory.
      *
      * Entries USER_PTRS_PER_PGD .. PTRS_PER_PGD-1 are copied from init_mm's
      * directory; the lower entries are left exactly as pgd_alloc() returned
      * them (presumably empty - confirm).  The new directory is installed
      * with SET_PAGE_DIR() and recorded in tsk->mm->pgd.  The directory that
      * tsk->mm->pgd pointed to before is not freed here.
      *
      * Returns 0 on success, -ENOMEM if pgd_alloc() fails.
      */
 178 {
 179         pgd_t * page_dir, * new_pg;
 180         int i;
 181 
 182         if (!(new_pg = pgd_alloc()))
 183                 return -ENOMEM;
 184         page_dir = pgd_offset(&init_mm, 0);
 185         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 186                 new_pg[i] = page_dir[i];
 187         invalidate_mm(tsk->mm);
 188         SET_PAGE_DIR(tsk, new_pg);
 189         tsk->mm->pgd = new_pg;
 190         return 0;
 191 }
 192 
 193 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Duplicate the page-table entry *old_pte into *new_pte.
      *
      *  - empty entry: nothing is done;
      *  - non-empty but not present: swap_duplicate() is called on the entry
      *    value and the entry is copied unchanged;
      *  - page at or above high_memory, or marked reserved in mem_map: the
      *    entry is copied unchanged and no use count is touched;
      *  - otherwise: the entry is write-protected when 'cow' is set, marked
      *    dirty when delete_from_swap_cache() returns non-zero, stored in
      *    the new table with pte_mkold() applied and written back to the
      *    old table, and the page's mem_map count is incremented.
      *
      * The bound test uses '>=' like every other high_memory check in this
      * file, so an address equal to high_memory is never used to index
      * mem_map[] here either.
      */
 194 {
 195         pte_t pte = *old_pte;
 196 
 197         if (pte_none(pte))
 198                 return;
 199         if (!pte_present(pte)) {
 200                 swap_duplicate(pte_val(pte));
 201                 set_pte(new_pte, pte);
 202                 return;
 203         }
 204         if (pte_page(pte) >= high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 205                 set_pte(new_pte, pte);
 206                 return;
 207         }
 208         if (cow)
 209                 pte = pte_wrprotect(pte);
 210         if (delete_from_swap_cache(pte_page(pte)))
 211                 pte = pte_mkdirty(pte);
 212         set_pte(new_pte, pte_mkold(pte));
 213         set_pte(old_pte, pte);          /* the source entry changes too */
 214         mem_map[MAP_NR(pte_page(pte))].count++;
 215 }
 216 
 217 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Copy the page-table entries for [address, address + size) from the
      * table under *src_pmd to the table under *dst_pmd, stopping at the
      * end of this pmd entry's span at the latest.
      *
      * An empty source pmd is skipped; a bad one is reported and cleared.
      * The destination pte table is allocated when *dst_pmd is empty.
      *
      * Returns 0, or -ENOMEM if that allocation fails.
      */
 218 {
 219         pte_t * src_pte, * dst_pte;
 220         unsigned long end;
 221 
 222         if (pmd_none(*src_pmd))
 223                 return 0;
 224         if (pmd_bad(*src_pmd)) {
 225                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 226                 pmd_clear(src_pmd);
 227                 return 0;
 228         }
 229         src_pte = pte_offset(src_pmd, address);
 230         if (pmd_none(*dst_pmd)) {
 231                 if (!pte_alloc(dst_pmd, 0))
 232                         return -ENOMEM;
 233         }
 234         dst_pte = pte_offset(dst_pmd, address);
 235         address &= ~PMD_MASK;           /* keep only the offset inside this pmd */
 236         end = address + size;
 237         if (end >= PMD_SIZE)
 238                 end = PMD_SIZE;
 239         do {
 240                 /* I would like to switch arguments here, to make it
 241                  * consistent with copy_xxx_range and memcpy syntax.
 242                  */
 243                 copy_one_pte(src_pte++, dst_pte++, cow);
 244                 address += PAGE_SIZE;
 245         } while (address < end);
 246         return 0;
 247 }
 248 
 249 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Copy the mappings for [address, address + size) that fall under one
      * page-directory entry, one pmd entry at a time via copy_pte_range().
      *
      * An empty source pgd entry is skipped; a bad one is reported and
      * cleared.  The destination pmd table is allocated when *dst_pgd is
      * empty.
      *
      * Returns 0, -ENOMEM if that allocation fails, or the first error
      * returned by copy_pte_range().
      */
 250 {
 251         pmd_t * src_pmd, * dst_pmd;
 252         unsigned long end;
 253         int error = 0;
 254 
 255         if (pgd_none(*src_pgd))
 256                 return 0;
 257         if (pgd_bad(*src_pgd)) {
 258                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 259                 pgd_clear(src_pgd);
 260                 return 0;
 261         }
 262         src_pmd = pmd_offset(src_pgd, address);
 263         if (pgd_none(*dst_pgd)) {
 264                 if (!pmd_alloc(dst_pgd, 0))
 265                         return -ENOMEM;
 266         }
 267         dst_pmd = pmd_offset(dst_pgd, address);
 268         address &= ~PGDIR_MASK;         /* keep only the offset inside this pgd entry */
 269         end = address + size;
 270         if (end > PGDIR_SIZE)
 271                 end = PGDIR_SIZE;
 272         do {
 273                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 274                 if (error)
 275                         break;
 276                 address = (address + PMD_SIZE) & PMD_MASK; 
 277         } while (address < end);
 278         return error;
 279 }
 280 
 281 /*
 282  * copy one vm_area from one task to the other. Assumes the page tables
 283  * already present in the new task to be cleared in the whole range
 284  * covered by this vma.
      *
      * 'cow' is set for areas that have VM_WRITE but not VM_SHARED.
      *
      * Returns 0, or the first error reported by copy_pmd_range().  Both
      * address spaces are invalidated over the whole vma even when the copy
      * stopped early with an error.
 285  */
 286 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
     /* [previous][next][first][last][top][bottom][index][help] */
 287                         struct vm_area_struct *vma)
 288 {
 289         pgd_t * src_pgd, * dst_pgd;
 290         unsigned long address = vma->vm_start;
 291         unsigned long end = vma->vm_end;
 292         int error = 0, cow;
 293 
 294         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 295         src_pgd = pgd_offset(src, address);
 296         dst_pgd = pgd_offset(dst, address);
 297         while (address < end) {
 298                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 299                 if (error)
 300                         break;
 301                 address = (address + PGDIR_SIZE) & PGDIR_MASK;  /* start of the next pgd entry */
 302         }
 303         /* Note that the src ptes get c-o-w treatment, so they change too. */
 304         invalidate_range(src, vma->vm_start, vma->vm_end);
 305         invalidate_range(dst, vma->vm_start, vma->vm_end);
 306         return error;
 307 }
 308 
 309 static inline void forget_pte(pte_t page)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Release whatever the page-table entry value 'page' referred to.
      *
      *  - empty entry: nothing to do;
      *  - present entry: the page is freed with free_page() unless its
      *    address is at or above high_memory or it is marked reserved, and
      *    current->mm->rss is decremented if it is still positive;
      *  - non-empty, not present: the entry value is passed to swap_free().
      *
      * NOTE(review): the rss that is adjusted is always current->mm's,
      * whichever address space the entry was taken from - confirm this is
      * what the callers that pass another mm to zap_page_range() expect.
      */
 310 {
 311         if (pte_none(page))
 312                 return;
 313         if (pte_present(page)) {
 314                 unsigned long addr = pte_page(page);
 315                 if (addr >= high_memory || mem_map[MAP_NR(addr)].reserved)
 316                         return;
 317                 free_page(addr);
 318                 if (current->mm->rss <= 0)
 319                         return;
 320                 current->mm->rss--;
 321                 return;
 322         }
 323         swap_free(pte_val(page));
 324 }
 325 
 326 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Clear the page-table entries for [address, address + size) under the
      * single pmd entry *pmd, stopping at the end of that entry's span at
      * the latest, and release what each entry referred to.
      *
      * An empty pmd is skipped; a bad one is reported and cleared.
      */
 327 {
 328         pte_t * pte;
 329         unsigned long end;
 330 
 331         if (pmd_none(*pmd))
 332                 return;
 333         if (pmd_bad(*pmd)) {
 334                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 335                 pmd_clear(pmd);
 336                 return;
 337         }
 338         pte = pte_offset(pmd, address);
 339         address &= ~PMD_MASK;           /* keep only the offset inside this pmd */
 340         end = address + size;
 341         if (end >= PMD_SIZE)
 342                 end = PMD_SIZE;
 343         do {
 344                 pte_t page = *pte;      /* remember the old value ... */
 345                 pte_clear(pte);         /* ... empty the slot ... */
 346                 forget_pte(page);       /* ... then release what it referred to */
 347                 address += PAGE_SIZE;
 348                 pte++;
 349         } while (address < end);
 350 }
 351 
 352 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Apply zap_pte_range() to every pmd entry that covers part of
      * [address, address + size) under the page-directory entry *dir,
      * stopping at the end of that entry's span at the latest.
      *
      * An empty pgd entry is skipped; a bad one is reported and cleared.
      */
 353 {
 354         pmd_t * pmd;
 355         unsigned long end;
 356 
 357         if (pgd_none(*dir))
 358                 return;
 359         if (pgd_bad(*dir)) {
 360                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 361                 pgd_clear(dir);
 362                 return;
 363         }
 364         pmd = pmd_offset(dir, address);
 365         address &= ~PGDIR_MASK;         /* keep only the offset inside this pgd entry */
 366         end = address + size;
 367         if (end > PGDIR_SIZE)
 368                 end = PGDIR_SIZE;
 369         do {
 370                 zap_pte_range(pmd, address, end - address);
 371                 address = (address + PMD_SIZE) & PMD_MASK; 
 372                 pmd++;
 373         } while (address < end);
 374 }
 375 
 376 /*
 377  * remove user pages in a given range.
      *
      * Walks the page directory of 'mm' over [address, address + size),
      * clearing the entries and releasing what they referred to, then
      * invalidates that range.  Always returns 0.
 378  */
 379 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 380 {
 381         pgd_t * dir;
 382         unsigned long end = address + size;
 383 
 384         dir = pgd_offset(mm, address);
 385         while (address < end) {
 386                 zap_pmd_range(dir, address, end - address);
 387                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 388                 dir++;
 389         }
 390         invalidate_range(mm, end - size, end);  /* 'address' has moved on; end - size is the original start */
 391         return 0;
 392 }
 393 
 394 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Store 'zero_pte' in consecutive page-table slots starting at 'pte',
      * one per page of [address, address + size), stopping at the end of
      * the current pmd entry's span at the latest.  Whatever each slot held
      * before is released with forget_pte().
      */
 395 {
 396         unsigned long end;
 397 
 398         address &= ~PMD_MASK;           /* keep only the offset inside this pmd */
 399         end = address + size;
 400         if (end > PMD_SIZE)
 401                 end = PMD_SIZE;
 402         do {
 403                 pte_t oldpage = *pte;
 404                 set_pte(pte, zero_pte);
 405                 forget_pte(oldpage);
 406                 address += PAGE_SIZE;
 407                 pte++;
 408         } while (address < end);
 409 }
 410 
 411 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * For every pmd entry covering part of [address, address + size)
      * (stopping at the end of the current pgd entry's span at the latest),
      * obtain the pte table with pte_alloc() and fill the relevant slots via
      * zeromap_pte_range().
      *
      * Returns 0, or -ENOMEM as soon as pte_alloc() fails; entries handled
      * before the failure stay as they were set.
      */
 412 {
 413         unsigned long end;
 414 
 415         address &= ~PGDIR_MASK;         /* keep only the offset inside this pgd entry */
 416         end = address + size;
 417         if (end > PGDIR_SIZE)
 418                 end = PGDIR_SIZE;
 419         do {
 420                 pte_t * pte = pte_alloc(pmd, address);
 421                 if (!pte)
 422                         return -ENOMEM;
 423                 zeromap_pte_range(pte, address, end - address, zero_pte);
 424                 address = (address + PMD_SIZE) & PMD_MASK;
 425                 pmd++;
 426         } while (address < end);
 427         return 0;
 428 }
 429 
 430 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Map every page of [address, address + size) in current->mm to
      * ZERO_PAGE, using protection 'prot' with write permission removed.
      * Existing mappings in the range are released.
      *
      * Returns 0, or -ENOMEM if a pmd or pte table cannot be allocated.
      * The range is invalidated in either case.
      */
 431 {
 432         int error = 0;
 433         pgd_t * dir;
 434         unsigned long end = address + size;
 435         pte_t zero_pte;
 436 
 437         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 438         dir = pgd_offset(current->mm, address);
 439         while (address < end) {
 440                 pmd_t *pmd = pmd_alloc(dir, address);
 441                 error = -ENOMEM;        /* value returned if the break below is taken */
 442                 if (!pmd)
 443                         break;
 444                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 445                 if (error)
 446                         break;
 447                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 448                 dir++;
 449         }
 450         invalidate_range(current->mm, end - size, end);  /* end - size is the original start */
 451         return error;
 452 }
 453 
 454 /*
 455  * maps a range of physical memory into the requested pages. the old
 456  * mappings are removed. any reference to a nonexistent page results
 457  * in a null mapping (currently treated as "copy-on-access")
      *
      * remap_pte_range() handles the slots of one pte table: each slot is
      * cleared, and a new entry for 'offset' is installed only when 'offset'
      * is at or above high_memory or the page is marked reserved in mem_map;
      * otherwise the slot stays empty.  The previous contents of the slot
      * are released with forget_pte() in every case.
 458  */
 459 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 460         unsigned long offset, pgprot_t prot)
 461 {
 462         unsigned long end;
 463 
 464         address &= ~PMD_MASK;           /* keep only the offset inside this pmd */
 465         end = address + size;
 466         if (end > PMD_SIZE)
 467                 end = PMD_SIZE;
 468         do {
 469                 pte_t oldpage = *pte;
 470                 pte_clear(pte);
 471                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 472                         set_pte(pte, mk_pte(offset, prot));
 473                 forget_pte(oldpage);
 474                 address += PAGE_SIZE;
 475                 offset += PAGE_SIZE;
 476                 pte++;
 477         } while (address < end);
 478 }
 479 
 480 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 481         unsigned long offset, pgprot_t prot)
     /*
      * For every pmd entry covering part of [address, address + size)
      * (stopping at the end of the current pgd entry's span at the latest),
      * obtain the pte table with pte_alloc() and remap its slots via
      * remap_pte_range().  'offset' is the target address that corresponds
      * to 'address' on entry.
      *
      * Returns 0, or -ENOMEM as soon as pte_alloc() fails.
      */
 482 {
 483         unsigned long end;
 484 
 485         address &= ~PGDIR_MASK;         /* keep only the offset inside this pgd entry */
 486         end = address + size;
 487         if (end > PGDIR_SIZE)
 488                 end = PGDIR_SIZE;
 489         offset -= address;              /* now a constant delta: address + offset is the target */
 490         do {
 491                 pte_t * pte = pte_alloc(pmd, address);
 492                 if (!pte)
 493                         return -ENOMEM;
 494                 remap_pte_range(pte, address, end - address, address + offset, prot);
 495                 address = (address + PMD_SIZE) & PMD_MASK;
 496                 pmd++;
 497         } while (address < end);
 498         return 0;
 499 }
 500 
 501 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Map 'size' bytes of the range starting at 'offset' into current->mm
      * at virtual address 'from', using protection 'prot'.  Existing
      * mappings in the range are released.
      *
      * Returns 0, or -ENOMEM if a pmd or pte table cannot be allocated.
      *
      * The range passed to invalidate_range() is derived from 'end': the
      * loop rounds 'from' up to page-directory boundaries, so by the time
      * it finishes 'from' no longer marks the end of the mapped range.
      */
 502 {
 503         int error = 0;
 504         pgd_t * dir;
 505         unsigned long end = from + size;
 506 
 507         offset -= from;                 /* now a constant delta: from + offset is the target */
 508         dir = pgd_offset(current->mm, from);
 509         while (from < end) {
 510                 pmd_t *pmd = pmd_alloc(dir, from);
 511                 error = -ENOMEM;        /* value returned if the break below is taken */
 512                 if (!pmd)
 513                         break;
 514                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 515                 if (error)
 516                         break;
 517                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 518                 dir++;
 519         }
 520         invalidate_range(current->mm, end - size, end);  /* end - size is the original start */
 521         return error;
 522 }
 523 
 524 /*
 525  * sanity-check function..
      *
      * Stores 'pte' in *page_table, but only when that slot is currently
      * empty.  If the slot is occupied, a message is printed, the page that
      * 'pte' refers to is freed and the slot is left untouched.
 526  */
 527 static void put_page(pte_t * page_table, pte_t pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 528 {
 529         if (!pte_none(*page_table)) {
 530                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 531                 free_page(pte_page(pte));
 532                 return;
 533         }
 534 /* no need for invalidate */
 535         set_pte(page_table, pte);
 536 }
 537 
 538 /*
 539  * This routine is used to map in a page into an address space: needed by
 540  * execve() for the initial stack and environment pages.
      *
      * The page is entered at 'address' in tsk->mm as a writable, dirty
      * PAGE_COPY mapping; missing pmd/pte tables are allocated on the way.
      *
      * Returns 'page' on success.  Returns 0 - after freeing 'page' - when a
      * table cannot be allocated (oom(tsk) is called as well) or when the
      * slot at 'address' is already occupied.
      *
      * The two checks at the top only print a warning; execution continues
      * after them.
 541  */
 542 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 543 {
 544         pgd_t * pgd;
 545         pmd_t * pmd;
 546         pte_t * pte;
 547 
 548         if (page >= high_memory)
 549                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 550         if (mem_map[MAP_NR(page)].count != 1)
 551                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 552         pgd = pgd_offset(tsk->mm,address);
 553         pmd = pmd_alloc(pgd, address);
 554         if (!pmd) {
 555                 free_page(page);
 556                 oom(tsk);
 557                 return 0;
 558         }
 559         pte = pte_alloc(pmd, address);
 560         if (!pte) {
 561                 free_page(page);
 562                 oom(tsk);
 563                 return 0;
 564         }
 565         if (!pte_none(*pte)) {
 566                 printk("put_dirty_page: page already exists\n");
 567                 free_page(page);
 568                 return 0;
 569         }
 570         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 571 /* no need for invalidate */
 572         return page;
 573 }
 574 
 575 /*
 576  * This routine handles present pages, when users try to write
 577  * to a shared page. It is done by copying the page to a new address
 578  * and decrementing the shared-page counter for the old page.
 579  *
 580  * Goto-purists beware: the only reason for goto's here is that it results
 581  * in better assembly code.. The "default" path will see no jumps at all.
 582  *
 583  * Note that this routine assumes that the protection checks have been
 584  * done by the caller (the low-level page fault routine in most cases).
 585  * Thus we can safely just mark it writable once we've done any necessary
 586  * COW.
 587  *
 588  * We also mark the page dirty at this point even though the page will
 589  * change only once the write actually happens. This avoids a few races,
 590  * and potentially makes it more efficient.
      *
      * Summary of the outcomes:
      *  - no table, entry not present, or entry already writable: nothing
      *    is changed;
      *  - bad pgd/pmd entry, or page at or above high_memory: a message is
      *    printed and tsk is sent SIGKILL;
      *  - page use count is 1: the existing entry is made writable and
      *    dirty in place;
      *  - otherwise: the page is copied into a new page which is mapped
      *    writable and dirty, and one reference to the old page is dropped;
      *    if no new page could be allocated, BAD_PAGE is mapped instead and
      *    oom(tsk) is called.
      *
      * The 'write_access' argument is not used in the body.
 591  */
 592 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 593         unsigned long address, int write_access)
 594 {
 595         pgd_t *page_dir;
 596         pmd_t *page_middle;
 597         pte_t *page_table, pte;
 598         unsigned long old_page, new_page;
 599 
     /*
      * The spare page is allocated before the page tables are examined; it
      * is released again on every path that does not end up using it.
      */
 600         new_page = __get_free_page(GFP_KERNEL);
 601         page_dir = pgd_offset(vma->vm_mm, address);
 602         if (pgd_none(*page_dir))
 603                 goto end_wp_page;
 604         if (pgd_bad(*page_dir))
 605                 goto bad_wp_pagedir;
 606         page_middle = pmd_offset(page_dir, address);
 607         if (pmd_none(*page_middle))
 608                 goto end_wp_page;
 609         if (pmd_bad(*page_middle))
 610                 goto bad_wp_pagemiddle;
 611         page_table = pte_offset(page_middle, address);
 612         pte = *page_table;
 613         if (!pte_present(pte))
 614                 goto end_wp_page;
 615         if (pte_write(pte))
 616                 goto end_wp_page;
 617         old_page = pte_page(pte);
 618         if (old_page >= high_memory)
 619                 goto bad_wp_page;
 620         tsk->min_flt++;
 621         /*
 622          * Do we need to copy?
 623          */
 624         if (mem_map[MAP_NR(old_page)].count != 1) {
 625                 if (new_page) {
     /* a reserved page is not counted in rss (cf. forget_pte()); its private copy is */
 626                         if (mem_map[MAP_NR(old_page)].reserved)
 627                                 ++vma->vm_mm->rss;
 628                         copy_page(old_page,new_page);
 629                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 630                         free_page(old_page);
 631                         invalidate_page(vma, address);
 632                         return;
 633                 }
     /* no spare page was available: map BAD_PAGE and kill the task */
 634                 set_pte(page_table, BAD_PAGE);
 635                 free_page(old_page);
 636                 oom(tsk);
 637                 invalidate_page(vma, address);
 638                 return;
 639         }
     /* single user: no copy needed, just make the existing entry writable and dirty */
 640         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 641         invalidate_page(vma, address);
 642         if (new_page)
 643                 free_page(new_page);
 644         return;
 645 bad_wp_page:
 646         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 647         send_sig(SIGKILL, tsk, 1);
 648         goto end_wp_page;
 649 bad_wp_pagemiddle:
 650         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 651         send_sig(SIGKILL, tsk, 1);
 652         goto end_wp_page;
 653 bad_wp_pagedir:
 654         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 655         send_sig(SIGKILL, tsk, 1);
 656 end_wp_page:
 657         if (new_page)
 658                 free_page(new_page);
 659         return;
 660 }
 661 
 662 /*
 663  * Ugly, ugly, but the goto's result in better assembly..
      *
      * verify_area() checks that the 'size' bytes starting at 'addr' lie in
      * contiguous memory areas of the current process that permit the
      * requested kind of access.
      *
      * Returns 0 when the range is acceptable and -EFAULT otherwise.
      *
      *  - A zero size, or get_fs() == get_ds(), is accepted immediately.
      *  - type == VERIFY_WRITE requires VM_WRITE on every area in the range;
      *    any other type requires VM_READ on every area.
      *  - When wp_works_ok is zero, do_wp_page() is additionally called by
      *    hand for every page of a VERIFY_WRITE range (presumably because
      *    write protection cannot be relied on to fault in that case -
      *    confirm).
 664  */
 665 int verify_area(int type, const void * addr, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 666 {
 667         struct vm_area_struct * vma;
 668         unsigned long start = (unsigned long) addr;
 669 
 670         /* If the current user space is mapped to kernel space (for the
 671          * case where we use a fake user buffer with get_fs/set_fs()) we
 672          * don't expect to find the address in the user vm map.
 673          */
 674         if (!size || get_fs() == get_ds())
 675                 return 0;
 676 
     /*
      * The area returned by find_vma() may begin above 'start'; that is only
      * acceptable for a VM_GROWSDOWN area that expand_stack() can extend.
      */
 677         vma = find_vma(current, start);
 678         if (!vma)
 679                 goto bad_area;
 680         if (vma->vm_start <= start)
 681                 goto good_area;
 682         if (!(vma->vm_flags & VM_GROWSDOWN))
 683                 goto bad_area;
 684         if (expand_stack(vma, start))
 685                 goto bad_area;
 686 
 687 good_area:
 688         if (type == VERIFY_WRITE)
 689                 goto check_write;
     /* read check: follow directly adjacent areas until the range is covered */
 690         for (;;) {
 691                 struct vm_area_struct * next;
 692                 if (!(vma->vm_flags & VM_READ))
 693                         goto bad_area;
 694                 if (vma->vm_end - start >= size)
 695                         return 0;
 696                 next = vma->vm_next;
 697                 if (!next || vma->vm_end != next->vm_start)
 698                         goto bad_area;
 699                 vma = next;
 700         }
 701 
 702 check_write:
 703         if (!(vma->vm_flags & VM_WRITE))
 704                 goto bad_area;
 705         if (!wp_works_ok)
 706                 goto check_wp_fault_by_hand;
 707         for (;;) {
 708                 if (vma->vm_end - start >= size)
 709                         break;
 710                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 711                         goto bad_area;
 712                 vma = vma->vm_next;
 713                 if (!(vma->vm_flags & VM_WRITE))
 714                         goto bad_area;
 715         }
 716         return 0;
 717 
 718 check_wp_fault_by_hand:
     /*
      * Turn 'size' into the number of pages still to visit after the first
      * one, and round 'start' down to the beginning of its page.
      */
 719         size--;
 720         size += start & ~PAGE_MASK;
 721         size >>= PAGE_SHIFT;
 722         start &= PAGE_MASK;
 723 
 724         for (;;) {
 725                 do_wp_page(current, vma, start, 1);
 726                 if (!size)
 727                         break;
 728                 size--;
 729                 start += PAGE_SIZE;
 730                 if (start < vma->vm_end)
 731                         continue;
 732                 vma = vma->vm_next;
 733                 if (!vma || vma->vm_start != start)
 734                         goto bad_area;
 735                 if (!(vma->vm_flags & VM_WRITE))
 736                         goto bad_area;;
 737         }
 738         return 0;
 739 
 740 bad_area:
 741         return -EFAULT;
 742 }
 743 
 744 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Enter a page obtained from get_free_page(GFP_KERNEL) at *page_table
      * as a writable mapping with the vma's page protection.  If no page
      * can be obtained, oom(tsk) is called and BAD_PAGE is entered instead.
      * Both entries go through put_page(), which leaves an occupied slot
      * untouched.
      */
 745 {
 746         unsigned long tmp;
 747 
 748         if (!(tmp = get_free_page(GFP_KERNEL))) {
 749                 oom(tsk);
 750                 put_page(page_table, BAD_PAGE);
 751                 return;
 752         }
 753         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 754 }
 755 
 756 /*
 757  * This function zeroes out partial mmap'ed pages at truncation time..
      *
      * The page mapped at 'address' in vma->vm_mm is looked up; if it is
      * present and lies below high_memory, the bytes from the position that
      * corresponds to 'address' up to the end of that page are set to zero.
      * Nothing is done when a table or the page itself is missing.  A bad
      * pgd or pmd entry is reported and cleared; each report prints the
      * entry that was actually found to be bad.
 758  */
 759 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 760 {
 761         pgd_t *page_dir;
 762         pmd_t *page_middle;
 763         pte_t *page_table, pte;
 764 
 765         page_dir = pgd_offset(vma->vm_mm, address);
 766         if (pgd_none(*page_dir))
 767                 return;
 768         if (pgd_bad(*page_dir)) {
 769                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 770                 pgd_clear(page_dir);
 771                 return;
 772         }
 773         page_middle = pmd_offset(page_dir, address);
 774         if (pmd_none(*page_middle))
 775                 return;
 776         if (pmd_bad(*page_middle)) {
 777                 printk("bad page table directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 778                 pmd_clear(page_middle);
 779                 return;
 780         }
 781         page_table = pte_offset(page_middle, address);
 782         pte = *page_table;
 783         if (!pte_present(pte))
 784                 return;
 785         address &= ~PAGE_MASK;          /* offset within the page ... */
 786         address += pte_page(pte);       /* ... added to the address of the mapped page */
 787         if (address >= high_memory)
 788                 return;
 789         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 790 }
 791 
 792 /*
 793  * Handle all mappings that got truncated by a "truncate()"
 794  * system call.
 795  *
 796  * NOTE! We have to be ready to update the memory sharing
 797  * between the file and the memory map for a potential last
 798  * incomplete page.  Ugly, but necessary.
      *
      * 'offset' is the new length of the file.  After
      * invalidate_inode_pages() has been called, every area on the inode's
      * i_mmap ring is visited: an area whose file offset is at or beyond
      * 'offset' is zapped completely, an area that ends before 'offset' is
      * left alone, and an area that straddles it has the tail of its last
      * partial page cleared and the following whole pages zapped.
 799  */
 800 void vmtruncate(struct inode * inode, unsigned long offset)
     /* [previous][next][first][last][top][bottom][index][help] */
 801 {
 802         struct vm_area_struct * mpnt;
 803 
 804         invalidate_inode_pages(inode, offset);
 805         if (!inode->i_mmap)
 806                 return;
 807         mpnt = inode->i_mmap;
     /*
      * The ring is circular: the walk ends when vm_next_share leads back to
      * inode->i_mmap.  A 'continue' in the body still evaluates the while()
      * condition, so it also advances to the next area.
      */
 808         do {
 809                 unsigned long start = mpnt->vm_start;
 810                 unsigned long len = mpnt->vm_end - start;
 811                 unsigned long diff;
 812 
 813                 /* mapping wholly truncated? */
 814                 if (mpnt->vm_offset >= offset) {
 815                         zap_page_range(mpnt->vm_mm, start, len);
 816                         continue;
 817                 }
 818                 /* mapping wholly unaffected? */
 819                 diff = offset - mpnt->vm_offset;
 820                 if (diff >= len)
 821                         continue;
 822                 /* Ok, partially affected.. */
 823                 start += diff;
 824                 len = (len - diff) & PAGE_MASK;         /* whole pages only */
 825                 if (start & ~PAGE_MASK) {
 826                         partial_clear(mpnt, start);
 827                         start = (start + ~PAGE_MASK) & PAGE_MASK;       /* round up to the next page */
 828                 }
 829                 zap_page_range(mpnt->vm_mm, start, len);
 830         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 831 }
 832 
 833 /*
 834  * fill in an empty page-table if none exists.
 835  */
 836 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
     /*
      * Look up the pte slot for "address" in tsk->mm, going through
      * pmd_alloc() and pte_alloc() for the intermediate levels.
      *
      * tsk:     task whose mm is used for the lookup and which is passed
      *          to oom() on failure
      * address: virtual address to resolve
      *
      * Returns the pte pointer, or NULL after calling oom(tsk) when either
      * pmd_alloc() or pte_alloc() returns NULL.
      */
 837 {
 838         pgd_t *pgd;
 839         pmd_t *pmd;
 840         pte_t *pte;
 841 
 842         pgd = pgd_offset(tsk->mm, address);
 843         pmd = pmd_alloc(pgd, address);
 844         if (!pmd) {
 845                 oom(tsk);
 846                 return NULL;
 847         }
 848         pte = pte_alloc(pmd, address);
 849         if (!pte) {
 850                 oom(tsk);
 851                 return NULL;
 852         }
 853         return pte;
 854 }
 855 
 856 static inline void do_swap_page(struct task_struct * tsk, 
     /* [previous][next][first][last][top][bottom][index][help] */
 857         struct vm_area_struct * vma, unsigned long address,
 858         pte_t * page_table, pte_t entry, int write_access)
     /*
      * Handle a fault on a pte that is neither empty nor present.
      *
      * tsk:          task charged with the major fault
      * vma:          mapping containing the address
      * address:      faulting virtual address
      * page_table:   pte slot to fill in
      * entry:        value of *page_table as read by the caller
      * write_access: forwarded to swap_in() on the generic path
      *
      * Without a vma-specific swapin operation the work is handed to
      * swap_in().  Otherwise the vma's swapin operation supplies the new
      * pte, which is installed here.
      */
 859 {
 860         pte_t page;
 861 
 862         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 863                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 864                 return;
 865         }
             /* second argument is the address translated to an offset in the mapped object */
 866         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
             /*
              * The pte slot no longer holds the entry the caller saw, so
              * the page just obtained is released and nothing is installed.
              * NOTE(review): presumably swapin can sleep and the slot was
              * changed in the meantime - confirm.
              */
 867         if (pte_val(*page_table) != pte_val(entry)) {
 868                 free_page(pte_page(page));
 869                 return;
 870         }
             /* page has other users and the mapping is not shared: map it read-only */
 871         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 872                 page = pte_wrprotect(page);
 873         ++vma->vm_mm->rss;
 874         ++tsk->maj_flt;
 875         set_pte(page_table, page);
 876         return;
 877 }
 878 
 879 /*
 880  * do_no_page() tries to create a new page mapping. It aggressively
 881  * tries to share with existing pages, but makes a separate copy if
 882  * the "write_access" parameter is true in order to avoid the next
 883  * page fault.
 884  */
 885 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 886         unsigned long address, int write_access)
     /*
      * tsk:          task whose page tables are filled in and whose fault
      *               counters are updated
      * vma:          mapping containing the address
      * address:      faulting virtual address
      * write_access: non-zero when the fault was caused by a write
      *
      * Returns nothing.  Returns early without installing anything when
      * no page table could be obtained or when the pte is already present.
      */
 887 {
 888         pte_t * page_table;
 889         pte_t entry;
 890         unsigned long page;
 891 
 892         page_table = get_empty_pgtable(tsk, address);
 893         if (!page_table)
 894                 return;
 895         entry = *page_table;
             /* already mapped: nothing to do */
 896         if (pte_present(entry))
 897                 return;
             /* not present but not empty either: let the swap path handle it */
 898         if (!pte_none(entry)) {
 899                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 900                 return;
 901         }
             /* from here on work with the page-aligned address */
 902         address &= PAGE_MASK;
             /* no nopage operation on this vma: counted as a minor fault */
 903         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 904                 ++vma->vm_mm->rss;
 905                 ++tsk->min_flt;
 906                 get_empty_page(tsk, vma, page_table);
 907                 return;
 908         }
 909         ++tsk->maj_flt;
 910         ++vma->vm_mm->rss;
 911         /*
 912          * The third argument is "no_share", which tells the low-level code
 913          * to copy, not share the page even if sharing is possible.  It's
 914          * essentially an early COW detection.
 915          */
 916         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
             /*
              * nopage returned no page: raise SIGBUS and map BAD_PAGE.
              * NOTE(review): the signal goes to "current" rather than "tsk",
              * and rss has already been incremented above - confirm both
              * are intended.
              */
 917         if (!page) {
 918                 send_sig(SIGBUS, current, 1);
 919                 put_page(page_table, BAD_PAGE);
 920                 return;
 921         }
 922         /*
 923          * This silly early PAGE_DIRTY setting removes a race
 924          * due to the bad i386 page protection. But it's valid
 925          * for other architectures too.
 926          *
 927          * Note that if write_access is true, we either now have
 928          * an exclusive copy of the page, or this is a shared mapping,
 929          * so we can make it writable and dirty to avoid having to
 930          * handle that later.
 931          */
 932         entry = mk_pte(page, vma->vm_page_prot);
 933         if (write_access) {
 934                 entry = pte_mkwrite(pte_mkdirty(entry));
 935         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 936                 entry = pte_wrprotect(entry);
 937         put_page(page_table, entry);
 938 }
 939 
 940 /*
 941  * The above separate functions for the no-page and wp-page
 942  * cases will go away (they mostly do the same thing anyway),
 943  * and we'll instead use only a general "handle_mm_fault()".
 944  *
 945  * These routines also need to handle stuff like marking pages dirty
 946  * and/or accessed for architectures that don't do it in hardware (most
 947  * RISC architectures).  The early dirtying is also good on the i386.
 948  *
 949  * There is also a hook called "update_mmu_cache()" that architectures
 950  * with external mmu caches can use to update those (ie the Sparc or
 951  * PowerPC hashed page tables that act as extended TLBs).
 952  */
 953 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 954         int write_access, pte_t * pte)
     /*
      * Dispatch a fault on one pte, always on behalf of "current".
      *
      * vma:          mapping containing the address
      * address:      faulting virtual address
      * write_access: non-zero when the fault was caused by a write
      * pte:          pte slot for the address
      */
 955 {
             /* not present: hand over to the no-page path */
 956         if (!pte_present(*pte)) {
 957                 do_no_page(current, vma, address, write_access);
 958                 return;
 959         }
             /* present: record the access in the pte */
 960         set_pte(pte, pte_mkyoung(*pte));
 961         if (!write_access)
 962                 return;
             /* write to a writable pte: only the dirty bit needs setting */
 963         if (pte_write(*pte)) {
 964                 set_pte(pte, pte_mkdirty(*pte));
 965                 return;
 966         }
             /* write to a present, non-writable pte */
 967         do_wp_page(current, vma, address, write_access);
 968 }
 969 
 970 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 971         int write_access)
     /*
      * Entry point for handling a fault at "address" inside "vma".
      *
      * vma:          mapping containing the address; its vm_mm supplies the
      *               page directory
      * address:      faulting virtual address
      * write_access: non-zero when the fault was caused by a write
      *
      * Obtains the pmd and pte for the address via pmd_alloc()/pte_alloc(),
      * lets handle_pte_fault() deal with the pte, then calls
      * update_mmu_cache() with the resulting pte value.  If either
      * allocation returns NULL, oom(current) is called instead.
      */
 972 {
 973         pgd_t *pgd;
 974         pmd_t *pmd;
 975         pte_t *pte;
 976 
 977         pgd = pgd_offset(vma->vm_mm, address);
 978         pmd = pmd_alloc(pgd, address);
 979         if (!pmd)
 980                 goto no_memory;
 981         pte = pte_alloc(pmd, address);
 982         if (!pte)
 983                 goto no_memory;
 984         handle_pte_fault(vma, address, write_access, pte);
 985         update_mmu_cache(vma, address, *pte);
 986         return;
 987 no_memory:
 988         oom(current);
 989 }

/* [previous][next][first][last][top][bottom][index][help] */