root/mm/memory.c

DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 #include <linux/swap.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 #include <asm/string.h>
  52 
  53 unsigned long high_memory = 0;
  54 
  55 /*
  56  * We special-case the C-O-W ZERO_PAGE, because it's such
  57  * a common occurrence (no need to read the page to know
  58  * that it's zero - better for the cache and memory subsystem).
  59  */
  60 static inline void copy_page(unsigned long from, unsigned long to)
  61 {
  62         if (from == ZERO_PAGE) {
  63                 memset((void *) to, 0, PAGE_SIZE);
  64                 return;
  65         }
  66         memcpy((void *) to, (void *) from, PAGE_SIZE);
  67 }
  68 
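/*
 * USER_PTRS_PER_PGD is the number of page-directory entries that cover
 * user space; the remaining entries map the kernel and are shared by
 * all processes.  As a rough illustration only (the exact figures are
 * architecture-dependent): with a 3GB TASK_SIZE and 4MB covered per
 * directory entry this comes to 768 user entries out of 1024.
 */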
  69 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  70 
  71 mem_map_t * mem_map = NULL;
  72 
  73 /*
  74  * oom() prints a message (so that the user knows why the process died),
  75  * and gives the process an untrappable SIGKILL.
  76  */
  77 void oom(struct task_struct * task)
  78 {
   79         printk("\nOut of memory for %s.\n", task->comm);
  80         task->sig->action[SIGKILL-1].sa_handler = NULL;
  81         task->blocked &= ~(1<<(SIGKILL-1));
  82         send_sig(SIGKILL,task,1);
  83 }
  84 
  85 /*
  86  * Note: this doesn't free the actual pages themselves. That
  87  * has been handled earlier when unmapping all the memory regions.
  88  */
  89 static inline void free_one_pmd(pmd_t * dir)
  90 {
  91         pte_t * pte;
  92 
  93         if (pmd_none(*dir))
  94                 return;
  95         if (pmd_bad(*dir)) {
  96                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  97                 pmd_clear(dir);
  98                 return;
  99         }
 100         pte = pte_offset(dir, 0);
 101         pmd_clear(dir);
 102         pte_free(pte);
 103 }
 104 
 105 static inline void free_one_pgd(pgd_t * dir)
 106 {
 107         int j;
 108         pmd_t * pmd;
 109 
 110         if (pgd_none(*dir))
 111                 return;
 112         if (pgd_bad(*dir)) {
 113                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 114                 pgd_clear(dir);
 115                 return;
 116         }
 117         pmd = pmd_offset(dir, 0);
 118         pgd_clear(dir);
 119         for (j = 0; j < PTRS_PER_PMD ; j++)
 120                 free_one_pmd(pmd+j);
 121         pmd_free(pmd);
 122 }
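/*
 * free_one_pgd() tears the tree down from the top: it clears the pgd
 * entry, releases every pte table underneath it via free_one_pmd(),
 * and finally frees the pmd table itself.  On two-level architectures
 * the pmd level is typically folded into the pgd (PTRS_PER_PMD is 1),
 * so this reduces to freeing a single pte table.
 */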
 123         
 124 /*
 125  * This function clears all user-level page tables of a process - this
 126  * is needed by execve(), so that old pages aren't in the way.
 127  */
 128 void clear_page_tables(struct task_struct * tsk)
 129 {
 130         int i;
 131         pgd_t * page_dir;
 132 
 133         page_dir = tsk->mm->pgd;
 134         if (!page_dir || page_dir == swapper_pg_dir) {
 135                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 136                 return;
 137         }
 138         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 139                 free_one_pgd(page_dir + i);
 140         invalidate_mm(tsk->mm);
 141 }
 142 
 143 /*
 144  * This function frees up all page tables of a process when it exits. It
 145  * is the same as "clear_page_tables()", except it also changes the process'
 146  * page table directory to the kernel page tables and then frees the old
 147  * page table directory.
 148  */
 149 void free_page_tables(struct task_struct * tsk)
 150 {
 151         int i;
 152         pgd_t * page_dir;
 153 
 154         page_dir = tsk->mm->pgd;
 155         if (!page_dir || page_dir == swapper_pg_dir) {
 156                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 157                 return;
 158         }
 159         invalidate_mm(tsk->mm);
 160         SET_PAGE_DIR(tsk, swapper_pg_dir);
 161         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 162         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 163                 free_one_pgd(page_dir + i);
 164         pgd_free(page_dir);
 165 }
 166 
 167 int new_page_tables(struct task_struct * tsk)
 168 {
 169         pgd_t * page_dir, * new_pg;
 170 
 171         if (!(new_pg = pgd_alloc()))
 172                 return -ENOMEM;
 173         page_dir = pgd_offset(&init_mm, 0);
 174         memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
 175                (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
 176         invalidate_mm(tsk->mm);
 177         SET_PAGE_DIR(tsk, new_pg);
 178         tsk->mm->pgd = new_pg;
 179         return 0;
 180 }
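/*
 * new_page_tables() copies only the kernel entries (from the master
 * directory in init_mm) into the new page directory, relying on
 * pgd_alloc() handing back a cleared page for the user half, so the
 * fresh address space starts out empty but shares the same kernel
 * mappings as every other process.
 */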
 181 
 182 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
 183 {
 184         pte_t pte = *old_pte;
 185         unsigned long page_nr;
 186 
 187         if (pte_none(pte))
 188                 return;
 189         if (!pte_present(pte)) {
 190                 swap_duplicate(pte_val(pte));
 191                 set_pte(new_pte, pte);
 192                 return;
 193         }
 194         page_nr = MAP_NR(pte_page(pte));
 195         if (page_nr >= MAP_NR(high_memory) || mem_map[page_nr].reserved) {
 196                 set_pte(new_pte, pte);
 197                 return;
 198         }
 199         if (cow)
 200                 pte = pte_wrprotect(pte);
 201         if (delete_from_swap_cache(page_nr))
 202                 pte = pte_mkdirty(pte);
 203         set_pte(new_pte, pte_mkold(pte));
 204         set_pte(old_pte, pte);
 205         mem_map[page_nr].count++;
 206 }
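/*
 * The copy-on-write trick above: when "cow" is set, the pte is
 * write-protected and the same protected value is written back into
 * the parent as well as the child, so whichever side writes first
 * takes a fault into do_wp_page() and gets its own copy.  The child's
 * pte is additionally marked old (its accessed bit cleared), and the
 * page's reference count is raised so do_wp_page() can tell the page
 * is shared.
 */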
 207 
 208 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
 209 {
 210         pte_t * src_pte, * dst_pte;
 211         unsigned long end;
 212 
 213         if (pmd_none(*src_pmd))
 214                 return 0;
 215         if (pmd_bad(*src_pmd)) {
 216                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 217                 pmd_clear(src_pmd);
 218                 return 0;
 219         }
 220         src_pte = pte_offset(src_pmd, address);
 221         if (pmd_none(*dst_pmd)) {
 222                 if (!pte_alloc(dst_pmd, 0))
 223                         return -ENOMEM;
 224         }
 225         dst_pte = pte_offset(dst_pmd, address);
 226         address &= ~PMD_MASK;
 227         end = address + size;
 228         if (end >= PMD_SIZE)
 229                 end = PMD_SIZE;
 230         do {
 231                 /* I would like to switch arguments here, to make it
 232                  * consistent with copy_xxx_range and memcpy syntax.
 233                  */
 234                 copy_one_pte(src_pte++, dst_pte++, cow);
 235                 address += PAGE_SIZE;
 236         } while (address < end);
 237         return 0;
 238 }
 239 
 240 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
 241 {
 242         pmd_t * src_pmd, * dst_pmd;
 243         unsigned long end;
 244         int error = 0;
 245 
 246         if (pgd_none(*src_pgd))
 247                 return 0;
 248         if (pgd_bad(*src_pgd)) {
 249                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 250                 pgd_clear(src_pgd);
 251                 return 0;
 252         }
 253         src_pmd = pmd_offset(src_pgd, address);
 254         if (pgd_none(*dst_pgd)) {
 255                 if (!pmd_alloc(dst_pgd, 0))
 256                         return -ENOMEM;
 257         }
 258         dst_pmd = pmd_offset(dst_pgd, address);
 259         address &= ~PGDIR_MASK;
 260         end = address + size;
 261         if (end > PGDIR_SIZE)
 262                 end = PGDIR_SIZE;
 263         do {
 264                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 265                 if (error)
 266                         break;
 267                 address = (address + PMD_SIZE) & PMD_MASK; 
 268         } while (address < end);
 269         return error;
 270 }
 271 
 272 /*
  273  * copy one vm_area from one task to the other. Assumes that any page
  274  * tables already present in the new task have been cleared in the whole
  275  * range covered by this vma.
 276  */
 277 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 278                         struct vm_area_struct *vma)
 279 {
 280         pgd_t * src_pgd, * dst_pgd;
 281         unsigned long address = vma->vm_start;
 282         unsigned long end = vma->vm_end;
 283         int error = 0, cow;
 284 
 285         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 286         src_pgd = pgd_offset(src, address);
 287         dst_pgd = pgd_offset(dst, address);
 288         while (address < end) {
 289                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 290                 if (error)
 291                         break;
 292                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 293         }
 294         /* Note that the src ptes get c-o-w treatment, so they change too. */
 295         invalidate_range(src, vma->vm_start, vma->vm_end);
 296         invalidate_range(dst, vma->vm_start, vma->vm_end);
 297         return error;
 298 }
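/*
 * Only private writable mappings get the c-o-w treatment above: "cow"
 * is true when VM_WRITE is set but VM_SHARED is not.  Shared and
 * read-only mappings simply duplicate the ptes (with the reference
 * counts raised) without write-protecting anything.
 */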
 299 
 300 static inline void forget_pte(pte_t page)
 301 {
 302         if (pte_none(page))
 303                 return;
 304         if (pte_present(page)) {
 305                 unsigned long addr = pte_page(page);
 306                 if (addr >= high_memory || mem_map[MAP_NR(addr)].reserved)
 307                         return;
 308                 free_page(addr);
 309                 if (current->mm->rss <= 0)
 310                         return;
 311                 current->mm->rss--;
 312                 return;
 313         }
 314         swap_free(pte_val(page));
 315 }
 316 
 317 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 318 {
 319         pte_t * pte;
 320         unsigned long end;
 321 
 322         if (pmd_none(*pmd))
 323                 return;
 324         if (pmd_bad(*pmd)) {
 325                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 326                 pmd_clear(pmd);
 327                 return;
 328         }
 329         pte = pte_offset(pmd, address);
 330         address &= ~PMD_MASK;
 331         end = address + size;
 332         if (end >= PMD_SIZE)
 333                 end = PMD_SIZE;
 334         do {
 335                 pte_t page = *pte;
 336                 pte_clear(pte);
 337                 forget_pte(page);
 338                 address += PAGE_SIZE;
 339                 pte++;
 340         } while (address < end);
 341 }
 342 
 343 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 344 {
 345         pmd_t * pmd;
 346         unsigned long end;
 347 
 348         if (pgd_none(*dir))
 349                 return;
 350         if (pgd_bad(*dir)) {
 351                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 352                 pgd_clear(dir);
 353                 return;
 354         }
 355         pmd = pmd_offset(dir, address);
 356         address &= ~PGDIR_MASK;
 357         end = address + size;
 358         if (end > PGDIR_SIZE)
 359                 end = PGDIR_SIZE;
 360         do {
 361                 zap_pte_range(pmd, address, end - address);
 362                 address = (address + PMD_SIZE) & PMD_MASK; 
 363                 pmd++;
 364         } while (address < end);
 365 }
 366 
 367 /*
 368  * remove user pages in a given range.
 369  */
 370 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 371 {
 372         pgd_t * dir;
 373         unsigned long end = address + size;
 374 
 375         dir = pgd_offset(mm, address);
 376         while (address < end) {
 377                 zap_pmd_range(dir, address, end - address);
 378                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 379                 dir++;
 380         }
 381         invalidate_range(mm, end - size, end);
 382         return 0;
 383 }
 384 
 385 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 386 {
 387         unsigned long end;
 388 
 389         address &= ~PMD_MASK;
 390         end = address + size;
 391         if (end > PMD_SIZE)
 392                 end = PMD_SIZE;
 393         do {
 394                 pte_t oldpage = *pte;
 395                 set_pte(pte, zero_pte);
 396                 forget_pte(oldpage);
 397                 address += PAGE_SIZE;
 398                 pte++;
 399         } while (address < end);
 400 }
 401 
 402 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 403 {
 404         unsigned long end;
 405 
 406         address &= ~PGDIR_MASK;
 407         end = address + size;
 408         if (end > PGDIR_SIZE)
 409                 end = PGDIR_SIZE;
 410         do {
 411                 pte_t * pte = pte_alloc(pmd, address);
 412                 if (!pte)
 413                         return -ENOMEM;
 414                 zeromap_pte_range(pte, address, end - address, zero_pte);
 415                 address = (address + PMD_SIZE) & PMD_MASK;
 416                 pmd++;
 417         } while (address < end);
 418         return 0;
 419 }
 420 
 421 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 422 {
 423         int error = 0;
 424         pgd_t * dir;
 425         unsigned long beg = address;
 426         unsigned long end = address + size;
 427         pte_t zero_pte;
 428 
 429         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 430         dir = pgd_offset(current->mm, address);
 431         while (address < end) {
 432                 pmd_t *pmd = pmd_alloc(dir, address);
 433                 error = -ENOMEM;
 434                 if (!pmd)
 435                         break;
 436                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 437                 if (error)
 438                         break;
 439                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 440                 dir++;
 441         }
 442         invalidate_range(current->mm, beg, end);
 443         return error;
 444 }
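/*
 * Every pte in the range is pointed at the single global ZERO_PAGE and
 * write-protected, so reads cost no memory at all; the first write to
 * such a page faults into do_wp_page(), where copy_page() special-cases
 * ZERO_PAGE and simply memsets the new page instead of copying from
 * the old one.
 */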
 445 
 446 /*
 447  * maps a range of physical memory into the requested pages. the old
 448  * mappings are removed. any references to nonexistent pages results
 449  * in null mappings (currently treated as "copy-on-access")
 450  */
 451 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 452         unsigned long offset, pgprot_t prot)
 453 {
 454         unsigned long end;
 455 
 456         address &= ~PMD_MASK;
 457         end = address + size;
 458         if (end > PMD_SIZE)
 459                 end = PMD_SIZE;
 460         do {
 461                 pte_t oldpage = *pte;
 462                 pte_clear(pte);
 463                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 464                         set_pte(pte, mk_pte(offset, prot));
 465                 forget_pte(oldpage);
 466                 address += PAGE_SIZE;
 467                 offset += PAGE_SIZE;
 468                 pte++;
 469         } while (address < end);
 470 }
 471 
 472 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 473         unsigned long offset, pgprot_t prot)
 474 {
 475         unsigned long end;
 476 
 477         address &= ~PGDIR_MASK;
 478         end = address + size;
 479         if (end > PGDIR_SIZE)
 480                 end = PGDIR_SIZE;
 481         offset -= address;
 482         do {
 483                 pte_t * pte = pte_alloc(pmd, address);
 484                 if (!pte)
 485                         return -ENOMEM;
 486                 remap_pte_range(pte, address, end - address, address + offset, prot);
 487                 address = (address + PMD_SIZE) & PMD_MASK;
 488                 pmd++;
 489         } while (address < end);
 490         return 0;
 491 }
 492 
 493 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 494 {
 495         int error = 0;
 496         pgd_t * dir;
 497         unsigned long beg = from;
 498         unsigned long end = from + size;
 499 
 500         offset -= from;
 501         dir = pgd_offset(current->mm, from);
 502         while (from < end) {
 503                 pmd_t *pmd = pmd_alloc(dir, from);
 504                 error = -ENOMEM;
 505                 if (!pmd)
 506                         break;
 507                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 508                 if (error)
 509                         break;
 510                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 511                 dir++;
 512         }
 513         invalidate_range(current->mm, beg, from);
 514         return error;
 515 }
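/*
 * Hedged usage sketch (not taken from this file): a device driver's
 * mmap operation would typically map its device memory with something
 * like
 *
 *	remap_page_range(vma->vm_start, phys_addr,
 *			 vma->vm_end - vma->vm_start, vma->vm_page_prot);
 *
 * where "phys_addr" stands for whatever physical base address the
 * driver wants to expose (a made-up name here).
 */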
 516 
 517 /*
 518  * sanity-check function..
 519  */
 520 static void put_page(pte_t * page_table, pte_t pte)
 521 {
 522         if (!pte_none(*page_table)) {
 523                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 524                 free_page(pte_page(pte));
 525                 return;
 526         }
 527 /* no need for invalidate */
 528         set_pte(page_table, pte);
 529 }
 530 
 531 /*
  532  * This routine is used to map a page into an address space: needed by
 533  * execve() for the initial stack and environment pages.
 534  */
 535 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 536 {
 537         pgd_t * pgd;
 538         pmd_t * pmd;
 539         pte_t * pte;
 540 
 541         if (page >= high_memory)
 542                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 543         if (mem_map[MAP_NR(page)].count != 1)
 544                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 545         pgd = pgd_offset(tsk->mm,address);
 546         pmd = pmd_alloc(pgd, address);
 547         if (!pmd) {
 548                 free_page(page);
 549                 oom(tsk);
 550                 return 0;
 551         }
 552         pte = pte_alloc(pmd, address);
 553         if (!pte) {
 554                 free_page(page);
 555                 oom(tsk);
 556                 return 0;
 557         }
 558         if (!pte_none(*pte)) {
 559                 printk("put_dirty_page: page already exists\n");
 560                 free_page(page);
 561                 return 0;
 562         }
 563         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 564 /* no need for invalidate */
 565         return page;
 566 }
 567 
 568 /*
 569  * This routine handles present pages, when users try to write
 570  * to a shared page. It is done by copying the page to a new address
 571  * and decrementing the shared-page counter for the old page.
 572  *
 573  * Goto-purists beware: the only reason for goto's here is that it results
 574  * in better assembly code.. The "default" path will see no jumps at all.
 575  *
 576  * Note that this routine assumes that the protection checks have been
 577  * done by the caller (the low-level page fault routine in most cases).
 578  * Thus we can safely just mark it writable once we've done any necessary
 579  * COW.
 580  *
 581  * We also mark the page dirty at this point even though the page will
 582  * change only once the write actually happens. This avoids a few races,
 583  * and potentially makes it more efficient.
 584  */
 585 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 586         unsigned long address, int write_access)
 587 {
 588         pgd_t *page_dir;
 589         pmd_t *page_middle;
 590         pte_t *page_table, pte;
 591         unsigned long old_page, new_page;
 592 
 593         new_page = __get_free_page(GFP_KERNEL);
 594         page_dir = pgd_offset(vma->vm_mm, address);
 595         if (pgd_none(*page_dir))
 596                 goto end_wp_page;
 597         if (pgd_bad(*page_dir))
 598                 goto bad_wp_pagedir;
 599         page_middle = pmd_offset(page_dir, address);
 600         if (pmd_none(*page_middle))
 601                 goto end_wp_page;
 602         if (pmd_bad(*page_middle))
 603                 goto bad_wp_pagemiddle;
 604         page_table = pte_offset(page_middle, address);
 605         pte = *page_table;
 606         if (!pte_present(pte))
 607                 goto end_wp_page;
 608         if (pte_write(pte))
 609                 goto end_wp_page;
 610         old_page = pte_page(pte);
 611         if (old_page >= high_memory)
 612                 goto bad_wp_page;
 613         tsk->min_flt++;
 614         /*
 615          * Do we need to copy?
 616          */
 617         if (mem_map[MAP_NR(old_page)].count != 1) {
 618                 if (new_page) {
 619                         if (mem_map[MAP_NR(old_page)].reserved)
 620                                 ++vma->vm_mm->rss;
 621                         copy_page(old_page,new_page);
 622                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 623                         free_page(old_page);
 624                         invalidate_page(vma, address);
 625                         return;
 626                 }
 627                 set_pte(page_table, BAD_PAGE);
 628                 free_page(old_page);
 629                 oom(tsk);
 630                 invalidate_page(vma, address);
 631                 return;
 632         }
 633         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 634         invalidate_page(vma, address);
 635         if (new_page)
 636                 free_page(new_page);
 637         return;
 638 bad_wp_page:
 639         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 640         send_sig(SIGKILL, tsk, 1);
 641         goto end_wp_page;
 642 bad_wp_pagemiddle:
 643         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 644         send_sig(SIGKILL, tsk, 1);
 645         goto end_wp_page;
 646 bad_wp_pagedir:
 647         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 648         send_sig(SIGKILL, tsk, 1);
 649 end_wp_page:
 650         if (new_page)
 651                 free_page(new_page);
 652         return;
 653 }
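/*
 * Summary of the path above: the replacement page is allocated before
 * the page tables are even looked at, since __get_free_page(GFP_KERNEL)
 * may sleep.  If the old page turns out not to be shared (count == 1),
 * the pte is simply made writable and dirty and the spare page is
 * returned; otherwise the contents are copied, the new page is mapped
 * writable and dirty, and the old page's reference count is dropped.
 */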
 654 
 655 /*
 656  * Ugly, ugly, but the goto's result in better assembly..
 657  */
 658 int verify_area(int type, const void * addr, unsigned long size)
 659 {
 660         struct vm_area_struct * vma;
 661         unsigned long start = (unsigned long) addr;
 662 
 663         /* If the current user space is mapped to kernel space (for the
 664          * case where we use a fake user buffer with get_fs/set_fs()) we
 665          * don't expect to find the address in the user vm map.
 666          */
 667         if (!size || get_fs() == get_ds())
 668                 return 0;
 669 
 670         vma = find_vma(current, start);
 671         if (!vma)
 672                 goto bad_area;
 673         if (vma->vm_start <= start)
 674                 goto good_area;
 675         if (!(vma->vm_flags & VM_GROWSDOWN))
 676                 goto bad_area;
 677         if (expand_stack(vma, start))
 678                 goto bad_area;
 679 
 680 good_area:
 681         if (type == VERIFY_WRITE)
 682                 goto check_write;
 683         for (;;) {
 684                 struct vm_area_struct * next;
 685                 if (!(vma->vm_flags & VM_READ))
 686                         goto bad_area;
 687                 if (vma->vm_end - start >= size)
 688                         return 0;
 689                 next = vma->vm_next;
 690                 if (!next || vma->vm_end != next->vm_start)
 691                         goto bad_area;
 692                 vma = next;
 693         }
 694 
 695 check_write:
 696         if (!(vma->vm_flags & VM_WRITE))
 697                 goto bad_area;
 698         if (!wp_works_ok)
 699                 goto check_wp_fault_by_hand;
 700         for (;;) {
 701                 if (vma->vm_end - start >= size)
 702                         break;
 703                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 704                         goto bad_area;
 705                 vma = vma->vm_next;
 706                 if (!(vma->vm_flags & VM_WRITE))
 707                         goto bad_area;
 708         }
 709         return 0;
 710 
 711 check_wp_fault_by_hand:
 712         size--;
 713         size += start & ~PAGE_MASK;
 714         size >>= PAGE_SHIFT;
 715         start &= PAGE_MASK;
 716 
 717         for (;;) {
 718                 do_wp_page(current, vma, start, 1);
 719                 if (!size)
 720                         break;
 721                 size--;
 722                 start += PAGE_SIZE;
 723                 if (start < vma->vm_end)
 724                         continue;
 725                 vma = vma->vm_next;
 726                 if (!vma || vma->vm_start != start)
 727                         goto bad_area;
 728                 if (!(vma->vm_flags & VM_WRITE))
  729                 goto bad_area;
 730         }
 731         return 0;
 732 
 733 bad_area:
 734         return -EFAULT;
 735 }
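/*
 * The check_wp_fault_by_hand path exists because on the original i386
 * the CPU ignores the page-level write-protect bit in supervisor mode,
 * so a kernel write to a c-o-w page would silently scribble on the
 * shared copy.  When wp_works_ok is false, verify_area() therefore
 * simulates the write faults itself by calling do_wp_page() on every
 * page in the range.
 */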
 736 
 737 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 738 {
 739         unsigned long tmp;
 740 
 741         if (!(tmp = get_free_page(GFP_KERNEL))) {
 742                 oom(tsk);
 743                 put_page(page_table, BAD_PAGE);
 744                 return;
 745         }
 746         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 747 }
 748 
 749 /*
 750  * This function zeroes out partial mmap'ed pages at truncation time..
 751  */
 752 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
 753 {
 754         pgd_t *page_dir;
 755         pmd_t *page_middle;
 756         pte_t *page_table, pte;
 757 
 758         page_dir = pgd_offset(vma->vm_mm, address);
 759         if (pgd_none(*page_dir))
 760                 return;
 761         if (pgd_bad(*page_dir)) {
 762                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 763                 pgd_clear(page_dir);
 764                 return;
 765         }
 766         page_middle = pmd_offset(page_dir, address);
 767         if (pmd_none(*page_middle))
 768                 return;
 769         if (pmd_bad(*page_middle)) {
  770                 printk("bad page middle entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 771                 pmd_clear(page_middle);
 772                 return;
 773         }
 774         page_table = pte_offset(page_middle, address);
 775         pte = *page_table;
 776         if (!pte_present(pte))
 777                 return;
 778         address &= ~PAGE_MASK;
 779         address += pte_page(pte);
 780         if (address >= high_memory)
 781                 return;
 782         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 783 }
 784 
 785 /*
 786  * Handle all mappings that got truncated by a "truncate()"
 787  * system call.
 788  *
 789  * NOTE! We have to be ready to update the memory sharing
 790  * between the file and the memory map for a potential last
 791  * incomplete page.  Ugly, but necessary.
 792  */
 793 void vmtruncate(struct inode * inode, unsigned long offset)
 794 {
 795         struct vm_area_struct * mpnt;
 796 
 797         truncate_inode_pages(inode, offset);
 798         if (!inode->i_mmap)
 799                 return;
 800         mpnt = inode->i_mmap;
 801         do {
 802                 unsigned long start = mpnt->vm_start;
 803                 unsigned long len = mpnt->vm_end - start;
 804                 unsigned long diff;
 805 
 806                 /* mapping wholly truncated? */
 807                 if (mpnt->vm_offset >= offset) {
 808                         zap_page_range(mpnt->vm_mm, start, len);
 809                         continue;
 810                 }
 811                 /* mapping wholly unaffected? */
 812                 diff = offset - mpnt->vm_offset;
 813                 if (diff >= len)
 814                         continue;
 815                 /* Ok, partially affected.. */
 816                 start += diff;
 817                 len = (len - diff) & PAGE_MASK;
 818                 if (start & ~PAGE_MASK) {
 819                         partial_clear(mpnt, start);
 820                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 821                 }
 822                 zap_page_range(mpnt->vm_mm, start, len);
 823         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 824 }
 825 
 826 /*
 827  * fill in an empty page-table if none exists.
 828  */
 829 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 830 {
 831         pgd_t *pgd;
 832         pmd_t *pmd;
 833         pte_t *pte;
 834 
 835         pgd = pgd_offset(tsk->mm, address);
 836         pmd = pmd_alloc(pgd, address);
 837         if (!pmd) {
 838                 oom(tsk);
 839                 return NULL;
 840         }
 841         pte = pte_alloc(pmd, address);
 842         if (!pte) {
 843                 oom(tsk);
 844                 return NULL;
 845         }
 846         return pte;
 847 }
 848 
 849 static inline void do_swap_page(struct task_struct * tsk, 
 850         struct vm_area_struct * vma, unsigned long address,
 851         pte_t * page_table, pte_t entry, int write_access)
 852 {
 853         pte_t page;
 854 
 855         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 856                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 857                 return;
 858         }
 859         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 860         if (pte_val(*page_table) != pte_val(entry)) {
 861                 free_page(pte_page(page));
 862                 return;
 863         }
 864         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 865                 page = pte_wrprotect(page);
 866         ++vma->vm_mm->rss;
 867         ++tsk->maj_flt;
 868         set_pte(page_table, page);
 869         return;
 870 }
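/*
 * do_swap_page() either uses the generic swap_in() or lets the vma's
 * own swapin operation fetch the page.  In the latter case the pte is
 * re-checked against the original swap entry, so that if the entry
 * changed while the swapin slept the freshly read page is simply
 * dropped; the new pte is write-protected if the page is still shared
 * and the mapping isn't VM_SHARED.
 */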
 871 
 872 /*
 873  * do_no_page() tries to create a new page mapping. It aggressively
 874  * tries to share with existing pages, but makes a separate copy if
 875  * the "write_access" parameter is true in order to avoid the next
 876  * page fault.
 877  */
 878 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 879         unsigned long address, int write_access)
 880 {
 881         pte_t * page_table;
 882         pte_t entry;
 883         unsigned long page;
 884 
 885         page_table = get_empty_pgtable(tsk, address);
 886         if (!page_table)
 887                 return;
 888         entry = *page_table;
 889         if (pte_present(entry))
 890                 return;
 891         if (!pte_none(entry)) {
 892                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 893                 return;
 894         }
 895         address &= PAGE_MASK;
 896         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 897                 ++vma->vm_mm->rss;
 898                 ++tsk->min_flt;
 899                 get_empty_page(tsk, vma, page_table);
 900                 return;
 901         }
 902         ++tsk->maj_flt;
 903         ++vma->vm_mm->rss;
 904         /*
 905          * The third argument is "no_share", which tells the low-level code
 906          * to copy, not share the page even if sharing is possible.  It's
 907          * essentially an early COW detection 
 908          */
 909         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
 910         if (!page) {
 911                 send_sig(SIGBUS, current, 1);
 912                 put_page(page_table, BAD_PAGE);
 913                 return;
 914         }
 915         /*
 916          * This silly early PAGE_DIRTY setting removes a race
 917          * due to the bad i386 page protection. But it's valid
 918          * for other architectures too.
 919          *
 920          * Note that if write_access is true, we either now have
  921          * an exclusive copy of the page, or this is a shared mapping,
 922          * so we can make it writable and dirty to avoid having to
 923          * handle that later.
 924          */
 925         entry = mk_pte(page, vma->vm_page_prot);
 926         if (write_access) {
 927                 entry = pte_mkwrite(pte_mkdirty(entry));
 928         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 929                 entry = pte_wrprotect(entry);
 930         put_page(page_table, entry);
 931 }
 932 
 933 /*
 934  * The above separate functions for the no-page and wp-page
 935  * cases will go away (they mostly do the same thing anyway),
 936  * and we'll instead use only a general "handle_mm_fault()".
 937  *
 938  * These routines also need to handle stuff like marking pages dirty
 939  * and/or accessed for architectures that don't do it in hardware (most
 940  * RISC architectures).  The early dirtying is also good on the i386.
 941  *
 942  * There is also a hook called "update_mmu_cache()" that architectures
 943  * with external mmu caches can use to update those (ie the Sparc or
 944  * PowerPC hashed page tables that act as extended TLBs).
 945  */
 946 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
 947         int write_access, pte_t * pte)
 948 {
 949         if (!pte_present(*pte)) {
 950                 do_no_page(current, vma, address, write_access);
 951                 return;
 952         }
 953         set_pte(pte, pte_mkyoung(*pte));
 954         if (!write_access)
 955                 return;
 956         if (pte_write(*pte)) {
 957                 set_pte(pte, pte_mkdirty(*pte));
 958                 return;
 959         }
 960         do_wp_page(current, vma, address, write_access);
 961 }
 962 
 963 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
 964         int write_access)
 965 {
 966         pgd_t *pgd;
 967         pmd_t *pmd;
 968         pte_t *pte;
 969 
 970         pgd = pgd_offset(vma->vm_mm, address);
 971         pmd = pmd_alloc(pgd, address);
 972         if (!pmd)
 973                 goto no_memory;
 974         pte = pte_alloc(pmd, address);
 975         if (!pte)
 976                 goto no_memory;
 977         handle_pte_fault(vma, address, write_access, pte);
 978         update_mmu_cache(vma, address, *pte);
 979         return;
 980 no_memory:
 981         oom(current);
 982 }
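/*
 * Hedged usage sketch (not part of this file): an architecture's page
 * fault handler is expected to find the vma covering the faulting
 * address, check the access against vma->vm_flags, and then hand the
 * rest over with something like
 *
 *	handle_mm_fault(vma, address, write_access);
 *
 * handle_mm_fault() allocates any missing pmd/pte levels, dispatches to
 * handle_pte_fault(), and finishes with update_mmu_cache() for
 * architectures that keep external MMU caches.
 */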
