root/mm/memory.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. partial_clear
  30. vmtruncate
  31. get_empty_pgtable
  32. do_swap_page
  33. do_no_page
  34. handle_pte_fault
  35. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 
  47 #include <asm/system.h>
  48 #include <asm/segment.h>
  49 #include <asm/pgtable.h>
  50 
  51 unsigned long high_memory = 0;
  52 
  53 /*
  54  * The free_area_list arrays point to the queue heads of the free areas
  55  * of different sizes
  56  */
  57 int nr_swap_pages = 0;
  58 int nr_free_pages = 0;
  59 struct mem_list free_area_list[NR_MEM_LISTS];
  60 unsigned char * free_area_map[NR_MEM_LISTS];
  61 
  62 /*
  63  * We special-case the C-O-W ZERO_PAGE, because it's such
  64  * a common occurrence (no need to read the page to know
  65  * that it's zero - better for the cache and memory subsystem).
  66  */
  67 static inline void copy_page(unsigned long from, unsigned long to)
     /* [previous][next][first][last][top][bottom][index][help] */
  68 {
  69         if (from == ZERO_PAGE) {
  70                 memset((void *) to, 0, PAGE_SIZE);
  71                 return;
  72         }
  73         memcpy((void *) to, (void *) from, PAGE_SIZE);
  74 }
  75 
  76 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  77 
  78 mem_map_t * mem_map = NULL;
  79 
  80 /*
  81  * oom() prints a message (so that the user knows why the process died),
  82  * and gives the process an untrappable SIGKILL.
  83  */
  84 void oom(struct task_struct * task)
     /* [previous][next][first][last][top][bottom][index][help] */
  85 {
  86         printk("\nOut of memory for %s.\n", current->comm);
  87         task->sig->action[SIGKILL-1].sa_handler = NULL;
  88         task->blocked &= ~(1<<(SIGKILL-1));
  89         send_sig(SIGKILL,task,1);
  90 }
  91 
  92 /*
  93  * Note: this doesn't free the actual pages themselves. That
  94  * has been handled earlier when unmapping all the memory regions.
  95  */
  96 static inline void free_one_pmd(pmd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
  97 {
  98         pte_t * pte;
  99 
 100         if (pmd_none(*dir))
 101                 return;
 102         if (pmd_bad(*dir)) {
 103                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 104                 pmd_clear(dir);
 105                 return;
 106         }
 107         pte = pte_offset(dir, 0);
 108         pmd_clear(dir);
 109         pte_free(pte);
 110 }
 111 
 112 static inline void free_one_pgd(pgd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
 113 {
 114         pmd_t * pmd;
 115 
 116         if (pgd_none(*dir))
 117                 return;
 118         if (pgd_bad(*dir)) {
 119                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 120                 pgd_clear(dir);
 121                 return;
 122         }
 123         pmd = pmd_offset(dir, 0);
 124         pgd_clear(dir);
 125         if (!pmd_inuse(pmd)) {
 126                 int j;
 127                 for (j = 0; j < PTRS_PER_PMD ; j++)
 128                         free_one_pmd(pmd+j);
 129         }
 130         pmd_free(pmd);
 131 }
 132         
 133 /*
 134  * This function clears all user-level page tables of a process - this
 135  * is needed by execve(), so that old pages aren't in the way.
 136  */
 137 void clear_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 138 {
 139         int i;
 140         pgd_t * page_dir;
 141 
 142         page_dir = tsk->mm->pgd;
 143         if (!page_dir || page_dir == swapper_pg_dir) {
 144                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 145                 return;
 146         }
 147         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 148                 free_one_pgd(page_dir + i);
 149         invalidate_mm(tsk->mm);
 150 }
 151 
 152 /*
 153  * This function frees up all page tables of a process when it exits. It
 154  * is the same as "clear_page_tables()", except it also changes the process'
 155  * page table directory to the kernel page tables and then frees the old
 156  * page table directory.
 157  */
 158 void free_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 159 {
 160         int i;
 161         pgd_t * page_dir;
 162 
 163         page_dir = tsk->mm->pgd;
 164         if (!page_dir || page_dir == swapper_pg_dir) {
 165                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 166                 return;
 167         }
 168         invalidate_mm(tsk->mm);
 169         SET_PAGE_DIR(tsk, swapper_pg_dir);
 170         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 171         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 172                 free_one_pgd(page_dir + i);
 173         pgd_free(page_dir);
 174 }
 175 
 176 int new_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 177 {
 178         pgd_t * page_dir, * new_pg;
 179         int i;
 180 
 181         if (!(new_pg = pgd_alloc()))
 182                 return -ENOMEM;
 183         page_dir = pgd_offset(&init_mm, 0);
 184         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 185                 new_pg[i] = page_dir[i];
 186         invalidate_mm(tsk->mm);
 187         SET_PAGE_DIR(tsk, new_pg);
 188         tsk->mm->pgd = new_pg;
 189         return 0;
 190 }
 191 
 192 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 193 {
 194         pte_t pte = *old_pte;
 195 
 196         if (pte_none(pte))
 197                 return;
 198         if (!pte_present(pte)) {
 199                 swap_duplicate(pte_val(pte));
 200                 set_pte(new_pte, pte);
 201                 return;
 202         }
 203         if (pte_page(pte) > high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 204                 set_pte(new_pte, pte);
 205                 return;
 206         }
 207         if (pte_cow(pte))
 208                 pte = pte_wrprotect(pte);
 209         if (delete_from_swap_cache(pte_page(pte)))
 210                 pte = pte_mkdirty(pte);
 211         set_pte(new_pte, pte_mkold(pte));
 212         set_pte(old_pte, pte);
 213         mem_map[MAP_NR(pte_page(pte))].count++;
 214 }
 215 
 216 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 217 {
 218         pte_t * src_pte, * dst_pte;
 219         unsigned long end;
 220 
 221         if (pmd_none(*src_pmd))
 222                 return 0;
 223         if (pmd_bad(*src_pmd)) {
 224                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 225                 pmd_clear(src_pmd);
 226                 return 0;
 227         }
 228         src_pte = pte_offset(src_pmd, address);
 229         if (pmd_none(*dst_pmd)) {
 230                 if (!pte_alloc(dst_pmd, 0))
 231                         return -ENOMEM;
 232         }
 233         dst_pte = pte_offset(dst_pmd, address);
 234         address &= ~PMD_MASK;
 235         end = address + size;
 236         if (end >= PMD_SIZE)
 237                 end = PMD_SIZE;
 238         do {
 239                 /* I would like to switch arguments here, to make it
 240                  * consistent with copy_xxx_range and memcpy syntax.
 241                  */
 242                 copy_one_pte(src_pte++, dst_pte++);
 243                 address += PAGE_SIZE;
 244         } while (address < end);
 245         return 0;
 246 }
 247 
 248 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 249 {
 250         pmd_t * src_pmd, * dst_pmd;
 251         unsigned long end;
 252         int error = 0;
 253 
 254         if (pgd_none(*src_pgd))
 255                 return 0;
 256         if (pgd_bad(*src_pgd)) {
 257                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 258                 pgd_clear(src_pgd);
 259                 return 0;
 260         }
 261         src_pmd = pmd_offset(src_pgd, address);
 262         if (pgd_none(*dst_pgd)) {
 263                 if (!pmd_alloc(dst_pgd, 0))
 264                         return -ENOMEM;
 265         }
 266         dst_pmd = pmd_offset(dst_pgd, address);
 267         address &= ~PGDIR_MASK;
 268         end = address + size;
 269         if (end > PGDIR_SIZE)
 270                 end = PGDIR_SIZE;
 271         do {
 272                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address);
 273                 if (error)
 274                         break;
 275                 address = (address + PMD_SIZE) & PMD_MASK; 
 276         } while (address < end);
 277         return error;
 278 }
 279 
 280 /*
 281  * copy one vm_area from one task to the other. Assumes the page tables
 282  * already present in the new task to be cleared in the whole range
 283  * covered by this vma.
 284  */
 285 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
     /* [previous][next][first][last][top][bottom][index][help] */
 286                         struct vm_area_struct *vma)
 287 {
 288         pgd_t * src_pgd, * dst_pgd;
 289         unsigned long address = vma->vm_start;
 290         unsigned long end = vma->vm_end;
 291         int error = 0;
 292 
 293         src_pgd = pgd_offset(src, address);
 294         dst_pgd = pgd_offset(dst, address);
 295         while (address < end) {
 296                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address);
 297                 if (error)
 298                         break;
 299                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 300         }
 301         /* Note that the src ptes get c-o-w treatment, so they change too. */
 302         invalidate_range(src, vma->vm_start, vma->vm_end);
 303         invalidate_range(dst, vma->vm_start, vma->vm_end);
 304         return error;
 305 }
 306 
 307 static inline void forget_pte(pte_t page)
     /* [previous][next][first][last][top][bottom][index][help] */
 308 {
 309         if (pte_none(page))
 310                 return;
 311         if (pte_present(page)) {
 312                 unsigned long addr = pte_page(page);
 313                 if (addr >= high_memory || mem_map[MAP_NR(addr)].reserved)
 314                         return;
 315                 free_page(addr);
 316                 if (current->mm->rss <= 0)
 317                         return;
 318                 current->mm->rss--;
 319                 return;
 320         }
 321         swap_free(pte_val(page));
 322 }
 323 
 324 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 325 {
 326         pte_t * pte;
 327         unsigned long end;
 328 
 329         if (pmd_none(*pmd))
 330                 return;
 331         if (pmd_bad(*pmd)) {
 332                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 333                 pmd_clear(pmd);
 334                 return;
 335         }
 336         pte = pte_offset(pmd, address);
 337         address &= ~PMD_MASK;
 338         end = address + size;
 339         if (end >= PMD_SIZE)
 340                 end = PMD_SIZE;
 341         do {
 342                 pte_t page = *pte;
 343                 pte_clear(pte);
 344                 forget_pte(page);
 345                 address += PAGE_SIZE;
 346                 pte++;
 347         } while (address < end);
 348 }
 349 
 350 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 351 {
 352         pmd_t * pmd;
 353         unsigned long end;
 354 
 355         if (pgd_none(*dir))
 356                 return;
 357         if (pgd_bad(*dir)) {
 358                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 359                 pgd_clear(dir);
 360                 return;
 361         }
 362         pmd = pmd_offset(dir, address);
 363         address &= ~PGDIR_MASK;
 364         end = address + size;
 365         if (end > PGDIR_SIZE)
 366                 end = PGDIR_SIZE;
 367         do {
 368                 zap_pte_range(pmd, address, end - address);
 369                 address = (address + PMD_SIZE) & PMD_MASK; 
 370                 pmd++;
 371         } while (address < end);
 372 }
 373 
 374 /*
 375  * remove user pages in a given range.
 376  */
 377 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 378 {
 379         pgd_t * dir;
 380         unsigned long end = address + size;
 381 
 382         dir = pgd_offset(mm, address);
 383         while (address < end) {
 384                 zap_pmd_range(dir, address, end - address);
 385                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 386                 dir++;
 387         }
 388         invalidate_range(mm, end - size, end);
 389         return 0;
 390 }
 391 
 392 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 393 {
 394         unsigned long end;
 395 
 396         address &= ~PMD_MASK;
 397         end = address + size;
 398         if (end > PMD_SIZE)
 399                 end = PMD_SIZE;
 400         do {
 401                 pte_t oldpage = *pte;
 402                 set_pte(pte, zero_pte);
 403                 forget_pte(oldpage);
 404                 address += PAGE_SIZE;
 405                 pte++;
 406         } while (address < end);
 407 }
 408 
 409 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 410 {
 411         unsigned long end;
 412 
 413         address &= ~PGDIR_MASK;
 414         end = address + size;
 415         if (end > PGDIR_SIZE)
 416                 end = PGDIR_SIZE;
 417         do {
 418                 pte_t * pte = pte_alloc(pmd, address);
 419                 if (!pte)
 420                         return -ENOMEM;
 421                 zeromap_pte_range(pte, address, end - address, zero_pte);
 422                 address = (address + PMD_SIZE) & PMD_MASK;
 423                 pmd++;
 424         } while (address < end);
 425         return 0;
 426 }
 427 
 428 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
 429 {
 430         int error = 0;
 431         pgd_t * dir;
 432         unsigned long end = address + size;
 433         pte_t zero_pte;
 434 
 435         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 436         dir = pgd_offset(current->mm, address);
 437         while (address < end) {
 438                 pmd_t *pmd = pmd_alloc(dir, address);
 439                 error = -ENOMEM;
 440                 if (!pmd)
 441                         break;
 442                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 443                 if (error)
 444                         break;
 445                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 446                 dir++;
 447         }
 448         invalidate_range(current->mm, end - size, end);
 449         return error;
 450 }
 451 
 452 /*
 453  * maps a range of physical memory into the requested pages. the old
 454  * mappings are removed. any references to nonexistent pages results
 455  * in null mappings (currently treated as "copy-on-access")
 456  */
 457 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 458         unsigned long offset, pgprot_t prot)
 459 {
 460         unsigned long end;
 461 
 462         address &= ~PMD_MASK;
 463         end = address + size;
 464         if (end > PMD_SIZE)
 465                 end = PMD_SIZE;
 466         do {
 467                 pte_t oldpage = *pte;
 468                 pte_clear(pte);
 469                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 470                         set_pte(pte, mk_pte(offset, prot));
 471                 forget_pte(oldpage);
 472                 address += PAGE_SIZE;
 473                 offset += PAGE_SIZE;
 474                 pte++;
 475         } while (address < end);
 476 }
 477 
 478 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 479         unsigned long offset, pgprot_t prot)
 480 {
 481         unsigned long end;
 482 
 483         address &= ~PGDIR_MASK;
 484         end = address + size;
 485         if (end > PGDIR_SIZE)
 486                 end = PGDIR_SIZE;
 487         offset -= address;
 488         do {
 489                 pte_t * pte = pte_alloc(pmd, address);
 490                 if (!pte)
 491                         return -ENOMEM;
 492                 remap_pte_range(pte, address, end - address, address + offset, prot);
 493                 address = (address + PMD_SIZE) & PMD_MASK;
 494                 pmd++;
 495         } while (address < end);
 496         return 0;
 497 }
 498 
 499 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
 500 {
 501         int error = 0;
 502         pgd_t * dir;
 503         unsigned long end = from + size;
 504 
 505         offset -= from;
 506         dir = pgd_offset(current->mm, from);
 507         while (from < end) {
 508                 pmd_t *pmd = pmd_alloc(dir, from);
 509                 error = -ENOMEM;
 510                 if (!pmd)
 511                         break;
 512                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 513                 if (error)
 514                         break;
 515                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 516                 dir++;
 517         }
 518         invalidate_range(current->mm, from - size, from);
 519         return error;
 520 }
 521 
 522 /*
 523  * sanity-check function..
 524  */
 525 static void put_page(pte_t * page_table, pte_t pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 526 {
 527         if (!pte_none(*page_table)) {
 528                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 529                 free_page(pte_page(pte));
 530                 return;
 531         }
 532 /* no need for invalidate */
 533         set_pte(page_table, pte);
 534 }
 535 
 536 /*
 537  * This routine is used to map in a page into an address space: needed by
 538  * execve() for the initial stack and environment pages.
 539  */
 540 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 541 {
 542         pgd_t * pgd;
 543         pmd_t * pmd;
 544         pte_t * pte;
 545 
 546         if (page >= high_memory)
 547                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 548         if (mem_map[MAP_NR(page)].count != 1)
 549                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 550         pgd = pgd_offset(tsk->mm,address);
 551         pmd = pmd_alloc(pgd, address);
 552         if (!pmd) {
 553                 free_page(page);
 554                 oom(tsk);
 555                 return 0;
 556         }
 557         pte = pte_alloc(pmd, address);
 558         if (!pte) {
 559                 free_page(page);
 560                 oom(tsk);
 561                 return 0;
 562         }
 563         if (!pte_none(*pte)) {
 564                 printk("put_dirty_page: page already exists\n");
 565                 free_page(page);
 566                 return 0;
 567         }
 568         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 569 /* no need for invalidate */
 570         return page;
 571 }
 572 
 573 /*
 574  * This routine handles present pages, when users try to write
 575  * to a shared page. It is done by copying the page to a new address
 576  * and decrementing the shared-page counter for the old page.
 577  *
 578  * Goto-purists beware: the only reason for goto's here is that it results
 579  * in better assembly code.. The "default" path will see no jumps at all.
 580  *
 581  * Note that this routine assumes that the protection checks have been
 582  * done by the caller (the low-level page fault routine in most cases).
 583  * Thus we can safely just mark it writable once we've done any necessary
 584  * COW.
 585  *
 586  * We also mark the page dirty at this point even though the page will
 587  * change only once the write actually happens. This avoids a few races,
 588  * and potentially makes it more efficient.
 589  */
 590 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 591         unsigned long address, int write_access)
 592 {
 593         pgd_t *page_dir;
 594         pmd_t *page_middle;
 595         pte_t *page_table, pte;
 596         unsigned long old_page, new_page;
 597 
 598         new_page = __get_free_page(GFP_KERNEL);
 599         page_dir = pgd_offset(vma->vm_mm, address);
 600         if (pgd_none(*page_dir))
 601                 goto end_wp_page;
 602         if (pgd_bad(*page_dir))
 603                 goto bad_wp_pagedir;
 604         page_middle = pmd_offset(page_dir, address);
 605         if (pmd_none(*page_middle))
 606                 goto end_wp_page;
 607         if (pmd_bad(*page_middle))
 608                 goto bad_wp_pagemiddle;
 609         page_table = pte_offset(page_middle, address);
 610         pte = *page_table;
 611         if (!pte_present(pte))
 612                 goto end_wp_page;
 613         if (pte_write(pte))
 614                 goto end_wp_page;
 615         old_page = pte_page(pte);
 616         if (old_page >= high_memory)
 617                 goto bad_wp_page;
 618         tsk->min_flt++;
 619         /*
 620          * Do we need to copy?
 621          */
 622         if (mem_map[MAP_NR(old_page)].count != 1) {
 623                 if (new_page) {
 624                         if (mem_map[MAP_NR(old_page)].reserved)
 625                                 ++vma->vm_mm->rss;
 626                         copy_page(old_page,new_page);
 627                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 628                         free_page(old_page);
 629                         invalidate_page(vma, address);
 630                         return;
 631                 }
 632                 set_pte(page_table, BAD_PAGE);
 633                 free_page(old_page);
 634                 oom(tsk);
 635                 invalidate_page(vma, address);
 636                 return;
 637         }
 638         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 639         invalidate_page(vma, address);
 640         if (new_page)
 641                 free_page(new_page);
 642         return;
 643 bad_wp_page:
 644         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 645         send_sig(SIGKILL, tsk, 1);
 646         goto end_wp_page;
 647 bad_wp_pagemiddle:
 648         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 649         send_sig(SIGKILL, tsk, 1);
 650         goto end_wp_page;
 651 bad_wp_pagedir:
 652         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 653         send_sig(SIGKILL, tsk, 1);
 654 end_wp_page:
 655         if (new_page)
 656                 free_page(new_page);
 657         return;
 658 }
 659 
 660 /*
 661  * Ugly, ugly, but the goto's result in better assembly..
 662  */
 663 int verify_area(int type, const void * addr, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 664 {
 665         struct vm_area_struct * vma;
 666         unsigned long start = (unsigned long) addr;
 667 
 668         /* If the current user space is mapped to kernel space (for the
 669          * case where we use a fake user buffer with get_fs/set_fs()) we
 670          * don't expect to find the address in the user vm map.
 671          */
 672         if (get_fs() == get_ds())
 673                 return 0;
 674 
 675         vma = find_vma(current, start);
 676         if (!vma)
 677                 goto bad_area;
 678         if (vma->vm_start <= start)
 679                 goto good_area;
 680         if (!(vma->vm_flags & VM_GROWSDOWN))
 681                 goto bad_area;
 682         if (expand_stack(vma, start))
 683                 goto bad_area;
 684 
 685 good_area:
 686         if (type == VERIFY_WRITE)
 687                 goto check_write;
 688         for (;;) {
 689                 struct vm_area_struct * next;
 690                 if (!(vma->vm_flags & VM_READ))
 691                         goto bad_area;
 692                 if (vma->vm_end - start >= size)
 693                         return 0;
 694                 next = vma->vm_next;
 695                 if (!next || vma->vm_end != next->vm_start)
 696                         goto bad_area;
 697                 vma = next;
 698         }
 699 
 700 check_write:
 701         if (!(vma->vm_flags & VM_WRITE))
 702                 goto bad_area;
 703         if (!wp_works_ok)
 704                 goto check_wp_fault_by_hand;
 705         for (;;) {
 706                 if (vma->vm_end - start >= size)
 707                         break;
 708                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 709                         goto bad_area;
 710                 vma = vma->vm_next;
 711                 if (!(vma->vm_flags & VM_WRITE))
 712                         goto bad_area;
 713         }
 714         return 0;
 715 
 716 check_wp_fault_by_hand:
 717         size--;
 718         size += start & ~PAGE_MASK;
 719         size >>= PAGE_SHIFT;
 720         start &= PAGE_MASK;
 721 
 722         for (;;) {
 723                 do_wp_page(current, vma, start, 1);
 724                 if (!size)
 725                         break;
 726                 size--;
 727                 start += PAGE_SIZE;
 728                 if (start < vma->vm_end)
 729                         continue;
 730                 vma = vma->vm_next;
 731                 if (!vma || vma->vm_start != start)
 732                         goto bad_area;
 733                 if (!(vma->vm_flags & VM_WRITE))
 734                         goto bad_area;;
 735         }
 736         return 0;
 737 
 738 bad_area:
 739         return -EFAULT;
 740 }
 741 
 742 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
     /* [previous][next][first][last][top][bottom][index][help] */
 743 {
 744         unsigned long tmp;
 745 
 746         if (!(tmp = get_free_page(GFP_KERNEL))) {
 747                 oom(tsk);
 748                 put_page(page_table, BAD_PAGE);
 749                 return;
 750         }
 751         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 752 }
 753 
 754 /*
 755  * try_to_share() checks the page at address "address" in the task "p",
 756  * to see if it exists, and if it is clean. If so, share it with the current
 757  * task.
 758  *
 759  * NOTE! This assumes we have checked that p != current, and that they
 760  * share the same inode and can generally otherwise be shared.
 761  */
 762 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
     /* [previous][next][first][last][top][bottom][index][help] */
 763         unsigned long from_address, struct vm_area_struct * from_area,
 764         unsigned long newpage)
 765 {
 766         pgd_t * from_dir, * to_dir;
 767         pmd_t * from_middle, * to_middle;
 768         pte_t * from_table, * to_table;
 769         pte_t from, to;
 770 
 771         from_dir = pgd_offset(from_area->vm_mm,from_address);
 772 /* is there a page-directory at from? */
 773         if (pgd_none(*from_dir))
 774                 return 0;
 775         if (pgd_bad(*from_dir)) {
 776                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 777                 pgd_clear(from_dir);
 778                 return 0;
 779         }
 780         from_middle = pmd_offset(from_dir, from_address);
 781 /* is there a mid-directory at from? */
 782         if (pmd_none(*from_middle))
 783                 return 0;
 784         if (pmd_bad(*from_middle)) {
 785                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 786                 pmd_clear(from_middle);
 787                 return 0;
 788         }
 789         from_table = pte_offset(from_middle, from_address);
 790         from = *from_table;
 791 /* is the page present? */
 792         if (!pte_present(from))
 793                 return 0;
 794 /* if it is dirty it must be from a shared mapping to be shared */
 795         if (pte_dirty(from)) {
 796                 if (!(from_area->vm_flags & VM_SHARED))
 797                         return 0;
 798         }
 799 /* is the page reasonable at all? */
 800         if (pte_page(from) >= high_memory)
 801                 return 0;
 802         if (mem_map[MAP_NR(pte_page(from))].reserved)
 803                 return 0;
 804 /* is the destination ok? */
 805         to_dir = pgd_offset(to_area->vm_mm,to_address);
 806 /* is there a page-directory at to? */
 807         if (pgd_none(*to_dir))
 808                 return 0;
 809         if (pgd_bad(*to_dir)) {
 810                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 811                 return 0;
 812         }
 813         to_middle = pmd_offset(to_dir, to_address);
 814 /* is there a mid-directory at to? */
 815         if (pmd_none(*to_middle))
 816                 return 0;
 817         if (pmd_bad(*to_middle)) {
 818                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 819                 return 0;
 820         }
 821         to_table = pte_offset(to_middle, to_address);
 822         to = *to_table;
 823         if (!pte_none(to))
 824                 return 0;
 825 /* do we copy? */
 826         if (newpage) {
 827                 /* if it's in the swap cache, it's dirty by implication */
 828                 /* so we can't use it if it's not from a shared mapping */
 829                 if (in_swap_cache(pte_page(from))) {
 830                         if (!(from_area->vm_flags & VM_SHARED))
 831                                 return 0;
 832                 }
 833                 copy_page(pte_page(from), newpage);
 834                 set_pte(to_table, mk_pte(newpage, to_area->vm_page_prot));
 835                 return 1;
 836         }
 837 /*
 838  * do a final swap-cache test before sharing them: if it's in the swap
 839  * cache, we have to remove it now, as we get two pointers to the same
 840  * physical page and the cache can't handle it. Mark the original dirty.
 841  *
 842  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 843  * with a dirty "from", the from-mapping is a shared map, so we can trust
 844  * the page contents to be up-to-date
 845  */
 846         if (in_swap_cache(pte_page(from))) {
 847                 if (!(from_area->vm_flags & VM_SHARED))
 848                         return 0;
 849                 set_pte(from_table, pte_mkdirty(from));
 850                 delete_from_swap_cache(pte_page(from));
 851         }
 852         mem_map[MAP_NR(pte_page(from))].count++;
 853         set_pte(to_table, mk_pte(pte_page(from), to_area->vm_page_prot));
 854 /* Check if we need to do anything at all to the 'from' field */
 855         if (!pte_write(from))
 856                 return 1;
 857         if (from_area->vm_flags & VM_SHARED)
 858                 return 1;
 859 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 860         set_pte(from_table, pte_wrprotect(from));
 861         invalidate_page(from_area, from_address);
 862         return 1;
 863 }
 864 
 865 /*
 866  * share_page() tries to find a process that could share a page with
 867  * the current one.
 868  *
 869  * We first check if it is at all feasible by checking inode->i_count.
 870  * It should be >1 if there are other tasks sharing this inode.
 871  */
 872 static int share_page(struct vm_area_struct * area, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 873         int write_access, unsigned long newpage)
 874 {
 875         struct inode * inode;
 876         unsigned long offset;
 877         unsigned long from_address;
 878         unsigned long give_page;
 879         struct vm_area_struct * mpnt;
 880 
 881         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 882                 return 0;
 883         /* do we need to copy or can we just share? */
 884         give_page = 0;
 885         if (write_access && !(area->vm_flags & VM_SHARED)) {
 886                 if (!newpage)
 887                         return 0;
 888                 give_page = newpage;
 889         }
 890         offset = address - area->vm_start + area->vm_offset;
 891         /* See if there is something in the VM we can share pages with. */
 892         /* Traverse the entire circular i_mmap list, except `area' itself. */
 893         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 894                 /* must be same inode */
 895                 if (mpnt->vm_inode != inode) {
 896                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 897                         break;  
 898                 }
 899                 /* offsets must be mutually page-aligned */
 900                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 901                         continue;
 902                 /* the other area must actually cover the wanted page.. */
 903                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 904                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 905                         continue;
 906                 /* .. NOW we can actually try to use the same physical page */
 907                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 908                         continue;
 909                 /* free newpage if we never used it.. */
 910                 if (give_page || !newpage)
 911                         return 1;
 912                 free_page(newpage);
 913                 return 1;
 914         }
 915         return 0;
 916 }
 917 
 918 /*
 919  * This function zeroes out partial mmap'ed pages at truncation time..
 920  */
 921 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 922 {
 923         pgd_t *page_dir;
 924         pmd_t *page_middle;
 925         pte_t *page_table, pte;
 926 
 927         page_dir = pgd_offset(vma->vm_mm, address);
 928         if (pgd_none(*page_dir))
 929                 return;
 930         if (pgd_bad(*page_dir)) {
 931                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 932                 pgd_clear(page_dir);
 933                 return;
 934         }
 935         page_middle = pmd_offset(page_dir, address);
 936         if (pmd_none(*page_middle))
 937                 return;
 938         if (pmd_bad(*page_middle)) {
 939                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 940                 pmd_clear(page_middle);
 941                 return;
 942         }
 943         page_table = pte_offset(page_middle, address);
 944         pte = *page_table;
 945         if (!pte_present(pte))
 946                 return;
 947         address &= ~PAGE_MASK;
 948         address += pte_page(pte);
 949         if (address >= high_memory)
 950                 return;
 951         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 952 }
 953 
 954 /*
 955  * Handle all mappings that got truncated by a "truncate()"
 956  * system call.
 957  *
 958  * NOTE! We have to be ready to update the memory sharing
 959  * between the file and the memory map for a potential last
 960  * incomplete page.  Ugly, but necessary.
 961  */
 962 void vmtruncate(struct inode * inode, unsigned long offset)
     /* [previous][next][first][last][top][bottom][index][help] */
 963 {
 964         struct vm_area_struct * mpnt;
 965 
 966         invalidate_inode_pages(inode, offset);
 967         if (!inode->i_mmap)
 968                 return;
 969         mpnt = inode->i_mmap;
 970         do {
 971                 unsigned long start = mpnt->vm_start;
 972                 unsigned long len = mpnt->vm_end - start;
 973                 unsigned long diff;
 974 
 975                 /* mapping wholly truncated? */
 976                 if (mpnt->vm_offset >= offset) {
 977                         zap_page_range(mpnt->vm_mm, start, len);
 978                         continue;
 979                 }
 980                 /* mapping wholly unaffected? */
 981                 diff = offset - mpnt->vm_offset;
 982                 if (diff >= len)
 983                         continue;
 984                 /* Ok, partially affected.. */
 985                 start += diff;
 986                 len = (len - diff) & PAGE_MASK;
 987                 if (start & ~PAGE_MASK) {
 988                         partial_clear(mpnt, start);
 989                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 990                 }
 991                 zap_page_range(mpnt->vm_mm, start, len);
 992         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 993 }
 994 
 995 /*
 996  * fill in an empty page-table if none exists.
 997  */
 998 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 999 {
1000         pgd_t *pgd;
1001         pmd_t *pmd;
1002         pte_t *pte;
1003 
1004         pgd = pgd_offset(tsk->mm, address);
1005         pmd = pmd_alloc(pgd, address);
1006         if (!pmd) {
1007                 oom(tsk);
1008                 return NULL;
1009         }
1010         pte = pte_alloc(pmd, address);
1011         if (!pte) {
1012                 oom(tsk);
1013                 return NULL;
1014         }
1015         return pte;
1016 }
1017 
1018 static inline void do_swap_page(struct task_struct * tsk, 
     /* [previous][next][first][last][top][bottom][index][help] */
1019         struct vm_area_struct * vma, unsigned long address,
1020         pte_t * page_table, pte_t entry, int write_access)
1021 {
1022         pte_t page;
1023 
1024         if (!vma->vm_ops || !vma->vm_ops->swapin) {
1025                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
1026                 return;
1027         }
1028         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
1029         if (pte_val(*page_table) != pte_val(entry)) {
1030                 free_page(pte_page(page));
1031                 return;
1032         }
1033         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
1034                 page = pte_wrprotect(page);
1035         ++vma->vm_mm->rss;
1036         ++tsk->maj_flt;
1037         set_pte(page_table, page);
1038         return;
1039 }
1040 
1041 /*
1042  * do_no_page() tries to create a new page mapping. It aggressively
1043  * tries to share with existing pages, but makes a separate copy if
1044  * the "write_access" parameter is true in order to avoid the next
1045  * page fault.
1046  */
1047 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
1048         unsigned long address, int write_access)
1049 {
1050         pte_t * page_table;
1051         pte_t entry;
1052         unsigned long page;
1053 
1054         page_table = get_empty_pgtable(tsk, address);
1055         if (!page_table)
1056                 return;
1057         entry = *page_table;
1058         if (pte_present(entry))
1059                 return;
1060         if (!pte_none(entry)) {
1061                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
1062                 return;
1063         }
1064         address &= PAGE_MASK;
1065         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1066                 ++vma->vm_mm->rss;
1067                 ++tsk->min_flt;
1068                 get_empty_page(tsk, vma, page_table);
1069                 return;
1070         }
1071         page = __get_free_page(GFP_KERNEL);
1072         if (share_page(vma, address, write_access, page)) {
1073                 ++vma->vm_mm->rss;
1074                 ++tsk->min_flt;
1075                 return;
1076         }
1077         if (!page) {
1078                 oom(tsk);
1079                 put_page(page_table, BAD_PAGE);
1080                 return;
1081         }
1082         ++tsk->maj_flt;
1083         ++vma->vm_mm->rss;
1084         /*
1085          * The fourth argument is "no_share", which tells the low-level code
1086          * to copy, not share the page even if sharing is possible.  It's
1087          * essentially an early COW detection 
1088          */
1089         page = vma->vm_ops->nopage(vma, address, page,
1090                 write_access && !(vma->vm_flags & VM_SHARED));
1091         if (share_page(vma, address, write_access, 0)) {
1092                 free_page(page);
1093                 return;
1094         }
1095         /*
1096          * This silly early PAGE_DIRTY setting removes a race
1097          * due to the bad i386 page protection. But it's valid
1098          * for other architectures too.
1099          *
1100          * Note that if write_access is true, we either now have
1101          * a exclusive copy of the page, or this is a shared mapping,
1102          * so we can make it writable and dirty to avoid having to
1103          * handle that later.
1104          */
1105         entry = mk_pte(page, vma->vm_page_prot);
1106         if (write_access) {
1107                 entry = pte_mkwrite(pte_mkdirty(entry));
1108         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
1109                 entry = pte_wrprotect(entry);
1110         put_page(page_table, entry);
1111 }
1112 
1113 /*
1114  * The above separate functions for the no-page and wp-page
1115  * cases will go away (they mostly do the same thing anyway),
1116  * and we'll instead use only a general "handle_mm_fault()".
1117  *
1118  * These routines also need to handle stuff like marking pages dirty
1119  * and/or accessed for architectures that don't do it in hardware (most
1120  * RISC architectures).  The early dirtying is also good on the i386.
1121  *
1122  * There is also a hook called "update_mmu_cache()" that architectures
1123  * with external mmu caches can use to update those (ie the Sparc or
1124  * PowerPC hashed page tables that act as extended TLBs).
1125  */
1126 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
1127         int write_access, pte_t * pte)
1128 {
1129         if (!pte_present(*pte)) {
1130                 do_no_page(current, vma, address, write_access);
1131                 return;
1132         }
1133         set_pte(pte, pte_mkyoung(*pte));
1134         if (!write_access)
1135                 return;
1136         if (pte_write(*pte)) {
1137                 set_pte(pte, pte_mkdirty(*pte));
1138                 return;
1139         }
1140         do_wp_page(current, vma, address, write_access);
1141 }
1142 
1143 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
1144         int write_access)
1145 {
1146         pgd_t *pgd;
1147         pmd_t *pmd;
1148         pte_t *pte;
1149 
1150         pgd = pgd_offset(vma->vm_mm, address);
1151         pmd = pmd_alloc(pgd, address);
1152         if (!pmd)
1153                 goto no_memory;
1154         pte = pte_alloc(pmd, address);
1155         if (!pte)
1156                 goto no_memory;
1157         handle_pte_fault(vma, address, write_access, pte);
1158         update_mmu_cache(vma, address, *pte);
1159         return;
1160 no_memory:
1161         oom(current);
1162 }

/* [previous][next][first][last][top][bottom][index][help] */