root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. oom
  2. free_one_pmd
  3. free_one_pgd
  4. new_page_tables
  5. clear_page_tables
  6. free_page_tables
  7. copy_one_pte
  8. copy_pte_range
  9. copy_pmd_range
  10. copy_page_range
  11. forget_pte
  12. unmap_pte_range
  13. unmap_pmd_range
  14. zap_page_range
  15. unmap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. unshare
  30. vmtruncate
  31. get_empty_pgtable
  32. do_swap_page
  33. do_no_page
  34. handle_pte_fault
  35. handle_mm_fault

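Most of the routines below walk the three-level page-table tree with the
same pattern; roughly (a sketch using the pgtable helpers this file already
relies on, from <asm/pgtable.h>):

    pgd_t *pgd = pgd_offset(mm, address);    /* page-directory entry */
    pmd_t *pmd = pmd_alloc(pgd, address);    /* middle level, allocated on demand */
    pte_t *pte = pte_alloc(pmd, address);    /* page-table entry, allocated on demand */

pmd_alloc() and pte_alloc() return NULL on allocation failure, which the
callers below generally treat as out of memory (see handle_mm_fault()).
The read-only counterparts pmd_offset() and pte_offset() are used where the
tables are known to exist.
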
   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/config.h>
  37 #include <linux/signal.h>
  38 #include <linux/sched.h>
  39 #include <linux/head.h>
  40 #include <linux/kernel.h>
  41 #include <linux/errno.h>
  42 #include <linux/string.h>
  43 #include <linux/types.h>
  44 #include <linux/ptrace.h>
  45 #include <linux/mman.h>
  46 #include <linux/mm.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 
  52 unsigned long high_memory = 0;
  53 
  54 /*
  55  * The free_area_list arrays point to the queue heads of the free areas
  56  * of different sizes
  57  */
  58 int nr_swap_pages = 0;
  59 int nr_free_pages = 0;
  60 struct mem_list free_area_list[NR_MEM_LISTS];
  61 unsigned char * free_area_map[NR_MEM_LISTS];
  62 
  63 #define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
  64 
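     /*
      * Number of page-directory entries that map user space.  Entries from
      * USER_PTRS_PER_PGD up map the kernel, and are copied from the initial
      * page tables in new_page_tables() so that kernel mappings are shared
      * by every process.
      */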
  65 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  66 
  67 mem_map_t * mem_map = NULL;
  68 
  69 /*
  70  * oom() prints a message (so that the user knows why the process died),
  71  * and gives the process an untrappable SIGKILL.
  72  */
  73 void oom(struct task_struct * task)
  74 {
  75         printk("\nOut of memory for %s.\n", task->comm);
  76         task->sig->action[SIGKILL-1].sa_handler = NULL;
  77         task->blocked &= ~(1<<(SIGKILL-1));
  78         send_sig(SIGKILL,task,1);
  79 }
  80 
  81 /*
  82  * Note: this doesn't free the actual pages themselves. That
  83  * has been handled earlier when unmapping all the memory regions.
  84  */
  85 static inline void free_one_pmd(pmd_t * dir)
  86 {
  87         pte_t * pte;
  88 
  89         if (pmd_none(*dir))
  90                 return;
  91         if (pmd_bad(*dir)) {
  92                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  93                 pmd_clear(dir);
  94                 return;
  95         }
  96         pte = pte_offset(dir, 0);
  97         pmd_clear(dir);
  98         pte_free(pte);
  99 }
 100 
 101 static inline void free_one_pgd(pgd_t * dir)
 102 {
 103         pmd_t * pmd;
 104 
 105         if (pgd_none(*dir))
 106                 return;
 107         if (pgd_bad(*dir)) {
 108                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 109                 pgd_clear(dir);
 110                 return;
 111         }
 112         pmd = pmd_offset(dir, 0);
 113         pgd_clear(dir);
 114         if (!pmd_inuse(pmd)) {
 115                 int j;
 116                 for (j = 0; j < PTRS_PER_PMD ; j++)
 117                         free_one_pmd(pmd+j);
 118         }
 119         pmd_free(pmd);
 120 }
 121         
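     /*
      * Give a task a fresh page directory: the user entries start out
      * empty, the kernel entries are taken from the initial page tables,
      * and the task is switched over to the new directory.
      */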
 122 int new_page_tables(struct task_struct * tsk)
 123 {
 124         pgd_t * page_dir, * new_pg;
 125         int i;
 126 
 127         if (!(new_pg = pgd_alloc()))
 128                 return -ENOMEM;
 129         page_dir = pgd_offset(&init_mm, 0);
 130         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 131                 new_pg[i] = page_dir[i];
 132         SET_PAGE_DIR(tsk, new_pg);
 133         tsk->mm->pgd = new_pg;
 134         return 0;
 135 }
 136 
 137 /*
 138  * This function clears all user-level page tables of a process - this
 139  * is needed by execve(), so that old pages aren't in the way. Note that
 140  * unlike 'free_page_tables()', this function still leaves a valid
 141  * page-table-tree in memory: it just removes the user pages. The two
 142  * functions are similar, but there is a fundamental difference.
 143  */
 144 void clear_page_tables(struct task_struct * tsk)
 145 {
 146         int i;
 147         pgd_t * page_dir;
 148 
 149         if (!tsk)
 150                 return;
 151         if (tsk == task[0])
 152                 panic("task[0] (swapper) doesn't support exec()\n");
 153         page_dir = pgd_offset(tsk->mm, 0);
 154         if (!page_dir) {
 155                 printk("%s trying to clear NULL page-directory: not good\n", tsk->comm);
 156                 return;
 157         }
 158         if (pgd_inuse(page_dir)) {
 159                 if (new_page_tables(tsk))
 160                         oom(tsk);
 161                 pgd_free(page_dir);
 162                 return;
 163         }
 164         if (page_dir == swapper_pg_dir) {
 165                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 166                 return;
 167         }
 168         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 169                 free_one_pgd(page_dir + i);
 170         invalidate();
 171         return;
 172 }
 173 
 174 /*
 175  * This function frees up all page tables of a process when it exits.
 176  */
 177 void free_page_tables(struct task_struct * tsk)
 178 {
 179         int i;
 180         pgd_t * page_dir;
 181 
 182         page_dir = tsk->mm->pgd;
 183         if (!page_dir || page_dir == swapper_pg_dir) {
 184                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 185                 return;
 186         }
 187         SET_PAGE_DIR(tsk, swapper_pg_dir);
 188         if (pgd_inuse(page_dir)) {
 189                 pgd_free(page_dir);
 190                 return;
 191         }
 192         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 193         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 194                 free_one_pgd(page_dir + i);
 195         pgd_free(page_dir);
 196         invalidate();
 197 }
 198 
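     /*
      * Copy a single pte from parent to child at fork time.  Swap entries
      * just get their use count bumped with swap_duplicate().  Present
      * pages are shared rather than copied: the page's mem_map count is
      * incremented, and a copy-on-write page (pte_cow()) is write-protected
      * in both mappings, so the first write by either side faults into
      * do_wp_page() and gets a private copy there.  A page still in the
      * swap cache is taken out of it and marked dirty, as the cache cannot
      * cope with a shared page; the child's pte is marked old so that page
      * aging starts afresh.
      */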
 199 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 200 {
 201         pte_t pte = *old_pte;
 202 
 203         if (pte_none(pte))
 204                 return;
 205         if (!pte_present(pte)) {
 206                 swap_duplicate(pte_val(pte));
 207                 set_pte(new_pte, pte);
 208                 return;
 209         }
 210         if (pte_page(pte) > high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 211                 set_pte(new_pte, pte);
 212                 return;
 213         }
 214         if (pte_cow(pte))
 215                 pte = pte_wrprotect(pte);
 216         if (delete_from_swap_cache(pte_page(pte)))
 217                 pte = pte_mkdirty(pte);
 218         set_pte(new_pte, pte_mkold(pte));
 219         set_pte(old_pte, pte);
 220         mem_map[MAP_NR(pte_page(pte))].count++;
 221 }
 222 
 223 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size)
 224 {
 225         pte_t * src_pte, * dst_pte;
 226         unsigned long end;
 227 
 228         if (pmd_none(*src_pmd))
 229                 return 0;
 230         if (pmd_bad(*src_pmd)) {
 231                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 232                 pmd_clear(src_pmd);
 233                 return 0;
 234         }
 235         src_pte = pte_offset(src_pmd, address);
 236         if (pmd_none(*dst_pmd)) {
 237                 if (!pte_alloc(dst_pmd, 0))
 238                         return -ENOMEM;
 239         }
 240         dst_pte = pte_offset(dst_pmd, address);
 241         address &= ~PMD_MASK;
 242         end = address + size;
 243         if (end >= PMD_SIZE)
 244                 end = PMD_SIZE;
 245         do {
 246                 /* I would like to switch arguments here, to make it
 247                  * consistent with copy_xxx_range and memcpy syntax.
 248                  */
 249                 copy_one_pte(src_pte++, dst_pte++);
 250                 address += PAGE_SIZE;
 251         } while (address < end);
 252         return 0;
 253 }
 254 
 255 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size)
 256 {
 257         pmd_t * src_pmd, * dst_pmd;
 258         unsigned long end;
 259         int error = 0;
 260 
 261         if (pgd_none(*src_pgd))
 262                 return 0;
 263         if (pgd_bad(*src_pgd)) {
 264                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 265                 pgd_clear(src_pgd);
 266                 return 0;
 267         }
 268         src_pmd = pmd_offset(src_pgd, address);
 269         if (pgd_none(*dst_pgd)) {
 270                 if (!pmd_alloc(dst_pgd, 0))
 271                         return -ENOMEM;
 272         }
 273         dst_pmd = pmd_offset(dst_pgd, address);
 274         address &= ~PGDIR_MASK;
 275         end = address + size;
 276         if (end > PGDIR_SIZE)
 277                 end = PGDIR_SIZE;
 278         do {
 279                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address);
 280                 if (error)
 281                         break;
 282                 address = (address + PMD_SIZE) & PMD_MASK; 
 283         } while (address < end);
 284         return error;
 285 }
 286 
 287 /*
 288  * Copy one vm_area from one task to the other. Assumes that any page
 289  * tables already present in the new task are cleared over the whole
 290  * range covered by this vma.
 291  */
 292 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 293                         struct vm_area_struct *vma)
 294 {
 295         pgd_t * src_pgd, * dst_pgd;
 296         unsigned long address = vma->vm_start;
 297         unsigned long end = vma->vm_end;
 298         int error = 0;
 299 
 300         src_pgd = pgd_offset(src, address);
 301         dst_pgd = pgd_offset(dst, address);
 302         while (address < end) {
 303                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address);
 304                 if (error)
 305                         break;
 306                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 307         }
 308         invalidate();
 309         return error;
 310 }
 311 
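     /*
      * Drop whatever a pte used to point to: a present page is freed (and
      * the rss count decremented, unless the page is reserved), a swap
      * entry has its reference released with swap_free().
      */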
 312 static inline void forget_pte(pte_t page)
 313 {
 314         if (pte_none(page))
 315                 return;
 316         if (pte_present(page)) {
 317                 free_page(pte_page(page));
 318                 if (mem_map[MAP_NR(pte_page(page))].reserved)
 319                         return;
 320                 if (current->mm->rss <= 0)
 321                         return;
 322                 current->mm->rss--;
 323                 return;
 324         }
 325         swap_free(pte_val(page));
 326 }
 327 
 328 static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 329 {
 330         pte_t * pte;
 331         unsigned long end;
 332 
 333         if (pmd_none(*pmd))
 334                 return;
 335         if (pmd_bad(*pmd)) {
 336                 printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 337                 pmd_clear(pmd);
 338                 return;
 339         }
 340         pte = pte_offset(pmd, address);
 341         address &= ~PMD_MASK;
 342         end = address + size;
 343         if (end >= PMD_SIZE)
 344                 end = PMD_SIZE;
 345         do {
 346                 pte_t page = *pte;
 347                 pte_clear(pte);
 348                 forget_pte(page);
 349                 address += PAGE_SIZE;
 350                 pte++;
 351         } while (address < end);
 352 }
 353 
 354 static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 355 {
 356         pmd_t * pmd;
 357         unsigned long end;
 358 
 359         if (pgd_none(*dir))
 360                 return;
 361         if (pgd_bad(*dir)) {
 362                 printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 363                 pgd_clear(dir);
 364                 return;
 365         }
 366         pmd = pmd_offset(dir, address);
 367         address &= ~PGDIR_MASK;
 368         end = address + size;
 369         if (end > PGDIR_SIZE)
 370                 end = PGDIR_SIZE;
 371         do {
 372                 unmap_pte_range(pmd, address, end - address);
 373                 address = (address + PMD_SIZE) & PMD_MASK; 
 374                 pmd++;
 375         } while (address < end);
 376 }
 377 
 378 /*
 379  * remove user pages in a given range.
 380  */
 381 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 382 {
 383         pgd_t * dir;
 384         unsigned long end = address + size;
 385 
 386         dir = pgd_offset(mm, address);
 387         while (address < end) {
 388                 unmap_pmd_range(dir, address, end - address);
 389                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 390                 dir++;
 391         }
 392         invalidate();
 393         return 0;
 394 }
 395 
 396 /*
 397  * A more complete version of free_page_tables(), working with page
 398  * granularity.
 399  */
 400 int unmap_page_range(unsigned long address, unsigned long size)
 401 {
 402         return zap_page_range(current->mm, address, size);
 403 }
 404 
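     /*
      * The zeromap functions point every pte in the range at the global
      * ZERO_PAGE, write-protected: reads share a single page of zeroes,
      * and the first write to any page takes a protection fault and is
      * given a private page by do_wp_page().
      */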
 405 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 406 {
 407         unsigned long end;
 408 
 409         address &= ~PMD_MASK;
 410         end = address + size;
 411         if (end > PMD_SIZE)
 412                 end = PMD_SIZE;
 413         do {
 414                 pte_t oldpage = *pte;
 415                 set_pte(pte, zero_pte);
 416                 forget_pte(oldpage);
 417                 address += PAGE_SIZE;
 418                 pte++;
 419         } while (address < end);
 420 }
 421 
 422 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 423 {
 424         unsigned long end;
 425 
 426         address &= ~PGDIR_MASK;
 427         end = address + size;
 428         if (end > PGDIR_SIZE)
 429                 end = PGDIR_SIZE;
 430         do {
 431                 pte_t * pte = pte_alloc(pmd, address);
 432                 if (!pte)
 433                         return -ENOMEM;
 434                 zeromap_pte_range(pte, address, end - address, zero_pte);
 435                 address = (address + PMD_SIZE) & PMD_MASK;
 436                 pmd++;
 437         } while (address < end);
 438         return 0;
 439 }
 440 
 441 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 442 {
 443         int error = 0;
 444         pgd_t * dir;
 445         unsigned long end = address + size;
 446         pte_t zero_pte;
 447 
 448         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 449         dir = pgd_offset(current->mm, address);
 450         while (address < end) {
 451                 pmd_t *pmd = pmd_alloc(dir, address);
 452                 error = -ENOMEM;
 453                 if (!pmd)
 454                         break;
 455                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 456                 if (error)
 457                         break;
 458                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 459                 dir++;
 460         }
 461         invalidate();
 462         return error;
 463 }
 464 
 465 /*
 466  * Maps a range of physical memory into the requested pages. The old
 467  * mappings are removed. Any references to nonexistent pages result
 468  * in null mappings (currently treated as "copy-on-access").
 469  */
 470 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 471         unsigned long offset, pgprot_t prot)
 472 {
 473         unsigned long end;
 474 
 475         address &= ~PMD_MASK;
 476         end = address + size;
 477         if (end > PMD_SIZE)
 478                 end = PMD_SIZE;
 479         do {
 480                 pte_t oldpage = *pte;
 481                 pte_clear(pte);
 482                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 483                         set_pte(pte, mk_pte(offset, prot));
 484                 forget_pte(oldpage);
 485                 address += PAGE_SIZE;
 486                 offset += PAGE_SIZE;
 487                 pte++;
 488         } while (address < end);
 489 }
 490 
 491 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 492         unsigned long offset, pgprot_t prot)
 493 {
 494         unsigned long end;
 495 
 496         address &= ~PGDIR_MASK;
 497         end = address + size;
 498         if (end > PGDIR_SIZE)
 499                 end = PGDIR_SIZE;
 500         offset -= address;
 501         do {
 502                 pte_t * pte = pte_alloc(pmd, address);
 503                 if (!pte)
 504                         return -ENOMEM;
 505                 remap_pte_range(pte, address, end - address, address + offset, prot);
 506                 address = (address + PMD_SIZE) & PMD_MASK;
 507                 pmd++;
 508         } while (address < end);
 509         return 0;
 510 }
 511 
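     /*
      * Map the physical range starting at 'offset' into the virtual range
      * starting at 'from'.  Typically used from a device driver's mmap()
      * routine, roughly like this (a sketch, with 'physical_address'
      * standing in for whatever device address the driver exposes):
      *
      *     if (remap_page_range(vma->vm_start, physical_address,
      *                          vma->vm_end - vma->vm_start,
      *                          vma->vm_page_prot))
      *             return -EAGAIN;
      */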
 512 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 513 {
 514         int error = 0;
 515         pgd_t * dir;
 516         unsigned long end = from + size;
 517 
 518         offset -= from;
 519         dir = pgd_offset(current->mm, from);
 520         while (from < end) {
 521                 pmd_t *pmd = pmd_alloc(dir, from);
 522                 error = -ENOMEM;
 523                 if (!pmd)
 524                         break;
 525                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 526                 if (error)
 527                         break;
 528                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 529                 dir++;
 530         }
 531         invalidate();
 532         return error;
 533 }
 534 
 535 /*
 536  * sanity-check function..
 537  */
 538 static void put_page(pte_t * page_table, pte_t pte)
 539 {
 540         if (!pte_none(*page_table)) {
 541                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 542                 free_page(pte_page(pte));
 543                 return;
 544         }
 545 /* no need for invalidate */
 546         *page_table = pte;
 547 }
 548 
 549 /*
 550  * This routine is used to map in a page into an address space: needed by
 551  * execve() for the initial stack and environment pages.
 552  */
 553 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 554 {
 555         pgd_t * pgd;
 556         pmd_t * pmd;
 557         pte_t * pte;
 558 
 559         if (page >= high_memory)
 560                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 561         if (mem_map[MAP_NR(page)].count != 1)
 562                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 563         pgd = pgd_offset(tsk->mm,address);
 564         pmd = pmd_alloc(pgd, address);
 565         if (!pmd) {
 566                 free_page(page);
 567                 oom(tsk);
 568                 return 0;
 569         }
 570         pte = pte_alloc(pmd, address);
 571         if (!pte) {
 572                 free_page(page);
 573                 oom(tsk);
 574                 return 0;
 575         }
 576         if (!pte_none(*pte)) {
 577                 printk("put_dirty_page: page already exists\n");
 578                 pte_clear(pte);
 579                 invalidate();
 580         }
 581         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 582 /* no need for invalidate */
 583         return page;
 584 }
 585 
 586 /*
 587  * This routine handles present pages, when users try to write
 588  * to a shared page. It is done by copying the page to a new address
 589  * and decrementing the shared-page counter for the old page.
 590  *
 591  * Goto-purists beware: the only reason for goto's here is that it results
 592  * in better assembly code.. The "default" path will see no jumps at all.
 593  *
 594  * Note that this routine assumes that the protection checks have been
 595  * done by the caller (the low-level page fault routine in most cases).
 596  * Thus we can safely just mark it writable once we've done any necessary
 597  * COW.
 598  *
 599  * We also mark the page dirty at this point even though the page will
 600  * change only once the write actually happens. This avoids a few races,
 601  * and potentially makes it more efficient.
 602  */
 603 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 604         unsigned long address, int write_access)
 605 {
 606         pgd_t *page_dir;
 607         pmd_t *page_middle;
 608         pte_t *page_table, pte;
 609         unsigned long old_page, new_page;
 610 
 611         new_page = __get_free_page(GFP_KERNEL);
 612         page_dir = pgd_offset(vma->vm_mm, address);
 613         if (pgd_none(*page_dir))
 614                 goto end_wp_page;
 615         if (pgd_bad(*page_dir))
 616                 goto bad_wp_pagedir;
 617         page_middle = pmd_offset(page_dir, address);
 618         if (pmd_none(*page_middle))
 619                 goto end_wp_page;
 620         if (pmd_bad(*page_middle))
 621                 goto bad_wp_pagemiddle;
 622         page_table = pte_offset(page_middle, address);
 623         pte = *page_table;
 624         if (!pte_present(pte))
 625                 goto end_wp_page;
 626         if (pte_write(pte))
 627                 goto end_wp_page;
 628         old_page = pte_page(pte);
 629         if (old_page >= high_memory)
 630                 goto bad_wp_page;
 631         tsk->min_flt++;
 632         /*
 633          * Do we need to copy?
 634          */
 635         if (mem_map[MAP_NR(old_page)].count != 1) {
 636                 if (new_page) {
 637                         if (mem_map[MAP_NR(old_page)].reserved)
 638                                 ++vma->vm_mm->rss;
 639                         copy_page(old_page,new_page);
 640                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 641                         free_page(old_page);
 642                         invalidate();
 643                         return;
 644                 }
 645                 set_pte(page_table, BAD_PAGE);
 646                 free_page(old_page);
 647                 oom(tsk);
 648                 invalidate();
 649                 return;
 650         }
 651         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 652         invalidate();
 653         if (new_page)
 654                 free_page(new_page);
 655         return;
 656 bad_wp_page:
 657         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 658         send_sig(SIGKILL, tsk, 1);
 659         goto end_wp_page;
 660 bad_wp_pagemiddle:
 661         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 662         send_sig(SIGKILL, tsk, 1);
 663         goto end_wp_page;
 664 bad_wp_pagedir:
 665         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 666         send_sig(SIGKILL, tsk, 1);
 667 end_wp_page:
 668         if (new_page)
 669                 free_page(new_page);
 670         return;
 671 }
 672 
 673 /*
 674  * Ugly, ugly, but the goto's result in better assembly..
 675  */
 676 int verify_area(int type, const void * addr, unsigned long size)
 677 {
 678         struct vm_area_struct * vma;
 679         unsigned long start = (unsigned long) addr;
 680 
 681         /* If the current user space is mapped to kernel space (for the
 682          * case where we use a fake user buffer with get_fs/set_fs()) we
 683          * don't expect to find the address in the user vm map.
 684          */
 685         if (get_fs() == get_ds())
 686                 return 0;
 687 
 688         vma = find_vma(current, start);
 689         if (!vma)
 690                 goto bad_area;
 691         if (vma->vm_start <= start)
 692                 goto good_area;
 693         if (!(vma->vm_flags & VM_GROWSDOWN))
 694                 goto bad_area;
 695         if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
 696                 goto bad_area;
 697 
 698 good_area:
 699         if (type == VERIFY_WRITE)
 700                 goto check_write;
 701         for (;;) {
 702                 struct vm_area_struct * next;
 703                 if (!(vma->vm_flags & VM_READ))
 704                         goto bad_area;
 705                 if (vma->vm_end - start >= size)
 706                         return 0;
 707                 next = vma->vm_next;
 708                 if (!next || vma->vm_end != next->vm_start)
 709                         goto bad_area;
 710                 vma = next;
 711         }
 712 
 713 check_write:
 714         if (!(vma->vm_flags & VM_WRITE))
 715                 goto bad_area;
 716         if (!wp_works_ok)
 717                 goto check_wp_fault_by_hand;
 718         for (;;) {
 719                 if (vma->vm_end - start >= size)
 720                         break;
 721                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 722                         goto bad_area;
 723                 vma = vma->vm_next;
 724                 if (!(vma->vm_flags & VM_WRITE))
 725                         goto bad_area;
 726         }
 727         return 0;
 728 
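     /*
      * On processors that do not honour write-protection while running in
      * kernel mode (wp_works_ok is false, e.g. the 80386), checking the
      * VM_WRITE flags is not enough: simulate the write faults by hand,
      * one page at a time, so that copy-on-write still happens before the
      * kernel writes into the page.
      */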
 729 check_wp_fault_by_hand:
 730         size--;
 731         size += start & ~PAGE_MASK;
 732         size >>= PAGE_SHIFT;
 733         start &= PAGE_MASK;
 734 
 735         for (;;) {
 736                 do_wp_page(current, vma, start, 1);
 737                 if (!size)
 738                         break;
 739                 size--;
 740                 start += PAGE_SIZE;
 741                 if (start < vma->vm_end)
 742                         continue;
 743                 vma = vma->vm_next;
 744                 if (!vma || vma->vm_start != start)
 745                         goto bad_area;
 746                 if (!(vma->vm_flags & VM_WRITE))
 747                         goto bad_area;
 748         }
 749         return 0;
 750 
 751 bad_area:
 752         return -EFAULT;
 753 }
 754 
 755 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 756 {
 757         unsigned long tmp;
 758 
 759         if (!(tmp = get_free_page(GFP_KERNEL))) {
 760                 oom(tsk);
 761                 put_page(page_table, BAD_PAGE);
 762                 return;
 763         }
 764         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 765 }
 766 
 767 /*
 768  * try_to_share() checks the page at address "address" in the task "p",
 769  * to see if it exists, and if it is clean. If so, share it with the current
 770  * task.
 771  *
 772  * NOTE! This assumes we have checked that p != current, and that they
 773  * share the same inode and can generally otherwise be shared.
 774  */
 775 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 776         unsigned long from_address, struct vm_area_struct * from_area,
 777         unsigned long newpage)
 778 {
 779         pgd_t * from_dir, * to_dir;
 780         pmd_t * from_middle, * to_middle;
 781         pte_t * from_table, * to_table;
 782         pte_t from, to;
 783 
 784         from_dir = pgd_offset(from_area->vm_mm,from_address);
 785 /* is there a page-directory at from? */
 786         if (pgd_none(*from_dir))
 787                 return 0;
 788         if (pgd_bad(*from_dir)) {
 789                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 790                 pgd_clear(from_dir);
 791                 return 0;
 792         }
 793         from_middle = pmd_offset(from_dir, from_address);
 794 /* is there a mid-directory at from? */
 795         if (pmd_none(*from_middle))
 796                 return 0;
 797         if (pmd_bad(*from_middle)) {
 798                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 799                 pmd_clear(from_middle);
 800                 return 0;
 801         }
 802         from_table = pte_offset(from_middle, from_address);
 803         from = *from_table;
 804 /* is the page present? */
 805         if (!pte_present(from))
 806                 return 0;
 807 /* if it is dirty it must be from a shared mapping to be shared */
 808         if (pte_dirty(from)) {
 809                 if (!(from_area->vm_flags & VM_SHARED))
 810                         return 0;
 811         }
 812 /* is the page reasonable at all? */
 813         if (pte_page(from) >= high_memory)
 814                 return 0;
 815         if (mem_map[MAP_NR(pte_page(from))].reserved)
 816                 return 0;
 817 /* is the destination ok? */
 818         to_dir = pgd_offset(to_area->vm_mm,to_address);
 819 /* is there a page-directory at to? */
 820         if (pgd_none(*to_dir))
 821                 return 0;
 822         if (pgd_bad(*to_dir)) {
 823                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 824                 return 0;
 825         }
 826         to_middle = pmd_offset(to_dir, to_address);
 827 /* is there a mid-directory at to? */
 828         if (pmd_none(*to_middle))
 829                 return 0;
 830         if (pmd_bad(*to_middle)) {
 831                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 832                 return 0;
 833         }
 834         to_table = pte_offset(to_middle, to_address);
 835         to = *to_table;
 836         if (!pte_none(to))
 837                 return 0;
 838 /* do we copy? */
 839         if (newpage) {
 840                 /* if it's in the swap cache, it's dirty by implication */
 841                 /* so we can't use it if it's not from a shared mapping */
 842                 if (in_swap_cache(pte_page(from))) {
 843                         if (!(from_area->vm_flags & VM_SHARED))
 844                                 return 0;
 845                 }
 846                 copy_page(pte_page(from), newpage);
 847                 set_pte(to_table, mk_pte(newpage, to_area->vm_page_prot));
 848                 return 1;
 849         }
 850 /*
 851  * do a final swap-cache test before sharing them: if it's in the swap
 852  * cache, we have to remove it now, as we get two pointers to the same
 853  * physical page and the cache can't handle it. Mark the original dirty.
 854  *
 855  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 856  * with a dirty "from", the from-mapping is a shared map, so we can trust
 857  * the page contents to be up-to-date
 858  */
 859         if (in_swap_cache(pte_page(from))) {
 860                 if (!(from_area->vm_flags & VM_SHARED))
 861                         return 0;
 862                 set_pte(from_table, pte_mkdirty(from));
 863                 delete_from_swap_cache(pte_page(from));
 864         }
 865         mem_map[MAP_NR(pte_page(from))].count++;
 866         set_pte(to_table, mk_pte(pte_page(from), to_area->vm_page_prot));
 867 /* Check if we need to do anything at all to the 'from' field */
 868         if (!pte_write(from))
 869                 return 1;
 870         if (from_area->vm_flags & VM_SHARED)
 871                 return 1;
 872 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 873         set_pte(from_table, pte_wrprotect(from));
 874         invalidate();
 875         return 1;
 876 }
 877 
 878 /*
 879  * share_page() tries to find a process that could share a page with
 880  * the current one.
 881  *
 882  * We first check if it is at all feasible by checking inode->i_count.
 883  * It should be >1 if there are other tasks sharing this inode.
 884  */
 885 static int share_page(struct vm_area_struct * area, unsigned long address,
 886         int write_access, unsigned long newpage)
 887 {
 888         struct inode * inode;
 889         unsigned long offset;
 890         unsigned long from_address;
 891         unsigned long give_page;
 892         struct vm_area_struct * mpnt;
 893 
 894         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 895                 return 0;
 896         /* do we need to copy or can we just share? */
 897         give_page = 0;
 898         if (write_access && !(area->vm_flags & VM_SHARED)) {
 899                 if (!newpage)
 900                         return 0;
 901                 give_page = newpage;
 902         }
 903         offset = address - area->vm_start + area->vm_offset;
 904         /* See if there is something in the VM we can share pages with. */
 905         /* Traverse the entire circular i_mmap list, except `area' itself. */
 906         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 907                 /* must be same inode */
 908                 if (mpnt->vm_inode != inode) {
 909                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 910                         break;  
 911                 }
 912                 /* offsets must be mutually page-aligned */
 913                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 914                         continue;
 915                 /* the other area must actually cover the wanted page.. */
 916                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 917                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 918                         continue;
 919                 /* .. NOW we can actually try to use the same physical page */
 920                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 921                         continue;
 922                 /* free newpage if we never used it.. */
 923                 if (give_page || !newpage)
 924                         return 1;
 925                 free_page(newpage);
 926                 return 1;
 927         }
 928         return 0;
 929 }
 930 
 931 /*
 932  * This function tries to find a page that is shared with the buffer cache,
 933  * and if so it moves the buffer cache to a new location.
 934  *
 935  * It returns non-zero if we used up the "new_page" page.
 936  */
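     /*
      * Called by vmtruncate() for a final, partially-truncated page: the
      * tail of the page beyond the new file size is cleared, and if the
      * same physical page is also in use by the buffer cache, the buffers
      * are moved onto the fresh "new_page" so that the mapping and the
      * cache no longer share one page.
      */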
 937 static int unshare(struct vm_area_struct *vma, unsigned long address, unsigned long new_page)
 938 {
 939         pgd_t *page_dir;
 940         pmd_t *page_middle;
 941         pte_t *page_table, pte;
 942         unsigned long old_page;
 943         struct buffer_head * bh, * tmp;
 944 
 945         page_dir = pgd_offset(vma->vm_mm, address);
 946         if (pgd_none(*page_dir))
 947                 return 0;
 948         if (pgd_bad(*page_dir)) {
 949                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 950                 pgd_clear(page_dir);
 951                 return 0;
 952         }
 953         page_middle = pmd_offset(page_dir, address);
 954         if (pmd_none(*page_middle))
 955                 return 0;
 956         if (pmd_bad(*page_middle)) {
 957                 printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 958                 pmd_clear(page_middle);
 959                 return 0;
 960         }
 961         page_table = pte_offset(page_middle, address);
 962         pte = *page_table;
 963         if (!pte_present(pte))
 964                 return 0;
 965         old_page = pte_page(pte);
 966         if (MAP_NR(old_page) > MAP_NR(high_memory))
 967                 return 0;
 968         address &= ~PAGE_MASK;
 969         memset((void *) (old_page + address), 0, PAGE_SIZE - address);
 970         bh = buffer_pages[MAP_NR(old_page)];
 971         if (!bh)
 972                 return 0;
 973         if (!new_page) {
 974                 printk("Aieee... unshare(): no page available\n");
 975                 return 0;
 976         }
 977         buffer_pages[MAP_NR(old_page)] = NULL;
 978         copy_page(old_page, new_page);
 979         free_page(old_page);
 980         old_page -= new_page;
 981         buffer_pages[MAP_NR(new_page)] = bh;
 982         tmp = bh;
 983         do {
 984                 tmp->b_data -= old_page;
 985                 tmp = tmp->b_this_page;
 986         } while (tmp != bh);
 987         return 1;
 988 }
 989 
 990 /*
 991  * Handle all mappings that got truncated by a "truncate()"
 992  * system call.
 993  *
 994  * NOTE! We have to be ready to update the memory sharing
 995  * between the file and the memory map for a potential last
 996  * incomplete page.  Ugly, but necessary.
 997  */
 998 void vmtruncate(struct inode * inode, unsigned long offset)
 999 {
1000         unsigned long page;
1001         struct vm_area_struct * mpnt;
1002 
1003         if (!inode->i_mmap)
1004                 return;
1005         page = __get_free_page(GFP_KERNEL);
1006         mpnt = inode->i_mmap;
1007         if (!mpnt) {
1008                 free_page(page);
1009                 return;
1010         }
1011         do {
1012                 unsigned long start = mpnt->vm_start;
1013                 unsigned long len = mpnt->vm_end - start;
1014                 unsigned long diff;
1015 
1016                 /* mapping wholly truncated? */
1017                 if (mpnt->vm_offset >= offset) {
1018                         zap_page_range(mpnt->vm_mm, start, len);
1019                         continue;
1020                 }
1021                 /* mapping wholly unaffected? */
1022                 diff = offset - mpnt->vm_offset;
1023                 if (diff >= len)
1024                         continue;
1025                 /* Ok, partially affected.. */
1026                 start += diff;
1027                 len = (len - diff) & PAGE_MASK;
1028                 /* Ugh, here comes the _really_ ugly part.. */
1029                 if (start & ~PAGE_MASK) {
1030                         if (unshare(mpnt, start, page))
1031                                 page = 0;
1032                         start = (start + ~PAGE_MASK) & PAGE_MASK;
1033                 }
1034                 zap_page_range(mpnt->vm_mm, start, len);
1035         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
1036         free_page(page);
1037 }
1038 
1039 /*
1040  * fill in an empty page-table if none exists.
1041  */
1042 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
1043 {
1044         pgd_t *pgd;
1045         pmd_t *pmd;
1046         pte_t *pte;
1047 
1048         pgd = pgd_offset(tsk->mm, address);
1049         pmd = pmd_alloc(pgd, address);
1050         if (!pmd) {
1051                 oom(tsk);
1052                 return NULL;
1053         }
1054         pte = pte_alloc(pmd, address);
1055         if (!pte) {
1056                 oom(tsk);
1057                 return NULL;
1058         }
1059         return pte;
1060 }
1061 
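     /*
      * Bring a swapped-out page back in.  If the vma has no swapin
      * operation, the generic swap_in() does all the work; otherwise the
      * vma's swapin() supplies the page.  The pte is re-checked against
      * the original entry before the page is installed, in case it changed
      * while we slept, and a page that is shared but privately mapped is
      * installed write-protected so that writes still go through
      * do_wp_page().
      */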
1062 static inline void do_swap_page(struct task_struct * tsk, 
1063         struct vm_area_struct * vma, unsigned long address,
1064         pte_t * page_table, pte_t entry, int write_access)
1065 {
1066         pte_t page;
1067 
1068         if (!vma->vm_ops || !vma->vm_ops->swapin) {
1069                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
1070                 return;
1071         }
1072         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
1073         if (pte_val(*page_table) != pte_val(entry)) {
1074                 free_page(pte_page(page));
1075                 return;
1076         }
1077         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
1078                 page = pte_wrprotect(page);
1079         ++vma->vm_mm->rss;
1080         ++tsk->maj_flt;
1081         set_pte(page_table, page);
1082         return;
1083 }
1084 
1085 /*
1086  * do_no_page() tries to create a new page mapping. It aggressively
1087  * tries to share with existing pages, but makes a separate copy if
1088  * the "write_access" parameter is true in order to avoid the next
1089  * page fault.
1090  */
1091 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
1092         unsigned long address, int write_access)
1093 {
1094         pte_t * page_table;
1095         pte_t entry;
1096         unsigned long page;
1097 
1098         page_table = get_empty_pgtable(tsk, address);
1099         if (!page_table)
1100                 return;
1101         entry = *page_table;
1102         if (pte_present(entry))
1103                 return;
1104         if (!pte_none(entry)) {
1105                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
1106                 return;
1107         }
1108         address &= PAGE_MASK;
1109         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1110                 ++vma->vm_mm->rss;
1111                 ++tsk->min_flt;
1112                 get_empty_page(tsk, vma, page_table);
1113                 return;
1114         }
1115         page = __get_free_page(GFP_KERNEL);
1116         if (share_page(vma, address, write_access, page)) {
1117                 ++vma->vm_mm->rss;
1118                 ++tsk->min_flt;
1119                 return;
1120         }
1121         if (!page) {
1122                 oom(tsk);
1123                 put_page(page_table, BAD_PAGE);
1124                 return;
1125         }
1126         ++tsk->maj_flt;
1127         ++vma->vm_mm->rss;
1128         /*
1129          * The fourth argument is "no_share", which tells the low-level code
1130          * to copy, not share the page even if sharing is possible.  It's
1131          * essentially an early COW detection 
1132          */
1133         page = vma->vm_ops->nopage(vma, address, page,
1134                 write_access && !(vma->vm_flags & VM_SHARED));
1135         if (share_page(vma, address, write_access, 0)) {
1136                 free_page(page);
1137                 return;
1138         }
1139         /*
1140          * This silly early PAGE_DIRTY setting removes a race
1141          * due to the bad i386 page protection. But it's valid
1142          * for other architectures too.
1143          *
1144          * Note that if write_access is true, we either now have
1145  * an exclusive copy of the page, or this is a shared mapping,
1146          * so we can make it writable and dirty to avoid having to
1147          * handle that later.
1148          */
1149         entry = mk_pte(page, vma->vm_page_prot);
1150         if (write_access) {
1151                 entry = pte_mkwrite(pte_mkdirty(entry));
1152         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
1153                 entry = pte_wrprotect(entry);
1154         put_page(page_table, entry);
1155 }
1156 
1157 /*
1158  * The above separate functions for the no-page and wp-page
1159  * cases will go away (they mostly do the same thing anyway),
1160  * and we'll instead use only a general "handle_mm_fault()".
1161  *
1162  * These routines also need to handle stuff like marking pages dirty
1163  * and/or accessed for architectures that don't do it in hardware (most
1164  * RISC architectures).  The early dirtying is also good on the i386.
1165  *
1166  * There is also a hook called "update_mmu_cache()" that architectures
1167  * with external mmu caches can use to update those (ie the Sparc or
1168  * PowerPC hashed page tables that act as extended TLBs).
1169  */
1170 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1171         int write_access, pte_t * pte)
1172 {
1173         if (!pte_present(*pte)) {
1174                 do_no_page(current, vma, address, write_access);
1175                 return;
1176         }
1177         set_pte(pte, pte_mkyoung(*pte));
1178         if (!write_access)
1179                 return;
1180         if (pte_write(*pte)) {
1181                 set_pte(pte, pte_mkdirty(*pte));
1182                 return;
1183         }
1184         do_wp_page(current, vma, address, write_access);
1185 }
1186 
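     /*
      * Main entry point for page faults: the architecture fault handler
      * finds the faulting vma and checks access rights, then calls roughly
      *
      *     handle_mm_fault(vma, address, write_access);
      *
      * to allocate any missing page-table levels and resolve the fault.
      */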
1187 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1188         int write_access)
1189 {
1190         pgd_t *pgd;
1191         pmd_t *pmd;
1192         pte_t *pte;
1193 
1194         pgd = pgd_offset(vma->vm_mm, address);
1195         pmd = pmd_alloc(pgd, address);
1196         if (!pmd)
1197                 goto no_memory;
1198         pte = pte_alloc(pmd, address);
1199         if (!pte)
1200                 goto no_memory;
1201         handle_pte_fault(vma, address, write_access, pte);
1202         update_mmu_cache(vma, address, *pte);
1203         return;
1204 no_memory:
1205         oom(current);
1206 }
