root/mm/memory.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 #include <linux/swap.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 #include <asm/string.h>
  52 
  53 unsigned long high_memory = 0; /* bound that page addresses are compared against in this file before mem_map is consulted; starts at 0 (presumably set during memory init - TODO confirm) */
  54 
  55 /*
  56  * We special-case the C-O-W ZERO_PAGE, because it's such
  57  * a common occurrence (no need to read the page to know
  58  * that it's zero - better for the cache and memory subsystem).
  59  */
  60 static inline void copy_page(unsigned long from, unsigned long to)
     /* [previous][next][first][last][top][bottom][index][help] */
  61 {
  62         if (from == ZERO_PAGE) {
  63                 memset((void *) to, 0, PAGE_SIZE);
  64                 return;
  65         }
  66         memcpy((void *) to, (void *) from, PAGE_SIZE);
  67 }
  68 
  69 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) /* number of page-directory slots spanned by TASK_SIZE; slots below this index are the ones freed per process in this file */
  70 
  71 mem_map_t * mem_map = NULL; /* indexed by MAP_NR(addr); this file reads its .count and .reserved fields. NULL here (presumably assigned during memory init - TODO confirm) */
  72 
  73 /*
  74  * oom() prints a message (so that the user knows why the process died),
  75  * and gives the process an untrappable SIGKILL.
  76  */
  77 void oom(struct task_struct * task)
     /* [previous][next][first][last][top][bottom][index][help] */
  78 {
  79         printk("\nOut of memory for %s.\n", current->comm);
  80         task->sig->action[SIGKILL-1].sa_handler = NULL;
  81         task->blocked &= ~(1<<(SIGKILL-1));
  82         send_sig(SIGKILL,task,1);
  83 }
  84 
  85 /*
  86  * Note: this doesn't free the actual pages themselves. That
  87  * has been handled earlier when unmapping all the memory regions.
  88  */
  89 static inline void free_one_pmd(pmd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
  90 {
  91         pte_t * pte;
  92 
  93         if (pmd_none(*dir))
  94                 return;
  95         if (pmd_bad(*dir)) {
  96                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  97                 pmd_clear(dir);
  98                 return;
  99         }
 100         pte = pte_offset(dir, 0);
 101         pmd_clear(dir);
 102         pte_free(pte);
 103 }
 104 
 105 static inline void free_one_pgd(pgd_t * dir)
     /* [previous][next][first][last][top][bottom][index][help] */
 106 {
 107         int j;
 108         pmd_t * pmd;
 109 
 110         if (pgd_none(*dir))
 111                 return;
 112         if (pgd_bad(*dir)) {
 113                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 114                 pgd_clear(dir);
 115                 return;
 116         }
 117         pmd = pmd_offset(dir, 0);
 118         pgd_clear(dir);
 119         for (j = 0; j < PTRS_PER_PMD ; j++)
 120                 free_one_pmd(pmd+j);
 121         pmd_free(pmd);
 122 }
 123         
 124 /*
 125  * This function clears all user-level page tables of a process - this
 126  * is needed by execve(), so that old pages aren't in the way.
 127  */
 128 void clear_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 129 {
 130         int i;
 131         pgd_t * page_dir;
 132 
 133         page_dir = tsk->mm->pgd;
 134         if (!page_dir || page_dir == swapper_pg_dir) {
 135                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 136                 return;
 137         }
 138         flush_cache_mm(tsk->mm);
 139         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 140                 free_one_pgd(page_dir + i);
 141         flush_tlb_mm(tsk->mm);
 142 }
 143 
 144 /*
 145  * This function frees up all page tables of a process when it exits. It
 146  * is the same as "clear_page_tables()", except it also changes the process'
 147  * page table directory to the kernel page tables and then frees the old
 148  * page table directory.
 149  */
 150 void free_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 151 {
 152         int i;
 153         pgd_t * page_dir;
 154 
 155         page_dir = tsk->mm->pgd;
 156         if (!page_dir || page_dir == swapper_pg_dir) {
 157                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 158                 return;
 159         }
 160         flush_cache_mm(tsk->mm);
 161         flush_tlb_mm(tsk->mm);
 162         SET_PAGE_DIR(tsk, swapper_pg_dir);
 163         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 164         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 165                 free_one_pgd(page_dir + i);
 166         pgd_free(page_dir);
 167 }
 168 
 169 int new_page_tables(struct task_struct * tsk)
     /* [previous][next][first][last][top][bottom][index][help] */
 170 {
 171         pgd_t * page_dir, * new_pg;
 172 
 173         if (!(new_pg = pgd_alloc()))
 174                 return -ENOMEM;
 175         page_dir = pgd_offset(&init_mm, 0);
 176         flush_cache_mm(tsk->mm);
 177         memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
 178                (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
 179         flush_tlb_mm(tsk->mm);
 180         SET_PAGE_DIR(tsk, new_pg);
 181         tsk->mm->pgd = new_pg;
 182         return 0;
 183 }
 184 
 185 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
 186 {
 187         pte_t pte = *old_pte;
 188         unsigned long page_nr;
 189 
 190         if (pte_none(pte))
 191                 return;
 192         if (!pte_present(pte)) {
 193                 swap_duplicate(pte_val(pte));
 194                 set_pte(new_pte, pte);
 195                 return;
 196         }
 197         page_nr = MAP_NR(pte_page(pte));
 198         if (page_nr >= MAP_NR(high_memory) || mem_map[page_nr].reserved) {
 199                 set_pte(new_pte, pte);
 200                 return;
 201         }
 202         if (cow)
 203                 pte = pte_wrprotect(pte);
 204         if (delete_from_swap_cache(page_nr))
 205                 pte = pte_mkdirty(pte);
 206         set_pte(new_pte, pte_mkold(pte));
 207         set_pte(old_pte, pte);
 208         mem_map[page_nr].count++;
 209 }
 210 
 211 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
 212 {
 213         pte_t * src_pte, * dst_pte;
 214         unsigned long end;
 215 
 216         if (pmd_none(*src_pmd))
 217                 return 0;
 218         if (pmd_bad(*src_pmd)) {
 219                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 220                 pmd_clear(src_pmd);
 221                 return 0;
 222         }
 223         src_pte = pte_offset(src_pmd, address);
 224         if (pmd_none(*dst_pmd)) {
 225                 if (!pte_alloc(dst_pmd, 0))
 226                         return -ENOMEM;
 227         }
 228         dst_pte = pte_offset(dst_pmd, address);
 229         address &= ~PMD_MASK;
 230         end = address + size;
 231         if (end >= PMD_SIZE)
 232                 end = PMD_SIZE;
 233         do {
 234                 /* I would like to switch arguments here, to make it
 235                  * consistent with copy_xxx_range and memcpy syntax.
 236                  */
 237                 copy_one_pte(src_pte++, dst_pte++, cow);
 238                 address += PAGE_SIZE;
 239         } while (address < end);
 240         return 0;
 241 }
 242 
 243 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
     /* [previous][next][first][last][top][bottom][index][help] */
 244 {
 245         pmd_t * src_pmd, * dst_pmd;
 246         unsigned long end;
 247         int error = 0;
 248 
 249         if (pgd_none(*src_pgd))
 250                 return 0;
 251         if (pgd_bad(*src_pgd)) {
 252                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 253                 pgd_clear(src_pgd);
 254                 return 0;
 255         }
 256         src_pmd = pmd_offset(src_pgd, address);
 257         if (pgd_none(*dst_pgd)) {
 258                 if (!pmd_alloc(dst_pgd, 0))
 259                         return -ENOMEM;
 260         }
 261         dst_pmd = pmd_offset(dst_pgd, address);
 262         address &= ~PGDIR_MASK;
 263         end = address + size;
 264         if (end > PGDIR_SIZE)
 265                 end = PGDIR_SIZE;
 266         do {
 267                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 268                 if (error)
 269                         break;
 270                 address = (address + PMD_SIZE) & PMD_MASK; 
 271         } while (address < end);
 272         return error;
 273 }
 274 
 275 /*
 276  * copy one vm_area from one task to the other. Assumes the page tables
 277  * already present in the new task to be cleared in the whole range
 278  * covered by this vma.
 279  */
 280 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
     /* [previous][next][first][last][top][bottom][index][help] */
 281                         struct vm_area_struct *vma)
 282 {
 283         pgd_t * src_pgd, * dst_pgd;
 284         unsigned long address = vma->vm_start;
 285         unsigned long end = vma->vm_end;
 286         int error = 0, cow;
 287 
 288         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 289         src_pgd = pgd_offset(src, address);
 290         dst_pgd = pgd_offset(dst, address);
 291         flush_cache_range(src, vma->vm_start, vma->vm_end);
 292         flush_cache_range(dst, vma->vm_start, vma->vm_end);
 293         while (address < end) {
 294                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 295                 if (error)
 296                         break;
 297                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 298         }
 299         /* Note that the src ptes get c-o-w treatment, so they change too. */
 300         flush_tlb_range(src, vma->vm_start, vma->vm_end);
 301         flush_tlb_range(dst, vma->vm_start, vma->vm_end);
 302         return error;
 303 }
 304 
 305 static inline void forget_pte(pte_t page)
     /* [previous][next][first][last][top][bottom][index][help] */
 306 {
 307         if (pte_none(page))
 308                 return;
 309         if (pte_present(page)) {
 310                 unsigned long addr = pte_page(page);
 311                 if (addr >= high_memory || mem_map[MAP_NR(addr)].reserved)
 312                         return;
 313                 free_page(addr);
 314                 if (current->mm->rss <= 0)
 315                         return;
 316                 current->mm->rss--;
 317                 return;
 318         }
 319         swap_free(pte_val(page));
 320 }
 321 
 322 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 323 {
 324         pte_t * pte;
 325         unsigned long end;
 326 
 327         if (pmd_none(*pmd))
 328                 return;
 329         if (pmd_bad(*pmd)) {
 330                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 331                 pmd_clear(pmd);
 332                 return;
 333         }
 334         pte = pte_offset(pmd, address);
 335         address &= ~PMD_MASK;
 336         end = address + size;
 337         if (end >= PMD_SIZE)
 338                 end = PMD_SIZE;
 339         do {
 340                 pte_t page = *pte;
 341                 pte_clear(pte);
 342                 forget_pte(page);
 343                 address += PAGE_SIZE;
 344                 pte++;
 345         } while (address < end);
 346 }
 347 
 348 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 349 {
 350         pmd_t * pmd;
 351         unsigned long end;
 352 
 353         if (pgd_none(*dir))
 354                 return;
 355         if (pgd_bad(*dir)) {
 356                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 357                 pgd_clear(dir);
 358                 return;
 359         }
 360         pmd = pmd_offset(dir, address);
 361         address &= ~PGDIR_MASK;
 362         end = address + size;
 363         if (end > PGDIR_SIZE)
 364                 end = PGDIR_SIZE;
 365         do {
 366                 zap_pte_range(pmd, address, end - address);
 367                 address = (address + PMD_SIZE) & PMD_MASK; 
 368                 pmd++;
 369         } while (address < end);
 370 }
 371 
 372 /*
 373  * remove user pages in a given range.
 374  */
 375 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 376 {
 377         pgd_t * dir;
 378         unsigned long end = address + size;
 379 
 380         dir = pgd_offset(mm, address);
 381         flush_cache_range(mm, end - size, end);
 382         while (address < end) {
 383                 zap_pmd_range(dir, address, end - address);
 384                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 385                 dir++;
 386         }
 387         flush_tlb_range(mm, end - size, end);
 388         return 0;
 389 }
 390 
 391 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 392 {
 393         unsigned long end;
 394 
 395         address &= ~PMD_MASK;
 396         end = address + size;
 397         if (end > PMD_SIZE)
 398                 end = PMD_SIZE;
 399         do {
 400                 pte_t oldpage = *pte;
 401                 set_pte(pte, zero_pte);
 402                 forget_pte(oldpage);
 403                 address += PAGE_SIZE;
 404                 pte++;
 405         } while (address < end);
 406 }
 407 
 408 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 409 {
 410         unsigned long end;
 411 
 412         address &= ~PGDIR_MASK;
 413         end = address + size;
 414         if (end > PGDIR_SIZE)
 415                 end = PGDIR_SIZE;
 416         do {
 417                 pte_t * pte = pte_alloc(pmd, address);
 418                 if (!pte)
 419                         return -ENOMEM;
 420                 zeromap_pte_range(pte, address, end - address, zero_pte);
 421                 address = (address + PMD_SIZE) & PMD_MASK;
 422                 pmd++;
 423         } while (address < end);
 424         return 0;
 425 }
 426 
 427 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
 428 {
 429         int error = 0;
 430         pgd_t * dir;
 431         unsigned long beg = address;
 432         unsigned long end = address + size;
 433         pte_t zero_pte;
 434 
 435         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 436         dir = pgd_offset(current->mm, address);
 437         flush_cache_range(current->mm, beg, end);
 438         while (address < end) {
 439                 pmd_t *pmd = pmd_alloc(dir, address);
 440                 error = -ENOMEM;
 441                 if (!pmd)
 442                         break;
 443                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 444                 if (error)
 445                         break;
 446                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 447                 dir++;
 448         }
 449         flush_tlb_range(current->mm, beg, end);
 450         return error;
 451 }
 452 
 453 /*
 454  * maps a range of physical memory into the requested pages. the old
 455  * mappings are removed. any references to nonexistent pages results
 456  * in null mappings (currently treated as "copy-on-access")
 457  */
 458 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 459         unsigned long offset, pgprot_t prot)
 460 {
 461         unsigned long end;
 462 
 463         address &= ~PMD_MASK;
 464         end = address + size;
 465         if (end > PMD_SIZE)
 466                 end = PMD_SIZE;
 467         do {
 468                 pte_t oldpage = *pte;
 469                 pte_clear(pte);
 470                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 471                         set_pte(pte, mk_pte(offset, prot));
 472                 forget_pte(oldpage);
 473                 address += PAGE_SIZE;
 474                 offset += PAGE_SIZE;
 475                 pte++;
 476         } while (address < end);
 477 }
 478 
 479 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
     /* [previous][next][first][last][top][bottom][index][help] */
 480         unsigned long offset, pgprot_t prot)
 481 {
 482         unsigned long end;
 483 
 484         address &= ~PGDIR_MASK;
 485         end = address + size;
 486         if (end > PGDIR_SIZE)
 487                 end = PGDIR_SIZE;
 488         offset -= address;
 489         do {
 490                 pte_t * pte = pte_alloc(pmd, address);
 491                 if (!pte)
 492                         return -ENOMEM;
 493                 remap_pte_range(pte, address, end - address, address + offset, prot);
 494                 address = (address + PMD_SIZE) & PMD_MASK;
 495                 pmd++;
 496         } while (address < end);
 497         return 0;
 498 }
 499 
 500 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
     /* [previous][next][first][last][top][bottom][index][help] */
 501 {
 502         int error = 0;
 503         pgd_t * dir;
 504         unsigned long beg = from;
 505         unsigned long end = from + size;
 506 
 507         offset -= from;
 508         dir = pgd_offset(current->mm, from);
 509         flush_cache_range(current->mm, beg, from);
 510         while (from < end) {
 511                 pmd_t *pmd = pmd_alloc(dir, from);
 512                 error = -ENOMEM;
 513                 if (!pmd)
 514                         break;
 515                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 516                 if (error)
 517                         break;
 518                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 519                 dir++;
 520         }
 521         flush_tlb_range(current->mm, beg, from);
 522         return error;
 523 }
 524 
 525 /*
 526  * sanity-check function..
 527  */
 528 static void put_page(pte_t * page_table, pte_t pte)
     /* [previous][next][first][last][top][bottom][index][help] */
 529 {
 530         if (!pte_none(*page_table)) {
 531                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 532                 free_page(pte_page(pte));
 533                 return;
 534         }
 535 /* no need for invalidate */
 536         set_pte(page_table, pte);
 537 }
 538 
 539 /*
 540  * This routine is used to map in a page into an address space: needed by
 541  * execve() for the initial stack and environment pages.
 542  */
 543 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 544 {
 545         pgd_t * pgd;
 546         pmd_t * pmd;
 547         pte_t * pte;
 548 
 549         if (page >= high_memory)
 550                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 551         if (mem_map[MAP_NR(page)].count != 1)
 552                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 553         pgd = pgd_offset(tsk->mm,address);
 554         pmd = pmd_alloc(pgd, address);
 555         if (!pmd) {
 556                 free_page(page);
 557                 oom(tsk);
 558                 return 0;
 559         }
 560         pte = pte_alloc(pmd, address);
 561         if (!pte) {
 562                 free_page(page);
 563                 oom(tsk);
 564                 return 0;
 565         }
 566         if (!pte_none(*pte)) {
 567                 printk("put_dirty_page: page already exists\n");
 568                 free_page(page);
 569                 return 0;
 570         }
 571         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 572 /* no need for invalidate */
 573         return page;
 574 }
 575 
 576 /*
 577  * This routine handles present pages, when users try to write
 578  * to a shared page. It is done by copying the page to a new address
 579  * and decrementing the shared-page counter for the old page.
 580  *
 581  * Goto-purists beware: the only reason for goto's here is that it results
 582  * in better assembly code.. The "default" path will see no jumps at all.
 583  *
 584  * Note that this routine assumes that the protection checks have been
 585  * done by the caller (the low-level page fault routine in most cases).
 586  * Thus we can safely just mark it writable once we've done any necessary
 587  * COW.
 588  *
 589  * We also mark the page dirty at this point even though the page will
 590  * change only once the write actually happens. This avoids a few races,
 591  * and potentially makes it more efficient.
 592  */
 593 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 594         unsigned long address, int write_access)
 595 {
 596         pgd_t *page_dir;
 597         pmd_t *page_middle;
 598         pte_t *page_table, pte;
 599         unsigned long old_page, new_page;
 600 
 601         new_page = __get_free_page(GFP_KERNEL);
 602         page_dir = pgd_offset(vma->vm_mm, address);
 603         if (pgd_none(*page_dir))
 604                 goto end_wp_page;
 605         if (pgd_bad(*page_dir))
 606                 goto bad_wp_pagedir;
 607         page_middle = pmd_offset(page_dir, address);
 608         if (pmd_none(*page_middle))
 609                 goto end_wp_page;
 610         if (pmd_bad(*page_middle))
 611                 goto bad_wp_pagemiddle;
 612         page_table = pte_offset(page_middle, address);
 613         pte = *page_table;
 614         if (!pte_present(pte))
 615                 goto end_wp_page;
 616         if (pte_write(pte))
 617                 goto end_wp_page;
 618         old_page = pte_page(pte);
 619         if (old_page >= high_memory)
 620                 goto bad_wp_page;
 621         tsk->min_flt++;
 622         /*
 623          * Do we need to copy?
 624          */
 625         if (mem_map[MAP_NR(old_page)].count != 1) {
 626                 if (new_page) {
 627                         if (mem_map[MAP_NR(old_page)].reserved)
 628                                 ++vma->vm_mm->rss;
 629                         copy_page(old_page,new_page);
 630                         flush_page_to_ram(old_page);
 631                         flush_page_to_ram(new_page);
 632                         flush_cache_page(vma, address);
 633                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 634                         free_page(old_page);
 635                         flush_tlb_page(vma, address);
 636                         return;
 637                 }
 638                 flush_cache_page(vma, address);
 639                 set_pte(page_table, BAD_PAGE);
 640                 flush_tlb_page(vma, address);
 641                 free_page(old_page);
 642                 oom(tsk);
 643                 return;
 644         }
 645         flush_cache_page(vma, address);
 646         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 647         flush_tlb_page(vma, address);
 648         if (new_page)
 649                 free_page(new_page);
 650         return;
 651 bad_wp_page:
 652         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 653         send_sig(SIGKILL, tsk, 1);
 654         goto end_wp_page;
 655 bad_wp_pagemiddle:
 656         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 657         send_sig(SIGKILL, tsk, 1);
 658         goto end_wp_page;
 659 bad_wp_pagedir:
 660         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 661         send_sig(SIGKILL, tsk, 1);
 662 end_wp_page:
 663         if (new_page)
 664                 free_page(new_page);
 665         return;
 666 }
 667 
 668 /*
 669  * Ugly, ugly, but the goto's result in better assembly..
 670  */
 671 int verify_area(int type, const void * addr, unsigned long size)
     /* [previous][next][first][last][top][bottom][index][help] */
 672 {
 673         struct vm_area_struct * vma;
 674         unsigned long start = (unsigned long) addr;
 675 
 676         /* If the current user space is mapped to kernel space (for the
 677          * case where we use a fake user buffer with get_fs/set_fs()) we
 678          * don't expect to find the address in the user vm map.
 679          */
 680         if (!size || get_fs() == get_ds())
 681                 return 0;
 682 
 683         vma = find_vma(current, start);
 684         if (!vma)
 685                 goto bad_area;
 686         if (vma->vm_start <= start)
 687                 goto good_area;
 688         if (!(vma->vm_flags & VM_GROWSDOWN))
 689                 goto bad_area;
 690         if (expand_stack(vma, start))
 691                 goto bad_area;
 692 
 693 good_area:
 694         if (type == VERIFY_WRITE)
 695                 goto check_write;
 696         for (;;) {
 697                 struct vm_area_struct * next;
 698                 if (!(vma->vm_flags & VM_READ))
 699                         goto bad_area;
 700                 if (vma->vm_end - start >= size)
 701                         return 0;
 702                 next = vma->vm_next;
 703                 if (!next || vma->vm_end != next->vm_start)
 704                         goto bad_area;
 705                 vma = next;
 706         }
 707 
 708 check_write:
 709         if (!(vma->vm_flags & VM_WRITE))
 710                 goto bad_area;
 711         if (!wp_works_ok)
 712                 goto check_wp_fault_by_hand;
 713         for (;;) {
 714                 if (vma->vm_end - start >= size)
 715                         break;
 716                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 717                         goto bad_area;
 718                 vma = vma->vm_next;
 719                 if (!(vma->vm_flags & VM_WRITE))
 720                         goto bad_area;
 721         }
 722         return 0;
 723 
 724 check_wp_fault_by_hand:
 725         size--;
 726         size += start & ~PAGE_MASK;
 727         size >>= PAGE_SHIFT;
 728         start &= PAGE_MASK;
 729 
 730         for (;;) {
 731                 do_wp_page(current, vma, start, 1);
 732                 if (!size)
 733                         break;
 734                 size--;
 735                 start += PAGE_SIZE;
 736                 if (start < vma->vm_end)
 737                         continue;
 738                 vma = vma->vm_next;
 739                 if (!vma || vma->vm_start != start)
 740                         goto bad_area;
 741                 if (!(vma->vm_flags & VM_WRITE))
 742                         goto bad_area;;
 743         }
 744         return 0;
 745 
 746 bad_area:
 747         return -EFAULT;
 748 }
 749 
 750 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 751         pte_t * page_table, int write_access)
 752 {
 753         pte_t pte;
 754 
 755         pte = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
 756         if (write_access) {
 757                 unsigned long page = get_free_page(GFP_KERNEL);
 758                 pte = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 759                 vma->vm_mm->rss++;
 760                 tsk->min_flt++;
 761                 if (!page) {
 762                         oom(tsk);
 763                         pte = BAD_PAGE;
 764                 }
 765         }
 766         put_page(page_table, pte);
 767 }
 768 
 769 /*
 770  * This function zeroes out partial mmap'ed pages at truncation time..
 771  */
 772 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 773 {
 774         pgd_t *page_dir;
 775         pmd_t *page_middle;
 776         pte_t *page_table, pte;
 777 
 778         page_dir = pgd_offset(vma->vm_mm, address);
 779         if (pgd_none(*page_dir))
 780                 return;
 781         if (pgd_bad(*page_dir)) {
 782                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 783                 pgd_clear(page_dir);
 784                 return;
 785         }
 786         page_middle = pmd_offset(page_dir, address);
 787         if (pmd_none(*page_middle))
 788                 return;
 789         if (pmd_bad(*page_middle)) {
 790                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 791                 pmd_clear(page_middle);
 792                 return;
 793         }
 794         page_table = pte_offset(page_middle, address);
 795         pte = *page_table;
 796         if (!pte_present(pte))
 797                 return;
 798         address &= ~PAGE_MASK;
 799         address += pte_page(pte);
 800         if (address >= high_memory)
 801                 return;
 802         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 803 }
 804 
 805 /*
 806  * Handle all mappings that got truncated by a "truncate()"
 807  * system call.
 808  *
 809  * NOTE! We have to be ready to update the memory sharing
 810  * between the file and the memory map for a potential last
 811  * incomplete page.  Ugly, but necessary.
 812  */
 813 void vmtruncate(struct inode * inode, unsigned long offset)
     /* [previous][next][first][last][top][bottom][index][help] */
 814 {
 815         struct vm_area_struct * mpnt;
 816 
 817         truncate_inode_pages(inode, offset);
 818         if (!inode->i_mmap)
 819                 return;
 820         mpnt = inode->i_mmap;
 821         do {
 822                 unsigned long start = mpnt->vm_start;
 823                 unsigned long len = mpnt->vm_end - start;
 824                 unsigned long diff;
 825 
 826                 /* mapping wholly truncated? */
 827                 if (mpnt->vm_offset >= offset) {
 828                         zap_page_range(mpnt->vm_mm, start, len);
 829                         continue;
 830                 }
 831                 /* mapping wholly unaffected? */
 832                 diff = offset - mpnt->vm_offset;
 833                 if (diff >= len)
 834                         continue;
 835                 /* Ok, partially affected.. */
 836                 start += diff;
 837                 len = (len - diff) & PAGE_MASK;
 838                 if (start & ~PAGE_MASK) {
 839                         partial_clear(mpnt, start);
 840                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 841                 }
 842                 zap_page_range(mpnt->vm_mm, start, len);
 843         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 844 }
 845 
 846 /*
 847  * fill in an empty page-table if none exists.
 848  */
 849 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
     /* [previous][next][first][last][top][bottom][index][help] */
 850 {
 851         pgd_t *pgd;
 852         pmd_t *pmd;
 853         pte_t *pte;
 854 
 855         pgd = pgd_offset(tsk->mm, address);
 856         pmd = pmd_alloc(pgd, address);
 857         if (!pmd) {
 858                 oom(tsk);
 859                 return NULL;
 860         }
 861         pte = pte_alloc(pmd, address);
 862         if (!pte) {
 863                 oom(tsk);
 864                 return NULL;
 865         }
 866         return pte;
 867 }
 868 
 869 static inline void do_swap_page(struct task_struct * tsk, 
     /* [previous][next][first][last][top][bottom][index][help] */
 870         struct vm_area_struct * vma, unsigned long address,
 871         pte_t * page_table, pte_t entry, int write_access)
 872 {
 873         pte_t page;
 874 
 875         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 876                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 877                 return;
 878         }
 879         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 880         if (pte_val(*page_table) != pte_val(entry)) {
 881                 free_page(pte_page(page));
 882                 return;
 883         }
 884         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 885                 page = pte_wrprotect(page);
 886         ++vma->vm_mm->rss;
 887         ++tsk->maj_flt;
 888         set_pte(page_table, page);
 889         return;
 890 }
 891 
 892 /*
 893  * do_no_page() tries to create a new page mapping. It aggressively
 894  * tries to share with existing pages, but makes a separate copy if
 895  * the "write_access" parameter is true in order to avoid the next
 896  * page fault.
 897  */
 898 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
     /* [previous][next][first][last][top][bottom][index][help] */
 899         unsigned long address, int write_access)
 900 {
 901         pte_t * page_table;
 902         pte_t entry;
 903         unsigned long page;
 904 
 905         page_table = get_empty_pgtable(tsk, address);
 906         if (!page_table)
 907                 return;
 908         entry = *page_table;
 909         if (pte_present(entry))
 910                 return;
 911         if (!pte_none(entry)) {
 912                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 913                 return;
 914         }
 915         address &= PAGE_MASK;
 916         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 917                 flush_cache_page(vma, address);
 918                 get_empty_page(tsk, vma, page_table, write_access);
 919                 return;
 920         }
 921         ++tsk->maj_flt;
 922         ++vma->vm_mm->rss;
 923         /*
 924          * The third argument is "no_share", which tells the low-level code
 925          * to copy, not share the page even if sharing is possible.  It's
 926          * essentially an early COW detection 
 927          */
 928         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
 929         if (!page) {
 930                 send_sig(SIGBUS, current, 1);
 931                 flush_cache_page(vma, address);
 932                 put_page(page_table, BAD_PAGE);
 933                 flush_tlb_page(vma, address);
 934                 return;
 935         }
 936         /*
 937          * This silly early PAGE_DIRTY setting removes a race
 938          * due to the bad i386 page protection. But it's valid
 939          * for other architectures too.
 940          *
 941          * Note that if write_access is true, we either now have
 942          * a exclusive copy of the page, or this is a shared mapping,
 943          * so we can make it writable and dirty to avoid having to
 944          * handle that later.
 945          */
 946         entry = mk_pte(page, vma->vm_page_prot);
 947         if (write_access) {
 948                 entry = pte_mkwrite(pte_mkdirty(entry));
 949         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 950                 entry = pte_wrprotect(entry);
 951         flush_cache_page(vma, address);
 952         put_page(page_table, entry);
 953         flush_tlb_page(vma, address);
 954 }
 955 
 956 /*
 957  * The above separate functions for the no-page and wp-page
 958  * cases will go away (they mostly do the same thing anyway),
 959  * and we'll instead use only a general "handle_mm_fault()".
 960  *
 961  * These routines also need to handle stuff like marking pages dirty
 962  * and/or accessed for architectures that don't do it in hardware (most
 963  * RISC architectures).  The early dirtying is also good on the i386.
 964  *
 965  * There is also a hook called "update_mmu_cache()" that architectures
 966  * with external mmu caches can use to update those (ie the Sparc or
 967  * PowerPC hashed page tables that act as extended TLBs).
 968  */
 969 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 970         int write_access, pte_t * pte)
 971 {
 972         if (!pte_present(*pte)) {
 973                 do_no_page(current, vma, address, write_access);
 974                 return;
 975         }
 976         set_pte(pte, pte_mkyoung(*pte));
 977         if (!write_access)
 978                 return;
 979         if (pte_write(*pte)) {
 980                 set_pte(pte, pte_mkdirty(*pte));
 981                 return;
 982         }
 983         do_wp_page(current, vma, address, write_access);
 984 }
 985 
 986 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
     /* [previous][next][first][last][top][bottom][index][help] */
 987         int write_access)
 988 {
 989         pgd_t *pgd;
 990         pmd_t *pmd;
 991         pte_t *pte;
 992 
 993         pgd = pgd_offset(vma->vm_mm, address);
 994         pmd = pmd_alloc(pgd, address);
 995         if (!pmd)
 996                 goto no_memory;
 997         pte = pte_alloc(pmd, address);
 998         if (!pte)
 999                 goto no_memory;
1000         handle_pte_fault(vma, address, write_access, pte);
1001         update_mmu_cache(vma, address, *pte);
1002         return;
1003 no_memory:
1004         oom(current);
1005 }

/* [previous][next][first][last][top][bottom][index][help] */