root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 #include <linux/swap.h>
  47 
  48 #include <asm/system.h>
  49 #include <asm/segment.h>
  50 #include <asm/pgtable.h>
  51 #include <asm/string.h>
  52 
  53 unsigned long high_memory = 0;
  54 
  55 /*
  56  * We special-case the C-O-W ZERO_PAGE, because it's such
  57  * a common occurrence (no need to read the page to know
  58  * that it's zero - better for the cache and memory subsystem).
  59  */
  60 static inline void copy_page(unsigned long from, unsigned long to)
  61 {
  62         if (from == ZERO_PAGE) {
  63                 memset((void *) to, 0, PAGE_SIZE);
  64                 return;
  65         }
  66         memcpy((void *) to, (void *) from, PAGE_SIZE);
  67 }
  68 
  69 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  70 
  71 mem_map_t * mem_map = NULL;
  72 
  73 /*
  74  * oom() prints a message (so that the user knows why the process died),
  75  * and gives the process an untrappable SIGKILL.
  76  */
  77 void oom(struct task_struct * task)
  78 {
  79         printk("\nOut of memory for %s.\n", current->comm);
  80         task->sig->action[SIGKILL-1].sa_handler = NULL;
  81         task->blocked &= ~(1<<(SIGKILL-1));
  82         send_sig(SIGKILL,task,1);
  83 }
  84 
  85 /*
  86  * Note: this doesn't free the actual pages themselves. That
  87  * has been handled earlier when unmapping all the memory regions.
  88  */
  89 static inline void free_one_pmd(pmd_t * dir)
  90 {
  91         pte_t * pte;
  92 
  93         if (pmd_none(*dir))
  94                 return;
  95         if (pmd_bad(*dir)) {
  96                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
  97                 pmd_clear(dir);
  98                 return;
  99         }
 100         pte = pte_offset(dir, 0);
 101         pmd_clear(dir);
 102         pte_free(pte);
 103 }
 104 
 105 static inline void free_one_pgd(pgd_t * dir)
 106 {
 107         int j;
 108         pmd_t * pmd;
 109 
 110         if (pgd_none(*dir))
 111                 return;
 112         if (pgd_bad(*dir)) {
 113                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 114                 pgd_clear(dir);
 115                 return;
 116         }
 117         pmd = pmd_offset(dir, 0);
 118         pgd_clear(dir);
 119         for (j = 0; j < PTRS_PER_PMD ; j++)
 120                 free_one_pmd(pmd+j);
 121         pmd_free(pmd);
 122 }
 123         
 124 /*
 125  * This function clears all user-level page tables of a process - this
 126  * is needed by execve(), so that old pages aren't in the way.
 127  */
 128 void clear_page_tables(struct task_struct * tsk)
 129 {
 130         int i;
 131         pgd_t * page_dir;
 132 
 133         page_dir = tsk->mm->pgd;
 134         if (!page_dir || page_dir == swapper_pg_dir) {
 135                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 136                 return;
 137         }
 138         flush_cache_mm(tsk->mm);
 139         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 140                 free_one_pgd(page_dir + i);
 141         flush_tlb_mm(tsk->mm);
 142 }
 143 
 144 /*
 145  * This function frees up all page tables of a process when it exits. It
 146  * is the same as "clear_page_tables()", except it also changes the process'
 147  * page table directory to the kernel page tables and then frees the old
 148  * page table directory.
 149  */
 150 void free_page_tables(struct task_struct * tsk)
 151 {
 152         int i;
 153         pgd_t * page_dir;
 154 
 155         page_dir = tsk->mm->pgd;
 156         if (!page_dir || page_dir == swapper_pg_dir) {
 157                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 158                 return;
 159         }
 160         flush_cache_mm(tsk->mm);
 161         flush_tlb_mm(tsk->mm);
 162         SET_PAGE_DIR(tsk, swapper_pg_dir);
 163         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 164         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 165                 free_one_pgd(page_dir + i);
 166         pgd_free(page_dir);
 167 }
 168 
 169 int new_page_tables(struct task_struct * tsk)
 170 {
 171         pgd_t * page_dir, * new_pg;
 172 
 173         if (!(new_pg = pgd_alloc()))
 174                 return -ENOMEM;
 175         page_dir = pgd_offset(&init_mm, 0);
 176         flush_cache_mm(tsk->mm);
 177         memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
 178                (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
 179         flush_tlb_mm(tsk->mm);
 180         SET_PAGE_DIR(tsk, new_pg);
 181         tsk->mm->pgd = new_pg;
 182         return 0;
 183 }
 184 
 185 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
 186 {
 187         pte_t pte = *old_pte;
 188         unsigned long page_nr;
 189 
 190         if (pte_none(pte))
 191                 return;
 192         if (!pte_present(pte)) {
 193                 swap_duplicate(pte_val(pte));
 194                 set_pte(new_pte, pte);
 195                 return;
 196         }
 197         page_nr = MAP_NR(pte_page(pte));
 198         if (page_nr >= MAP_NR(high_memory) || PageReserved(mem_map+page_nr)) {
 199                 set_pte(new_pte, pte);
 200                 return;
 201         }
 202         if (cow)
 203                 pte = pte_wrprotect(pte);
 204         if (delete_from_swap_cache(page_nr))
 205                 pte = pte_mkdirty(pte);
 206         set_pte(new_pte, pte_mkold(pte));
 207         set_pte(old_pte, pte);
 208         mem_map[page_nr].count++;
 209 }
 210 
 211 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
 212 {
 213         pte_t * src_pte, * dst_pte;
 214         unsigned long end;
 215 
 216         if (pmd_none(*src_pmd))
 217                 return 0;
 218         if (pmd_bad(*src_pmd)) {
 219                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 220                 pmd_clear(src_pmd);
 221                 return 0;
 222         }
 223         src_pte = pte_offset(src_pmd, address);
 224         if (pmd_none(*dst_pmd)) {
 225                 if (!pte_alloc(dst_pmd, 0))
 226                         return -ENOMEM;
 227         }
 228         dst_pte = pte_offset(dst_pmd, address);
 229         address &= ~PMD_MASK;
 230         end = address + size;
 231         if (end >= PMD_SIZE)
 232                 end = PMD_SIZE;
 233         do {
 234                 /* I would like to switch arguments here, to make it
 235                  * consistent with copy_xxx_range and memcpy syntax.
 236                  */
 237                 copy_one_pte(src_pte++, dst_pte++, cow);
 238                 address += PAGE_SIZE;
 239         } while (address < end);
 240         return 0;
 241 }
 242 
 243 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
 244 {
 245         pmd_t * src_pmd, * dst_pmd;
 246         unsigned long end;
 247         int error = 0;
 248 
 249         if (pgd_none(*src_pgd))
 250                 return 0;
 251         if (pgd_bad(*src_pgd)) {
 252                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 253                 pgd_clear(src_pgd);
 254                 return 0;
 255         }
 256         src_pmd = pmd_offset(src_pgd, address);
 257         if (pgd_none(*dst_pgd)) {
 258                 if (!pmd_alloc(dst_pgd, 0))
 259                         return -ENOMEM;
 260         }
 261         dst_pmd = pmd_offset(dst_pgd, address);
 262         address &= ~PGDIR_MASK;
 263         end = address + size;
 264         if (end > PGDIR_SIZE)
 265                 end = PGDIR_SIZE;
 266         do {
 267                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 268                 if (error)
 269                         break;
 270                 address = (address + PMD_SIZE) & PMD_MASK; 
 271         } while (address < end);
 272         return error;
 273 }
 274 
 275 /*
  276  * Copy one vm_area from one task to the other. Assumes that the page
  277  * tables already present in the new task have been cleared over the
  278  * whole range covered by this vma.
 279  */
 280 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 281                         struct vm_area_struct *vma)
 282 {
 283         pgd_t * src_pgd, * dst_pgd;
 284         unsigned long address = vma->vm_start;
 285         unsigned long end = vma->vm_end;
 286         int error = 0, cow;
 287 
 288         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 289         src_pgd = pgd_offset(src, address);
 290         dst_pgd = pgd_offset(dst, address);
 291         flush_cache_range(src, vma->vm_start, vma->vm_end);
 292         flush_cache_range(dst, vma->vm_start, vma->vm_end);
 293         while (address < end) {
 294                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 295                 if (error)
 296                         break;
 297                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 298         }
 299         /* Note that the src ptes get c-o-w treatment, so they change too. */
 300         flush_tlb_range(src, vma->vm_start, vma->vm_end);
 301         flush_tlb_range(dst, vma->vm_start, vma->vm_end);
 302         return error;
 303 }
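
/*
 * For illustration only (not part of mm/memory.c): a minimal sketch of how a
 * fork()-time caller might drive copy_page_range() once per vma.  The real
 * caller is dup_mmap() in kernel/fork.c, which also duplicates each
 * vm_area_struct and its inode references; the helper name below is invented.
 */
static int sketch_copy_all_vmas(struct mm_struct *dst, struct mm_struct *src)
{
        struct vm_area_struct *vma;
        int error;

        /* For private writable mappings, copy_page_range() write-protects
         * the ptes in both address spaces, so the first write in either
         * task faults and do_wp_page() performs the actual copy. */
        for (vma = src->mmap; vma; vma = vma->vm_next) {
                error = copy_page_range(dst, src, vma);
                if (error)
                        return error;
        }
        return 0;
}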
 304 
 305 static inline void forget_pte(pte_t page)
 306 {
 307         if (pte_none(page))
 308                 return;
 309         if (pte_present(page)) {
 310                 unsigned long addr = pte_page(page);
 311                 if (addr >= high_memory || PageReserved(mem_map+MAP_NR(addr)))
 312                         return;
 313                 free_page(addr);
 314                 if (current->mm->rss <= 0)
 315                         return;
 316                 current->mm->rss--;
 317                 return;
 318         }
 319         swap_free(pte_val(page));
 320 }
 321 
 322 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 323 {
 324         pte_t * pte;
 325         unsigned long end;
 326 
 327         if (pmd_none(*pmd))
 328                 return;
 329         if (pmd_bad(*pmd)) {
 330                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 331                 pmd_clear(pmd);
 332                 return;
 333         }
 334         pte = pte_offset(pmd, address);
 335         address &= ~PMD_MASK;
 336         end = address + size;
 337         if (end >= PMD_SIZE)
 338                 end = PMD_SIZE;
 339         do {
 340                 pte_t page = *pte;
 341                 pte_clear(pte);
 342                 forget_pte(page);
 343                 address += PAGE_SIZE;
 344                 pte++;
 345         } while (address < end);
 346 }
 347 
 348 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 349 {
 350         pmd_t * pmd;
 351         unsigned long end;
 352 
 353         if (pgd_none(*dir))
 354                 return;
 355         if (pgd_bad(*dir)) {
 356                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 357                 pgd_clear(dir);
 358                 return;
 359         }
 360         pmd = pmd_offset(dir, address);
 361         address &= ~PGDIR_MASK;
 362         end = address + size;
 363         if (end > PGDIR_SIZE)
 364                 end = PGDIR_SIZE;
 365         do {
 366                 zap_pte_range(pmd, address, end - address);
 367                 address = (address + PMD_SIZE) & PMD_MASK; 
 368                 pmd++;
 369         } while (address < end);
 370 }
 371 
 372 /*
 373  * remove user pages in a given range.
 374  */
 375 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 376 {
 377         pgd_t * dir;
 378         unsigned long end = address + size;
 379 
 380         dir = pgd_offset(mm, address);
 381         flush_cache_range(mm, end - size, end);
 382         while (address < end) {
 383                 zap_pmd_range(dir, address, end - address);
 384                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 385                 dir++;
 386         }
 387         flush_tlb_range(mm, end - size, end);
 388         return 0;
 389 }
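
/*
 * For illustration only (not part of mm/memory.c): a sketch of the usual
 * munmap-style use of zap_page_range(), assuming the caller has already
 * unlinked the vm_area_struct bookkeeping (cf. do_munmap() in mm/mmap.c).
 * The helper name is invented.
 */
static void sketch_unmap_region(struct mm_struct *mm, unsigned long addr,
        unsigned long len)
{
        /* forget_pte() (via zap_pte_range) frees each present page, drops
         * current->mm->rss and releases any swap entries; the emptied page
         * tables themselves are only reclaimed later by free_page_tables()
         * or clear_page_tables(). */
        zap_page_range(mm, addr, len);
}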
 390 
 391 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 392 {
 393         unsigned long end;
 394 
 395         address &= ~PMD_MASK;
 396         end = address + size;
 397         if (end > PMD_SIZE)
 398                 end = PMD_SIZE;
 399         do {
 400                 pte_t oldpage = *pte;
 401                 set_pte(pte, zero_pte);
 402                 forget_pte(oldpage);
 403                 address += PAGE_SIZE;
 404                 pte++;
 405         } while (address < end);
 406 }
 407 
 408 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 409 {
 410         unsigned long end;
 411 
 412         address &= ~PGDIR_MASK;
 413         end = address + size;
 414         if (end > PGDIR_SIZE)
 415                 end = PGDIR_SIZE;
 416         do {
 417                 pte_t * pte = pte_alloc(pmd, address);
 418                 if (!pte)
 419                         return -ENOMEM;
 420                 zeromap_pte_range(pte, address, end - address, zero_pte);
 421                 address = (address + PMD_SIZE) & PMD_MASK;
 422                 pmd++;
 423         } while (address < end);
 424         return 0;
 425 }
 426 
 427 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 428 {
 429         int error = 0;
 430         pgd_t * dir;
 431         unsigned long beg = address;
 432         unsigned long end = address + size;
 433         pte_t zero_pte;
 434 
 435         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 436         dir = pgd_offset(current->mm, address);
 437         flush_cache_range(current->mm, beg, end);
 438         while (address < end) {
 439                 pmd_t *pmd = pmd_alloc(dir, address);
 440                 error = -ENOMEM;
 441                 if (!pmd)
 442                         break;
 443                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 444                 if (error)
 445                         break;
 446                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 447                 dir++;
 448         }
 449         flush_tlb_range(current->mm, beg, end);
 450         return error;
 451 }
 452 
 453 /*
  454  * Maps a range of physical memory into the requested pages. The old
  455  * mappings are removed. Any references to nonexistent pages result
  456  * in null mappings (currently treated as "copy-on-access").
 457  */
 458 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 459         unsigned long offset, pgprot_t prot)
 460 {
 461         unsigned long end;
 462 
 463         address &= ~PMD_MASK;
 464         end = address + size;
 465         if (end > PMD_SIZE)
 466                 end = PMD_SIZE;
 467         do {
 468                 pte_t oldpage = *pte;
 469                 pte_clear(pte);
 470                 if (offset >= high_memory || PageReserved(mem_map+MAP_NR(offset)))
 471                         set_pte(pte, mk_pte(offset, prot));
 472                 forget_pte(oldpage);
 473                 address += PAGE_SIZE;
 474                 offset += PAGE_SIZE;
 475                 pte++;
 476         } while (address < end);
 477 }
 478 
 479 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 480         unsigned long offset, pgprot_t prot)
 481 {
 482         unsigned long end;
 483 
 484         address &= ~PGDIR_MASK;
 485         end = address + size;
 486         if (end > PGDIR_SIZE)
 487                 end = PGDIR_SIZE;
 488         offset -= address;
 489         do {
 490                 pte_t * pte = pte_alloc(pmd, address);
 491                 if (!pte)
 492                         return -ENOMEM;
 493                 remap_pte_range(pte, address, end - address, address + offset, prot);
 494                 address = (address + PMD_SIZE) & PMD_MASK;
 495                 pmd++;
 496         } while (address < end);
 497         return 0;
 498 }
 499 
 500 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 501 {
 502         int error = 0;
 503         pgd_t * dir;
 504         unsigned long beg = from;
 505         unsigned long end = from + size;
 506 
 507         offset -= from;
 508         dir = pgd_offset(current->mm, from);
  509         flush_cache_range(current->mm, beg, end);
 510         while (from < end) {
 511                 pmd_t *pmd = pmd_alloc(dir, from);
 512                 error = -ENOMEM;
 513                 if (!pmd)
 514                         break;
 515                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 516                 if (error)
 517                         break;
 518                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 519                 dir++;
 520         }
 521         flush_tlb_range(current->mm, beg, from);
 522         return error;
 523 }
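
/*
 * For illustration only (not part of mm/memory.c): the classic consumer of
 * remap_page_range() in this kernel generation is a device driver's mmap()
 * file operation, mapping device or reserved memory into the caller's
 * address space.  The driver, its physical base address, and the assumption
 * that <linux/fs.h> is available are invented for this sketch.
 */
#define SKETCHDEV_PHYS_BASE 0xf0000000UL        /* made-up device address */

static int sketchdev_mmap(struct inode *inode, struct file *file,
        struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        /* Install ptes mapping [vm_start, vm_end) onto the physical range
         * starting at SKETCHDEV_PHYS_BASE + vm_offset, with the vma's
         * protection bits. */
        if (remap_page_range(vma->vm_start, SKETCHDEV_PHYS_BASE + vma->vm_offset,
                             size, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}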
 524 
 525 /*
 526  * sanity-check function..
 527  */
 528 static void put_page(pte_t * page_table, pte_t pte)
 529 {
 530         if (!pte_none(*page_table)) {
 531                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 532                 free_page(pte_page(pte));
 533                 return;
 534         }
 535 /* no need for invalidate */
 536         set_pte(page_table, pte);
 537 }
 538 
 539 /*
 540  * This routine is used to map in a page into an address space: needed by
 541  * execve() for the initial stack and environment pages.
 542  */
 543 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 544 {
 545         pgd_t * pgd;
 546         pmd_t * pmd;
 547         pte_t * pte;
 548 
 549         if (page >= high_memory)
 550                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 551         if (mem_map[MAP_NR(page)].count != 1)
 552                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 553         pgd = pgd_offset(tsk->mm,address);
 554         pmd = pmd_alloc(pgd, address);
 555         if (!pmd) {
 556                 free_page(page);
 557                 oom(tsk);
 558                 return 0;
 559         }
 560         pte = pte_alloc(pmd, address);
 561         if (!pte) {
 562                 free_page(page);
 563                 oom(tsk);
 564                 return 0;
 565         }
 566         if (!pte_none(*pte)) {
 567                 printk("put_dirty_page: page already exists\n");
 568                 free_page(page);
 569                 return 0;
 570         }
 571         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 572 /* no need for invalidate */
 573         return page;
 574 }
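
/*
 * For illustration only (not part of mm/memory.c): put_dirty_page() is
 * consumed by the execve() path (cf. setup_arg_pages() in fs/exec.c), which
 * wires the argv/envp pages built by copy_strings() into the new stack.
 * Assumes <linux/binfmts.h> for struct linux_binprm and MAX_ARG_PAGES; the
 * helper name is invented.
 */
static void sketch_install_arg_pages(struct linux_binprm *bprm,
        unsigned long stack_base)
{
        int i;

        /* Each non-empty bprm->page[] already holds argument strings, so it
         * is mapped writable and dirty; no fault is taken when the new
         * program first touches argv/envp. */
        for (i = 0; i < MAX_ARG_PAGES; i++) {
                if (bprm->page[i]) {
                        current->mm->rss++;
                        put_dirty_page(current, bprm->page[i], stack_base);
                }
                stack_base += PAGE_SIZE;
        }
}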
 575 
 576 /*
 577  * This routine handles present pages, when users try to write
 578  * to a shared page. It is done by copying the page to a new address
 579  * and decrementing the shared-page counter for the old page.
 580  *
 581  * Goto-purists beware: the only reason for goto's here is that it results
 582  * in better assembly code.. The "default" path will see no jumps at all.
 583  *
 584  * Note that this routine assumes that the protection checks have been
 585  * done by the caller (the low-level page fault routine in most cases).
 586  * Thus we can safely just mark it writable once we've done any necessary
 587  * COW.
 588  *
 589  * We also mark the page dirty at this point even though the page will
 590  * change only once the write actually happens. This avoids a few races,
 591  * and potentially makes it more efficient.
 592  */
 593 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 594         unsigned long address, int write_access)
 595 {
 596         pgd_t *page_dir;
 597         pmd_t *page_middle;
 598         pte_t *page_table, pte;
 599         unsigned long old_page, new_page;
 600 
 601         new_page = __get_free_page(GFP_KERNEL);
 602         page_dir = pgd_offset(vma->vm_mm, address);
 603         if (pgd_none(*page_dir))
 604                 goto end_wp_page;
 605         if (pgd_bad(*page_dir))
 606                 goto bad_wp_pagedir;
 607         page_middle = pmd_offset(page_dir, address);
 608         if (pmd_none(*page_middle))
 609                 goto end_wp_page;
 610         if (pmd_bad(*page_middle))
 611                 goto bad_wp_pagemiddle;
 612         page_table = pte_offset(page_middle, address);
 613         pte = *page_table;
 614         if (!pte_present(pte))
 615                 goto end_wp_page;
 616         if (pte_write(pte))
 617                 goto end_wp_page;
 618         old_page = pte_page(pte);
 619         if (old_page >= high_memory)
 620                 goto bad_wp_page;
 621         tsk->min_flt++;
 622         /*
 623          * Do we need to copy?
 624          */
 625         if (mem_map[MAP_NR(old_page)].count != 1) {
 626                 if (new_page) {
 627                         if (PageReserved(mem_map + MAP_NR(old_page)))
 628                                 ++vma->vm_mm->rss;
 629                         copy_page(old_page,new_page);
 630                         flush_page_to_ram(old_page);
 631                         flush_page_to_ram(new_page);
 632                         flush_cache_page(vma, address);
 633                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 634                         free_page(old_page);
 635                         flush_tlb_page(vma, address);
 636                         return;
 637                 }
 638                 flush_cache_page(vma, address);
 639                 set_pte(page_table, BAD_PAGE);
 640                 flush_tlb_page(vma, address);
 641                 free_page(old_page);
 642                 oom(tsk);
 643                 return;
 644         }
 645         flush_cache_page(vma, address);
 646         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 647         flush_tlb_page(vma, address);
 648         if (new_page)
 649                 free_page(new_page);
 650         return;
 651 bad_wp_page:
 652         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 653         send_sig(SIGKILL, tsk, 1);
 654         goto end_wp_page;
 655 bad_wp_pagemiddle:
 656         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 657         send_sig(SIGKILL, tsk, 1);
 658         goto end_wp_page;
 659 bad_wp_pagedir:
 660         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 661         send_sig(SIGKILL, tsk, 1);
 662 end_wp_page:
 663         if (new_page)
 664                 free_page(new_page);
 665         return;
 666 }
 667 
 668 /*
 669  * Ugly, ugly, but the goto's result in better assembly..
 670  */
 671 int verify_area(int type, const void * addr, unsigned long size)
 672 {
 673         struct vm_area_struct * vma;
 674         unsigned long start = (unsigned long) addr;
 675 
 676         /* If the current user space is mapped to kernel space (for the
 677          * case where we use a fake user buffer with get_fs/set_fs()) we
 678          * don't expect to find the address in the user vm map.
 679          */
 680         if (!size || get_fs() == KERNEL_DS)
 681                 return 0;
 682 
 683         vma = find_vma(current, start);
 684         if (!vma)
 685                 goto bad_area;
 686         if (vma->vm_start > start)
 687                 goto check_stack;
 688 
 689 good_area:
 690         if (type == VERIFY_WRITE)
 691                 goto check_write;
 692         for (;;) {
 693                 struct vm_area_struct * next;
 694                 if (!(vma->vm_flags & VM_READ))
 695                         goto bad_area;
 696                 if (vma->vm_end - start >= size)
 697                         return 0;
 698                 next = vma->vm_next;
 699                 if (!next || vma->vm_end != next->vm_start)
 700                         goto bad_area;
 701                 vma = next;
 702         }
 703 
 704 check_write:
 705         if (!(vma->vm_flags & VM_WRITE))
 706                 goto bad_area;
 707         if (!wp_works_ok)
 708                 goto check_wp_fault_by_hand;
 709         for (;;) {
 710                 if (vma->vm_end - start >= size)
 711                         break;
 712                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 713                         goto bad_area;
 714                 vma = vma->vm_next;
 715                 if (!(vma->vm_flags & VM_WRITE))
 716                         goto bad_area;
 717         }
 718         return 0;
 719 
 720 check_wp_fault_by_hand:
 721         size--;
 722         size += start & ~PAGE_MASK;
 723         size >>= PAGE_SHIFT;
 724         start &= PAGE_MASK;
 725 
 726         for (;;) {
 727                 do_wp_page(current, vma, start, 1);
 728                 if (!size)
 729                         break;
 730                 size--;
 731                 start += PAGE_SIZE;
 732                 if (start < vma->vm_end)
 733                         continue;
 734                 vma = vma->vm_next;
 735                 if (!vma || vma->vm_start != start)
 736                         goto bad_area;
 737                 if (!(vma->vm_flags & VM_WRITE))
  738                 goto bad_area;
 739         }
 740         return 0;
 741 
 742 check_stack:
 743         if (!(vma->vm_flags & VM_GROWSDOWN))
 744                 goto bad_area;
  745         if (expand_stack(vma, start) == 0)
 746                 goto good_area;
 747 
 748 bad_area:
 749         return -EFAULT;
 750 }
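
/*
 * For illustration only (not part of mm/memory.c): the usual calling pattern
 * for verify_area() in this kernel generation -- validate the user pointer,
 * then move the data with memcpy_tofs()/memcpy_fromfs() from <asm/segment.h>.
 * The helper below is invented.
 */
static int sketch_put_int_to_user(int * uptr, int value)
{
        int error;

        /* Reject pointers that do not lie in a writable mapping.  On CPUs
         * where supervisor-mode writes ignore the WP bit, verify_area()
         * also simulates the COW faults by hand (check_wp_fault_by_hand). */
        error = verify_area(VERIFY_WRITE, uptr, sizeof(int));
        if (error)
                return error;
        memcpy_tofs(uptr, &value, sizeof(int));
        return 0;
}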
 751 
 752 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma,
 753         pte_t * page_table, int write_access)
 754 {
 755         pte_t pte;
 756 
 757         pte = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
 758         if (write_access) {
 759                 unsigned long page = get_free_page(GFP_KERNEL);
 760                 pte = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 761                 vma->vm_mm->rss++;
 762                 tsk->min_flt++;
 763                 if (!page) {
 764                         oom(tsk);
 765                         pte = BAD_PAGE;
 766                 }
 767         }
 768         put_page(page_table, pte);
 769 }
 770 
 771 /*
 772  * This function zeroes out partial mmap'ed pages at truncation time..
 773  */
 774 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
 775 {
 776         pgd_t *page_dir;
 777         pmd_t *page_middle;
 778         pte_t *page_table, pte;
 779 
 780         page_dir = pgd_offset(vma->vm_mm, address);
 781         if (pgd_none(*page_dir))
 782                 return;
 783         if (pgd_bad(*page_dir)) {
 784                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 785                 pgd_clear(page_dir);
 786                 return;
 787         }
 788         page_middle = pmd_offset(page_dir, address);
 789         if (pmd_none(*page_middle))
 790                 return;
 791         if (pmd_bad(*page_middle)) {
 792                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 793                 pmd_clear(page_middle);
 794                 return;
 795         }
 796         page_table = pte_offset(page_middle, address);
 797         pte = *page_table;
 798         if (!pte_present(pte))
 799                 return;
 800         address &= ~PAGE_MASK;
 801         address += pte_page(pte);
 802         if (address >= high_memory)
 803                 return;
 804         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 805 }
 806 
 807 /*
 808  * Handle all mappings that got truncated by a "truncate()"
 809  * system call.
 810  *
 811  * NOTE! We have to be ready to update the memory sharing
 812  * between the file and the memory map for a potential last
 813  * incomplete page.  Ugly, but necessary.
 814  */
 815 void vmtruncate(struct inode * inode, unsigned long offset)
 816 {
 817         struct vm_area_struct * mpnt;
 818 
 819         truncate_inode_pages(inode, offset);
 820         if (!inode->i_mmap)
 821                 return;
 822         mpnt = inode->i_mmap;
 823         do {
 824                 unsigned long start = mpnt->vm_start;
 825                 unsigned long len = mpnt->vm_end - start;
 826                 unsigned long diff;
 827 
 828                 /* mapping wholly truncated? */
 829                 if (mpnt->vm_offset >= offset) {
 830                         zap_page_range(mpnt->vm_mm, start, len);
 831                         continue;
 832                 }
 833                 /* mapping wholly unaffected? */
 834                 diff = offset - mpnt->vm_offset;
 835                 if (diff >= len)
 836                         continue;
 837                 /* Ok, partially affected.. */
 838                 start += diff;
 839                 len = (len - diff) & PAGE_MASK;
 840                 if (start & ~PAGE_MASK) {
 841                         partial_clear(mpnt, start);
 842                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 843                 }
 844                 zap_page_range(mpnt->vm_mm, start, len);
 845         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 846 }
 847 
 848 /*
 849  * fill in an empty page-table if none exists.
 850  */
 851 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 852 {
 853         pgd_t *pgd;
 854         pmd_t *pmd;
 855         pte_t *pte;
 856 
 857         pgd = pgd_offset(tsk->mm, address);
 858         pmd = pmd_alloc(pgd, address);
 859         if (!pmd) {
 860                 oom(tsk);
 861                 return NULL;
 862         }
 863         pte = pte_alloc(pmd, address);
 864         if (!pte) {
 865                 oom(tsk);
 866                 return NULL;
 867         }
 868         return pte;
 869 }
 870 
 871 static inline void do_swap_page(struct task_struct * tsk, 
 872         struct vm_area_struct * vma, unsigned long address,
 873         pte_t * page_table, pte_t entry, int write_access)
 874 {
 875         pte_t page;
 876 
 877         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 878                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 879                 return;
 880         }
 881         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 882         if (pte_val(*page_table) != pte_val(entry)) {
 883                 free_page(pte_page(page));
 884                 return;
 885         }
 886         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 887                 page = pte_wrprotect(page);
 888         ++vma->vm_mm->rss;
 889         ++tsk->maj_flt;
 890         set_pte(page_table, page);
 891         return;
 892 }
 893 
 894 /*
 895  * do_no_page() tries to create a new page mapping. It aggressively
 896  * tries to share with existing pages, but makes a separate copy if
 897  * the "write_access" parameter is true in order to avoid the next
 898  * page fault.
 899  */
 900 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 901         unsigned long address, int write_access)
 902 {
 903         pte_t * page_table;
 904         pte_t entry;
 905         unsigned long page;
 906 
 907         page_table = get_empty_pgtable(tsk, address);
 908         if (!page_table)
 909                 return;
 910         entry = *page_table;
 911         if (pte_present(entry))
 912                 return;
 913         if (!pte_none(entry)) {
 914                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 915                 return;
 916         }
 917         address &= PAGE_MASK;
 918         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 919                 flush_cache_page(vma, address);
 920                 get_empty_page(tsk, vma, page_table, write_access);
 921                 return;
 922         }
 923         ++tsk->maj_flt;
 924         ++vma->vm_mm->rss;
 925         /*
 926          * The third argument is "no_share", which tells the low-level code
 927          * to copy, not share the page even if sharing is possible.  It's
 928          * essentially an early COW detection 
 929          */
 930         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
 931         if (!page) {
 932                 send_sig(SIGBUS, current, 1);
 933                 flush_cache_page(vma, address);
 934                 put_page(page_table, BAD_PAGE);
 935                 flush_tlb_page(vma, address);
 936                 return;
 937         }
 938         /*
 939          * This silly early PAGE_DIRTY setting removes a race
 940          * due to the bad i386 page protection. But it's valid
 941          * for other architectures too.
 942          *
 943          * Note that if write_access is true, we either now have
 944          * an exclusive copy of the page, or this is a shared mapping,
 945          * so we can make it writable and dirty to avoid having to
 946          * handle that later.
 947          */
 948         entry = mk_pte(page, vma->vm_page_prot);
 949         if (write_access) {
 950                 entry = pte_mkwrite(pte_mkdirty(entry));
 951         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 952                 entry = pte_wrprotect(entry);
 953         flush_cache_page(vma, address);
 954         put_page(page_table, entry);
 955         flush_tlb_page(vma, address);
 956 }
 957 
 958 /*
 959  * The above separate functions for the no-page and wp-page
 960  * cases will go away (they mostly do the same thing anyway),
 961  * and we'll instead use only a general "handle_mm_fault()".
 962  *
 963  * These routines also need to handle stuff like marking pages dirty
 964  * and/or accessed for architectures that don't do it in hardware (most
 965  * RISC architectures).  The early dirtying is also good on the i386.
 966  *
 967  * There is also a hook called "update_mmu_cache()" that architectures
 968  * with external mmu caches can use to update those (ie the Sparc or
 969  * PowerPC hashed page tables that act as extended TLBs).
 970  */
 971 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
 972         int write_access, pte_t * pte)
 973 {
 974         if (!pte_present(*pte)) {
 975                 do_no_page(current, vma, address, write_access);
 976                 return;
 977         }
 978         set_pte(pte, pte_mkyoung(*pte));
 979         if (!write_access)
 980                 return;
 981         if (pte_write(*pte)) {
 982                 set_pte(pte, pte_mkdirty(*pte));
 983                 return;
 984         }
 985         do_wp_page(current, vma, address, write_access);
 986 }
 987 
 988 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
 989         int write_access)
 990 {
 991         pgd_t *pgd;
 992         pmd_t *pmd;
 993         pte_t *pte;
 994 
 995         pgd = pgd_offset(vma->vm_mm, address);
 996         pmd = pmd_alloc(pgd, address);
 997         if (!pmd)
 998                 goto no_memory;
 999         pte = pte_alloc(pmd, address);
1000         if (!pte)
1001                 goto no_memory;
1002         handle_pte_fault(vma, address, write_access, pte);
1003         update_mmu_cache(vma, address, *pte);
1004         return;
1005 no_memory:
1006         oom(current);
1007 }
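
/*
 * For illustration only (not part of mm/memory.c): a heavily simplified
 * sketch of how an architecture's page-fault handler drives
 * handle_mm_fault().  The real i386 version (arch/i386/mm/fault.c) also
 * copes with kernel-mode faults, VM_GROWSDOWN stack expansion and the
 * wp_works_ok test; the function name is invented.
 */
static void sketch_do_page_fault(struct task_struct * tsk,
        unsigned long address, int write)
{
        struct vm_area_struct * vma;

        vma = find_vma(tsk, address);
        if (!vma || vma->vm_start > address)
                goto bad_area;          /* no mapping covers the address */
        if (write ? !(vma->vm_flags & VM_WRITE) : !(vma->vm_flags & VM_READ))
                goto bad_area;          /* access not allowed by the vma */

        /* Allocates the pmd/pte levels if necessary, then dispatches to
         * do_no_page(), do_swap_page() or do_wp_page() via
         * handle_pte_fault(). */
        handle_mm_fault(vma, address, write);
        return;
bad_area:
        send_sig(SIGSEGV, tsk, 1);
}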
