root/mm/memory.c

DEFINITIONS

This source file includes the following definitions:
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. partial_clear
  28. vmtruncate
  29. get_empty_pgtable
  30. do_swap_page
  31. do_no_page
  32. handle_pte_fault
  33. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
   13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 
  47 #include <asm/system.h>
  48 #include <asm/segment.h>
  49 #include <asm/pgtable.h>
  50 
  51 unsigned long high_memory = 0;
  52 
  53 /*
  54  * The free_area_list arrays point to the queue heads of the free areas
  55  * of different sizes
  56  */
  57 int nr_swap_pages = 0;
  58 int nr_free_pages = 0;
  59 struct mem_list free_area_list[NR_MEM_LISTS];
  60 unsigned int * free_area_map[NR_MEM_LISTS];
  61 
  62 /*
  63  * We special-case the C-O-W ZERO_PAGE, because it's such
  64  * a common occurrence (no need to read the page to know
  65  * that it's zero - better for the cache and memory subsystem).
  66  */
  67 static inline void copy_page(unsigned long from, unsigned long to)
  68 {
  69         if (from == ZERO_PAGE) {
  70                 memset((void *) to, 0, PAGE_SIZE);
  71                 return;
  72         }
  73         memcpy((void *) to, (void *) from, PAGE_SIZE);
  74 }
  75 
  76 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  77 
  78 mem_map_t * mem_map = NULL;
  79 
  80 /*
  81  * oom() prints a message (so that the user knows why the process died),
  82  * and gives the process an untrappable SIGKILL.
  83  */
  84 void oom(struct task_struct * task)
  85 {
   86         printk("\nOut of memory for %s.\n", task->comm);
  87         task->sig->action[SIGKILL-1].sa_handler = NULL;
  88         task->blocked &= ~(1<<(SIGKILL-1));
  89         send_sig(SIGKILL,task,1);
  90 }
  91 
  92 /*
  93  * Note: this doesn't free the actual pages themselves. That
  94  * has been handled earlier when unmapping all the memory regions.
  95  */
  96 static inline void free_one_pmd(pmd_t * dir)
  97 {
  98         pte_t * pte;
  99 
 100         if (pmd_none(*dir))
 101                 return;
 102         if (pmd_bad(*dir)) {
 103                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 104                 pmd_clear(dir);
 105                 return;
 106         }
 107         pte = pte_offset(dir, 0);
 108         pmd_clear(dir);
 109         pte_free(pte);
 110 }
 111 
 112 static inline void free_one_pgd(pgd_t * dir)
 113 {
 114         pmd_t * pmd;
 115 
 116         if (pgd_none(*dir))
 117                 return;
 118         if (pgd_bad(*dir)) {
 119                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 120                 pgd_clear(dir);
 121                 return;
 122         }
 123         pmd = pmd_offset(dir, 0);
 124         pgd_clear(dir);
 125         if (!pmd_inuse(pmd)) {
 126                 int j;
 127                 for (j = 0; j < PTRS_PER_PMD ; j++)
 128                         free_one_pmd(pmd+j);
 129         }
 130         pmd_free(pmd);
 131 }
 132         
 133 /*
 134  * This function clears all user-level page tables of a process - this
 135  * is needed by execve(), so that old pages aren't in the way.
 136  */
 137 void clear_page_tables(struct task_struct * tsk)
 138 {
 139         int i;
 140         pgd_t * page_dir;
 141 
 142         page_dir = tsk->mm->pgd;
 143         if (!page_dir || page_dir == swapper_pg_dir) {
 144                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 145                 return;
 146         }
 147         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 148                 free_one_pgd(page_dir + i);
 149         invalidate_mm(tsk->mm);
 150 }
 151 
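      /*
       * Usage sketch (illustrative only, not compiled; the real call site is
       * in fs/exec.c and differs in detail): execve() reuses an unshared mm
       * by first dropping the old vmas and then clearing the user page tables:
       */
      #if 0
              exit_mmap(current->mm);         /* unmap and free the old vm_areas */
              clear_page_tables(current);     /* then drop the now-empty user page tables */
      #endif
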
 152 /*
 153  * This function frees up all page tables of a process when it exits. It
 154  * is the same as "clear_page_tables()", except it also changes the process'
 155  * page table directory to the kernel page tables and then frees the old
 156  * page table directory.
 157  */
 158 void free_page_tables(struct task_struct * tsk)
 159 {
 160         int i;
 161         pgd_t * page_dir;
 162 
 163         page_dir = tsk->mm->pgd;
 164         if (!page_dir || page_dir == swapper_pg_dir) {
 165                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 166                 return;
 167         }
 168         invalidate_mm(tsk->mm);
 169         SET_PAGE_DIR(tsk, swapper_pg_dir);
 170         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 171         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 172                 free_one_pgd(page_dir + i);
 173         pgd_free(page_dir);
 174 }
 175 
 176 int new_page_tables(struct task_struct * tsk)
 177 {
 178         pgd_t * page_dir, * new_pg;
 179         int i;
 180 
 181         if (!(new_pg = pgd_alloc()))
 182                 return -ENOMEM;
 183         page_dir = pgd_offset(&init_mm, 0);
 184         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 185                 new_pg[i] = page_dir[i];
 186         invalidate_mm(tsk->mm);
 187         SET_PAGE_DIR(tsk, new_pg);
 188         tsk->mm->pgd = new_pg;
 189         return 0;
 190 }
 191 
 192 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
 193 {
 194         pte_t pte = *old_pte;
 195 
 196         if (pte_none(pte))
 197                 return;
 198         if (!pte_present(pte)) {
 199                 swap_duplicate(pte_val(pte));
 200                 set_pte(new_pte, pte);
 201                 return;
 202         }
  203         if (pte_page(pte) >= high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 204                 set_pte(new_pte, pte);
 205                 return;
 206         }
 207         if (cow)
 208                 pte = pte_wrprotect(pte);
 209         if (delete_from_swap_cache(pte_page(pte)))
 210                 pte = pte_mkdirty(pte);
 211         set_pte(new_pte, pte_mkold(pte));
 212         set_pte(old_pte, pte);
 213         mem_map[MAP_NR(pte_page(pte))].count++;
 214 }
 215 
 216 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
 217 {
 218         pte_t * src_pte, * dst_pte;
 219         unsigned long end;
 220 
 221         if (pmd_none(*src_pmd))
 222                 return 0;
 223         if (pmd_bad(*src_pmd)) {
 224                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 225                 pmd_clear(src_pmd);
 226                 return 0;
 227         }
 228         src_pte = pte_offset(src_pmd, address);
 229         if (pmd_none(*dst_pmd)) {
 230                 if (!pte_alloc(dst_pmd, 0))
 231                         return -ENOMEM;
 232         }
 233         dst_pte = pte_offset(dst_pmd, address);
 234         address &= ~PMD_MASK;
 235         end = address + size;
 236         if (end >= PMD_SIZE)
 237                 end = PMD_SIZE;
 238         do {
 239                 /* I would like to switch arguments here, to make it
 240                  * consistent with copy_xxx_range and memcpy syntax.
 241                  */
 242                 copy_one_pte(src_pte++, dst_pte++, cow);
 243                 address += PAGE_SIZE;
 244         } while (address < end);
 245         return 0;
 246 }
 247 
 248 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
 249 {
 250         pmd_t * src_pmd, * dst_pmd;
 251         unsigned long end;
 252         int error = 0;
 253 
 254         if (pgd_none(*src_pgd))
 255                 return 0;
 256         if (pgd_bad(*src_pgd)) {
 257                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 258                 pgd_clear(src_pgd);
 259                 return 0;
 260         }
 261         src_pmd = pmd_offset(src_pgd, address);
 262         if (pgd_none(*dst_pgd)) {
 263                 if (!pmd_alloc(dst_pgd, 0))
 264                         return -ENOMEM;
 265         }
 266         dst_pmd = pmd_offset(dst_pgd, address);
 267         address &= ~PGDIR_MASK;
 268         end = address + size;
 269         if (end > PGDIR_SIZE)
 270                 end = PGDIR_SIZE;
 271         do {
 272                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
 273                 if (error)
 274                         break;
 275                 address = (address + PMD_SIZE) & PMD_MASK; 
 276         } while (address < end);
 277         return error;
 278 }
 279 
 280 /*
  281  * copy one vm_area from one task to the other. Assumes that any page
  282  * table entries already present in the new task are clear over the whole
  283  * range covered by this vma.
 284  */
 285 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 286                         struct vm_area_struct *vma)
 287 {
 288         pgd_t * src_pgd, * dst_pgd;
 289         unsigned long address = vma->vm_start;
 290         unsigned long end = vma->vm_end;
 291         int error = 0, cow;
 292 
 293         cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
 294         src_pgd = pgd_offset(src, address);
 295         dst_pgd = pgd_offset(dst, address);
 296         while (address < end) {
 297                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
 298                 if (error)
 299                         break;
 300                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 301         }
 302         /* Note that the src ptes get c-o-w treatment, so they change too. */
 303         invalidate_range(src, vma->vm_start, vma->vm_end);
 304         invalidate_range(dst, vma->vm_start, vma->vm_end);
 305         return error;
 306 }
 307 
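      /*
       * Usage sketch (illustrative only, not compiled; the real code lives in
       * kernel/fork.c and differs in detail): fork() gives the child fresh page
       * tables with new_page_tables() and then copies every vma's ptes, so that
       * parent and child end up sharing the pages copy-on-write:
       */
      #if 0
              if (new_page_tables(tsk))               /* kernel part shared, user part empty */
                      return -ENOMEM;
              for (vma = current->mm->mmap ; vma ; vma = vma->vm_next)
                      if (copy_page_range(tsk->mm, current->mm, vma))
                              return -ENOMEM;
      #endif
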
 308 static inline void forget_pte(pte_t page)
 309 {
 310         if (pte_none(page))
 311                 return;
 312         if (pte_present(page)) {
 313                 unsigned long addr = pte_page(page);
 314                 if (addr >= high_memory || mem_map[MAP_NR(addr)].reserved)
 315                         return;
 316                 free_page(addr);
 317                 if (current->mm->rss <= 0)
 318                         return;
 319                 current->mm->rss--;
 320                 return;
 321         }
 322         swap_free(pte_val(page));
 323 }
 324 
 325 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 326 {
 327         pte_t * pte;
 328         unsigned long end;
 329 
 330         if (pmd_none(*pmd))
 331                 return;
 332         if (pmd_bad(*pmd)) {
 333                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 334                 pmd_clear(pmd);
 335                 return;
 336         }
 337         pte = pte_offset(pmd, address);
 338         address &= ~PMD_MASK;
 339         end = address + size;
 340         if (end >= PMD_SIZE)
 341                 end = PMD_SIZE;
 342         do {
 343                 pte_t page = *pte;
 344                 pte_clear(pte);
 345                 forget_pte(page);
 346                 address += PAGE_SIZE;
 347                 pte++;
 348         } while (address < end);
 349 }
 350 
 351 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 352 {
 353         pmd_t * pmd;
 354         unsigned long end;
 355 
 356         if (pgd_none(*dir))
 357                 return;
 358         if (pgd_bad(*dir)) {
 359                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 360                 pgd_clear(dir);
 361                 return;
 362         }
 363         pmd = pmd_offset(dir, address);
 364         address &= ~PGDIR_MASK;
 365         end = address + size;
 366         if (end > PGDIR_SIZE)
 367                 end = PGDIR_SIZE;
 368         do {
 369                 zap_pte_range(pmd, address, end - address);
 370                 address = (address + PMD_SIZE) & PMD_MASK; 
 371                 pmd++;
 372         } while (address < end);
 373 }
 374 
 375 /*
 376  * remove user pages in a given range.
 377  */
 378 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 379 {
 380         pgd_t * dir;
 381         unsigned long end = address + size;
 382 
 383         dir = pgd_offset(mm, address);
 384         while (address < end) {
 385                 zap_pmd_range(dir, address, end - address);
 386                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 387                 dir++;
 388         }
 389         invalidate_range(mm, end - size, end);
 390         return 0;
 391 }
 392 
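      /*
       * Usage sketch (illustrative only, not compiled): do_munmap() and friends
       * tear down the ptes of a region once the vma bookkeeping is done, roughly
       * as mm/mmap.c does; addr and len are page-aligned here:
       */
      #if 0
              zap_page_range(current->mm, addr, len);
      #endif
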
 393 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 394 {
 395         unsigned long end;
 396 
 397         address &= ~PMD_MASK;
 398         end = address + size;
 399         if (end > PMD_SIZE)
 400                 end = PMD_SIZE;
 401         do {
 402                 pte_t oldpage = *pte;
 403                 set_pte(pte, zero_pte);
 404                 forget_pte(oldpage);
 405                 address += PAGE_SIZE;
 406                 pte++;
 407         } while (address < end);
 408 }
 409 
 410 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 411 {
 412         unsigned long end;
 413 
 414         address &= ~PGDIR_MASK;
 415         end = address + size;
 416         if (end > PGDIR_SIZE)
 417                 end = PGDIR_SIZE;
 418         do {
 419                 pte_t * pte = pte_alloc(pmd, address);
 420                 if (!pte)
 421                         return -ENOMEM;
 422                 zeromap_pte_range(pte, address, end - address, zero_pte);
 423                 address = (address + PMD_SIZE) & PMD_MASK;
 424                 pmd++;
 425         } while (address < end);
 426         return 0;
 427 }
 428 
 429 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 430 {
 431         int error = 0;
 432         pgd_t * dir;
 433         unsigned long end = address + size;
 434         pte_t zero_pte;
 435 
 436         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 437         dir = pgd_offset(current->mm, address);
 438         while (address < end) {
 439                 pmd_t *pmd = pmd_alloc(dir, address);
 440                 error = -ENOMEM;
 441                 if (!pmd)
 442                         break;
 443                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 444                 if (error)
 445                         break;
 446                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 447                 dir++;
 448         }
 449         invalidate_range(current->mm, end - size, end);
 450         return error;
 451 }
 452 
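      /*
       * Usage sketch (illustrative only, not compiled): point "size" bytes of a
       * user range of the current process at the global zero page; every pte is
       * write-protected, so all readers share the single ZERO_PAGE:
       */
      #if 0
              if (zeromap_page_range(addr, size, PAGE_READONLY))
                      return -ENOMEM;
      #endif
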
 453 /*
 454  * maps a range of physical memory into the requested pages. the old
  455  * mappings are removed. any references to nonexistent pages result
 456  * in null mappings (currently treated as "copy-on-access")
 457  */
 458 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 459         unsigned long offset, pgprot_t prot)
 460 {
 461         unsigned long end;
 462 
 463         address &= ~PMD_MASK;
 464         end = address + size;
 465         if (end > PMD_SIZE)
 466                 end = PMD_SIZE;
 467         do {
 468                 pte_t oldpage = *pte;
 469                 pte_clear(pte);
 470                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 471                         set_pte(pte, mk_pte(offset, prot));
 472                 forget_pte(oldpage);
 473                 address += PAGE_SIZE;
 474                 offset += PAGE_SIZE;
 475                 pte++;
 476         } while (address < end);
 477 }
 478 
 479 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 480         unsigned long offset, pgprot_t prot)
 481 {
 482         unsigned long end;
 483 
 484         address &= ~PGDIR_MASK;
 485         end = address + size;
 486         if (end > PGDIR_SIZE)
 487                 end = PGDIR_SIZE;
 488         offset -= address;
 489         do {
 490                 pte_t * pte = pte_alloc(pmd, address);
 491                 if (!pte)
 492                         return -ENOMEM;
 493                 remap_pte_range(pte, address, end - address, address + offset, prot);
 494                 address = (address + PMD_SIZE) & PMD_MASK;
 495                 pmd++;
 496         } while (address < end);
 497         return 0;
 498 }
 499 
 500 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 501 {
 502         int error = 0;
 503         pgd_t * dir;
 504         unsigned long end = from + size;
 505 
 506         offset -= from;
 507         dir = pgd_offset(current->mm, from);
 508         while (from < end) {
 509                 pmd_t *pmd = pmd_alloc(dir, from);
 510                 error = -ENOMEM;
 511                 if (!pmd)
 512                         break;
 513                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 514                 if (error)
 515                         break;
 516                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 517                 dir++;
 518         }
  519         invalidate_range(current->mm, end - size, end);
 520         return error;
 521 }
 522 
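      /*
       * Usage sketch (illustrative only, not compiled): a device driver's mmap()
       * of this vintage hands a physical range to user space with
       * remap_page_range(); "buf_phys" is a made-up physical base address:
       */
      #if 0
      static int example_mmap(struct inode * inode, struct file * file,
              struct vm_area_struct * vma)
      {
              return remap_page_range(vma->vm_start, buf_phys + vma->vm_offset,
                      vma->vm_end - vma->vm_start, vma->vm_page_prot);
      }
      #endif
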
 523 /*
 524  * sanity-check function..
 525  */
 526 static void put_page(pte_t * page_table, pte_t pte)
 527 {
 528         if (!pte_none(*page_table)) {
 529                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 530                 free_page(pte_page(pte));
 531                 return;
 532         }
 533 /* no need for invalidate */
 534         set_pte(page_table, pte);
 535 }
 536 
 537 /*
 538  * This routine is used to map in a page into an address space: needed by
 539  * execve() for the initial stack and environment pages.
 540  */
 541 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 542 {
 543         pgd_t * pgd;
 544         pmd_t * pmd;
 545         pte_t * pte;
 546 
 547         if (page >= high_memory)
 548                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 549         if (mem_map[MAP_NR(page)].count != 1)
 550                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 551         pgd = pgd_offset(tsk->mm,address);
 552         pmd = pmd_alloc(pgd, address);
 553         if (!pmd) {
 554                 free_page(page);
 555                 oom(tsk);
 556                 return 0;
 557         }
 558         pte = pte_alloc(pmd, address);
 559         if (!pte) {
 560                 free_page(page);
 561                 oom(tsk);
 562                 return 0;
 563         }
 564         if (!pte_none(*pte)) {
 565                 printk("put_dirty_page: page already exists\n");
 566                 free_page(page);
 567                 return 0;
 568         }
 569         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 570 /* no need for invalidate */
 571         return page;
 572 }
 573 
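      /*
       * Usage sketch (illustrative only, not compiled; fs/exec.c's argument page
       * setup is the real caller and differs in detail): execve() pushes the
       * collected argument/environment pages just below the top of the new user
       * stack ("stack_top" is a stand-in name for that address):
       */
      #if 0
              stack_base = stack_top - MAX_ARG_PAGES*PAGE_SIZE;
              for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
                      if (bprm->page[i])
                              put_dirty_page(current, bprm->page[i], stack_base);
                      stack_base += PAGE_SIZE;
              }
      #endif
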
 574 /*
 575  * This routine handles present pages, when users try to write
 576  * to a shared page. It is done by copying the page to a new address
 577  * and decrementing the shared-page counter for the old page.
 578  *
 579  * Goto-purists beware: the only reason for goto's here is that it results
 580  * in better assembly code.. The "default" path will see no jumps at all.
 581  *
 582  * Note that this routine assumes that the protection checks have been
 583  * done by the caller (the low-level page fault routine in most cases).
 584  * Thus we can safely just mark it writable once we've done any necessary
 585  * COW.
 586  *
 587  * We also mark the page dirty at this point even though the page will
 588  * change only once the write actually happens. This avoids a few races,
 589  * and potentially makes it more efficient.
 590  */
 591 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 592         unsigned long address, int write_access)
 593 {
 594         pgd_t *page_dir;
 595         pmd_t *page_middle;
 596         pte_t *page_table, pte;
 597         unsigned long old_page, new_page;
 598 
 599         new_page = __get_free_page(GFP_KERNEL);
 600         page_dir = pgd_offset(vma->vm_mm, address);
 601         if (pgd_none(*page_dir))
 602                 goto end_wp_page;
 603         if (pgd_bad(*page_dir))
 604                 goto bad_wp_pagedir;
 605         page_middle = pmd_offset(page_dir, address);
 606         if (pmd_none(*page_middle))
 607                 goto end_wp_page;
 608         if (pmd_bad(*page_middle))
 609                 goto bad_wp_pagemiddle;
 610         page_table = pte_offset(page_middle, address);
 611         pte = *page_table;
 612         if (!pte_present(pte))
 613                 goto end_wp_page;
 614         if (pte_write(pte))
 615                 goto end_wp_page;
 616         old_page = pte_page(pte);
 617         if (old_page >= high_memory)
 618                 goto bad_wp_page;
 619         tsk->min_flt++;
 620         /*
 621          * Do we need to copy?
 622          */
 623         if (mem_map[MAP_NR(old_page)].count != 1) {
 624                 if (new_page) {
 625                         if (mem_map[MAP_NR(old_page)].reserved)
 626                                 ++vma->vm_mm->rss;
 627                         copy_page(old_page,new_page);
 628                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 629                         free_page(old_page);
 630                         invalidate_page(vma, address);
 631                         return;
 632                 }
 633                 set_pte(page_table, BAD_PAGE);
 634                 free_page(old_page);
 635                 oom(tsk);
 636                 invalidate_page(vma, address);
 637                 return;
 638         }
 639         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 640         invalidate_page(vma, address);
 641         if (new_page)
 642                 free_page(new_page);
 643         return;
 644 bad_wp_page:
 645         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 646         send_sig(SIGKILL, tsk, 1);
 647         goto end_wp_page;
 648 bad_wp_pagemiddle:
 649         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 650         send_sig(SIGKILL, tsk, 1);
 651         goto end_wp_page;
 652 bad_wp_pagedir:
 653         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 654         send_sig(SIGKILL, tsk, 1);
 655 end_wp_page:
 656         if (new_page)
 657                 free_page(new_page);
 658         return;
 659 }
 660 
 661 /*
 662  * Ugly, ugly, but the goto's result in better assembly..
 663  */
 664 int verify_area(int type, const void * addr, unsigned long size)
 665 {
 666         struct vm_area_struct * vma;
 667         unsigned long start = (unsigned long) addr;
 668 
 669         /* If the current user space is mapped to kernel space (for the
 670          * case where we use a fake user buffer with get_fs/set_fs()) we
 671          * don't expect to find the address in the user vm map.
 672          */
 673         if (!size || get_fs() == get_ds())
 674                 return 0;
 675 
 676         vma = find_vma(current, start);
 677         if (!vma)
 678                 goto bad_area;
 679         if (vma->vm_start <= start)
 680                 goto good_area;
 681         if (!(vma->vm_flags & VM_GROWSDOWN))
 682                 goto bad_area;
 683         if (expand_stack(vma, start))
 684                 goto bad_area;
 685 
 686 good_area:
 687         if (type == VERIFY_WRITE)
 688                 goto check_write;
 689         for (;;) {
 690                 struct vm_area_struct * next;
 691                 if (!(vma->vm_flags & VM_READ))
 692                         goto bad_area;
 693                 if (vma->vm_end - start >= size)
 694                         return 0;
 695                 next = vma->vm_next;
 696                 if (!next || vma->vm_end != next->vm_start)
 697                         goto bad_area;
 698                 vma = next;
 699         }
 700 
 701 check_write:
 702         if (!(vma->vm_flags & VM_WRITE))
 703                 goto bad_area;
 704         if (!wp_works_ok)
 705                 goto check_wp_fault_by_hand;
 706         for (;;) {
 707                 if (vma->vm_end - start >= size)
 708                         break;
 709                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 710                         goto bad_area;
 711                 vma = vma->vm_next;
 712                 if (!(vma->vm_flags & VM_WRITE))
 713                         goto bad_area;
 714         }
 715         return 0;
 716 
 717 check_wp_fault_by_hand:
 718         size--;
 719         size += start & ~PAGE_MASK;
 720         size >>= PAGE_SHIFT;
 721         start &= PAGE_MASK;
 722 
 723         for (;;) {
 724                 do_wp_page(current, vma, start, 1);
 725                 if (!size)
 726                         break;
 727                 size--;
 728                 start += PAGE_SIZE;
 729                 if (start < vma->vm_end)
 730                         continue;
 731                 vma = vma->vm_next;
 732                 if (!vma || vma->vm_start != start)
 733                         goto bad_area;
 734                 if (!(vma->vm_flags & VM_WRITE))
  735                 goto bad_area;
 736         }
 737         return 0;
 738 
 739 bad_area:
 740         return -EFAULT;
 741 }
 742 
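      /*
       * Usage sketch (illustrative only, not compiled; "sys_example" and its
       * kernel buffer are made up): every system call that touches a user
       * buffer is expected to check it before using the *fs copy routines:
       */
      #if 0
      asmlinkage int sys_example(char * buf, int count)
      {
              int error;

              error = verify_area(VERIFY_WRITE, buf, count);
              if (error)
                      return error;
              memcpy_tofs(buf, example_kernel_data, count);   /* copy out to user space */
              return count;
      }
      #endif
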
 743 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 744 {
 745         unsigned long tmp;
 746 
 747         if (!(tmp = get_free_page(GFP_KERNEL))) {
 748                 oom(tsk);
 749                 put_page(page_table, BAD_PAGE);
 750                 return;
 751         }
 752         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 753 }
 754 
 755 /*
 756  * This function zeroes out partial mmap'ed pages at truncation time..
 757  */
 758 static void partial_clear(struct vm_area_struct *vma, unsigned long address)
 759 {
 760         pgd_t *page_dir;
 761         pmd_t *page_middle;
 762         pte_t *page_table, pte;
 763 
 764         page_dir = pgd_offset(vma->vm_mm, address);
 765         if (pgd_none(*page_dir))
 766                 return;
 767         if (pgd_bad(*page_dir)) {
 768                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 769                 pgd_clear(page_dir);
 770                 return;
 771         }
 772         page_middle = pmd_offset(page_dir, address);
 773         if (pmd_none(*page_middle))
 774                 return;
 775         if (pmd_bad(*page_middle)) {
  776                 printk("bad page middle entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 777                 pmd_clear(page_middle);
 778                 return;
 779         }
 780         page_table = pte_offset(page_middle, address);
 781         pte = *page_table;
 782         if (!pte_present(pte))
 783                 return;
 784         address &= ~PAGE_MASK;
 785         address += pte_page(pte);
 786         if (address >= high_memory)
 787                 return;
 788         memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
 789 }
 790 
 791 /*
 792  * Handle all mappings that got truncated by a "truncate()"
 793  * system call.
 794  *
 795  * NOTE! We have to be ready to update the memory sharing
 796  * between the file and the memory map for a potential last
 797  * incomplete page.  Ugly, but necessary.
 798  */
 799 void vmtruncate(struct inode * inode, unsigned long offset)
 800 {
 801         struct vm_area_struct * mpnt;
 802 
 803         invalidate_inode_pages(inode, offset);
 804         if (!inode->i_mmap)
 805                 return;
 806         mpnt = inode->i_mmap;
 807         do {
 808                 unsigned long start = mpnt->vm_start;
 809                 unsigned long len = mpnt->vm_end - start;
 810                 unsigned long diff;
 811 
 812                 /* mapping wholly truncated? */
 813                 if (mpnt->vm_offset >= offset) {
 814                         zap_page_range(mpnt->vm_mm, start, len);
 815                         continue;
 816                 }
 817                 /* mapping wholly unaffected? */
 818                 diff = offset - mpnt->vm_offset;
 819                 if (diff >= len)
 820                         continue;
 821                 /* Ok, partially affected.. */
 822                 start += diff;
 823                 len = (len - diff) & PAGE_MASK;
 824                 if (start & ~PAGE_MASK) {
 825                         partial_clear(mpnt, start);
 826                         start = (start + ~PAGE_MASK) & PAGE_MASK;
 827                 }
 828                 zap_page_range(mpnt->vm_mm, start, len);
 829         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
 830 }
 831 
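      /*
       * Usage sketch (illustrative only, not compiled; the generic truncate path
       * in the VFS differs in detail): once an inode's size has been cut, the
       * mappings beyond the new end of file are dropped before the file system
       * frees the disk blocks:
       */
      #if 0
              inode->i_size = length;
              vmtruncate(inode, length);
              if (inode->i_op && inode->i_op->truncate)
                      inode->i_op->truncate(inode);
      #endif
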
 832 /*
 833  * fill in an empty page-table if none exists.
 834  */
 835 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
 836 {
 837         pgd_t *pgd;
 838         pmd_t *pmd;
 839         pte_t *pte;
 840 
 841         pgd = pgd_offset(tsk->mm, address);
 842         pmd = pmd_alloc(pgd, address);
 843         if (!pmd) {
 844                 oom(tsk);
 845                 return NULL;
 846         }
 847         pte = pte_alloc(pmd, address);
 848         if (!pte) {
 849                 oom(tsk);
 850                 return NULL;
 851         }
 852         return pte;
 853 }
 854 
 855 static inline void do_swap_page(struct task_struct * tsk, 
 856         struct vm_area_struct * vma, unsigned long address,
 857         pte_t * page_table, pte_t entry, int write_access)
 858 {
 859         pte_t page;
 860 
 861         if (!vma->vm_ops || !vma->vm_ops->swapin) {
 862                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
 863                 return;
 864         }
 865         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
 866         if (pte_val(*page_table) != pte_val(entry)) {
 867                 free_page(pte_page(page));
 868                 return;
 869         }
 870         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
 871                 page = pte_wrprotect(page);
 872         ++vma->vm_mm->rss;
 873         ++tsk->maj_flt;
 874         set_pte(page_table, page);
 875         return;
 876 }
 877 
 878 /*
 879  * do_no_page() tries to create a new page mapping. It aggressively
 880  * tries to share with existing pages, but makes a separate copy if
 881  * the "write_access" parameter is true in order to avoid the next
 882  * page fault.
 883  */
 884 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 885         unsigned long address, int write_access)
 886 {
 887         pte_t * page_table;
 888         pte_t entry;
 889         unsigned long page;
 890 
 891         page_table = get_empty_pgtable(tsk, address);
 892         if (!page_table)
 893                 return;
 894         entry = *page_table;
 895         if (pte_present(entry))
 896                 return;
 897         if (!pte_none(entry)) {
 898                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
 899                 return;
 900         }
 901         address &= PAGE_MASK;
 902         if (!vma->vm_ops || !vma->vm_ops->nopage) {
 903                 ++vma->vm_mm->rss;
 904                 ++tsk->min_flt;
 905                 get_empty_page(tsk, vma, page_table);
 906                 return;
 907         }
 908         ++tsk->maj_flt;
 909         ++vma->vm_mm->rss;
 910         /*
 911          * The third argument is "no_share", which tells the low-level code
 912          * to copy, not share the page even if sharing is possible.  It's
  913          * essentially an early COW detection.
 914          */
 915         page = vma->vm_ops->nopage(vma, address, write_access && !(vma->vm_flags & VM_SHARED));
 916         if (!page) {
 917                 send_sig(SIGBUS, current, 1);
 918                 put_page(page_table, BAD_PAGE);
 919                 return;
 920         }
 921         /*
 922          * This silly early PAGE_DIRTY setting removes a race
 923          * due to the bad i386 page protection. But it's valid
 924          * for other architectures too.
 925          *
 926          * Note that if write_access is true, we either now have
  927          * an exclusive copy of the page, or this is a shared mapping,
 928          * so we can make it writable and dirty to avoid having to
 929          * handle that later.
 930          */
 931         entry = mk_pte(page, vma->vm_page_prot);
 932         if (write_access) {
 933                 entry = pte_mkwrite(pte_mkdirty(entry));
 934         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
 935                 entry = pte_wrprotect(entry);
 936         put_page(page_table, entry);
 937 }
 938 
 939 /*
 940  * The above separate functions for the no-page and wp-page
 941  * cases will go away (they mostly do the same thing anyway),
 942  * and we'll instead use only a general "handle_mm_fault()".
 943  *
 944  * These routines also need to handle stuff like marking pages dirty
 945  * and/or accessed for architectures that don't do it in hardware (most
 946  * RISC architectures).  The early dirtying is also good on the i386.
 947  *
 948  * There is also a hook called "update_mmu_cache()" that architectures
 949  * with external mmu caches can use to update those (ie the Sparc or
 950  * PowerPC hashed page tables that act as extended TLBs).
 951  */
 952 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
 953         int write_access, pte_t * pte)
 954 {
 955         if (!pte_present(*pte)) {
 956                 do_no_page(current, vma, address, write_access);
 957                 return;
 958         }
 959         set_pte(pte, pte_mkyoung(*pte));
 960         if (!write_access)
 961                 return;
 962         if (pte_write(*pte)) {
 963                 set_pte(pte, pte_mkdirty(*pte));
 964                 return;
 965         }
 966         do_wp_page(current, vma, address, write_access);
 967 }
 968 
 969 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
 970         int write_access)
 971 {
 972         pgd_t *pgd;
 973         pmd_t *pmd;
 974         pte_t *pte;
 975 
 976         pgd = pgd_offset(vma->vm_mm, address);
 977         pmd = pmd_alloc(pgd, address);
 978         if (!pmd)
 979                 goto no_memory;
 980         pte = pte_alloc(pmd, address);
 981         if (!pte)
 982                 goto no_memory;
 983         handle_pte_fault(vma, address, write_access, pte);
 984         update_mmu_cache(vma, address, *pte);
 985         return;
 986 no_memory:
 987         oom(current);
 988 }
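
      /*
       * Usage sketch (illustrative only, not compiled; see the architecture's
       * mm/fault.c for the real thing): a low-level page fault handler looks up
       * the vma, checks permissions, and hands the rest to handle_mm_fault();
       * stack growth and the error paths are omitted here:
       */
      #if 0
              vma = find_vma(current, address);
              if (vma && vma->vm_start <= address) {
                      if (!write || (vma->vm_flags & VM_WRITE))
                              handle_mm_fault(vma, address, write);
              }
      #endif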
