root/mm/memory.c


DEFINITIONS

This source file includes the following definitions.
  1. copy_page
  2. oom
  3. free_one_pmd
  4. free_one_pgd
  5. clear_page_tables
  6. free_page_tables
  7. new_page_tables
  8. copy_one_pte
  9. copy_pte_range
  10. copy_pmd_range
  11. copy_page_range
  12. forget_pte
  13. zap_pte_range
  14. zap_pmd_range
  15. zap_page_range
  16. zeromap_pte_range
  17. zeromap_pmd_range
  18. zeromap_page_range
  19. remap_pte_range
  20. remap_pmd_range
  21. remap_page_range
  22. put_page
  23. put_dirty_page
  24. do_wp_page
  25. verify_area
  26. get_empty_page
  27. try_to_share
  28. share_page
  29. unshare
  30. vmtruncate
  31. get_empty_pgtable
  32. do_swap_page
  33. do_no_page
  34. handle_pte_fault
  35. handle_mm_fault

   1 /*
   2  *  linux/mm/memory.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6 
   7 /*
   8  * demand-loading started 01.12.91 - seems it is high on the list of
   9  * things wanted, and it should be easy to implement. - Linus
  10  */
  11 
  12 /*
  13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  14  * pages started 02.12.91, seems to work. - Linus.
  15  *
  16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  17  * would have taken more than the 6M I have free, but it worked well as
  18  * far as I could see.
  19  *
  20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  21  */
  22 
  23 /*
  24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
  25  * thought has to go into this. Oh, well..
  26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  27  *              Found it. Everything seems to work now.
  28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
  29  */
  30 
  31 /*
  32  * 05.04.94  -  Multi-page memory management added for v1.1.
  33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  34  */
  35 
  36 #include <linux/signal.h>
  37 #include <linux/sched.h>
  38 #include <linux/head.h>
  39 #include <linux/kernel.h>
  40 #include <linux/errno.h>
  41 #include <linux/string.h>
  42 #include <linux/types.h>
  43 #include <linux/ptrace.h>
  44 #include <linux/mman.h>
  45 #include <linux/mm.h>
  46 
  47 #include <asm/system.h>
  48 #include <asm/segment.h>
  49 #include <asm/pgtable.h>
  50 
  51 unsigned long high_memory = 0;
  52 
  53 /*
  54  * The free_area_list arrays point to the queue heads of the free areas
  55  * of different sizes
  56  */
  57 int nr_swap_pages = 0;
  58 int nr_free_pages = 0;
  59 struct mem_list free_area_list[NR_MEM_LISTS];
  60 unsigned char * free_area_map[NR_MEM_LISTS];
  61 
  62 /*
  63  * We special-case the C-O-W ZERO_PAGE, because it's such
  64  * a common occurrence (no need to read the page to know
  65  * that it's zero - better for the cache and memory subsystem).
  66  */
  67 static inline void copy_page(unsigned long from, unsigned long to)
  68 {
  69         if (from == ZERO_PAGE) {
  70                 memset((void *) to, 0, PAGE_SIZE);
  71                 return;
  72         }
  73         memcpy((void *) to, (void *) from, PAGE_SIZE);
  74 }
  75 
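/*
 * Editorial sketch, not part of the original source: the real caller of
 * copy_page() is do_wp_page() below.  The pattern is to allocate a fresh
 * page and duplicate the shared one; when the source turns out to be the
 * global ZERO_PAGE (e.g. a page previously mapped read-only by
 * zeromap_page_range()), the copy collapses into a memset().
 */
static unsigned long sample_unshare_page(unsigned long old_page)
{
        unsigned long new_page = __get_free_page(GFP_KERNEL);

        if (new_page)
                copy_page(old_page, new_page);  /* memset() only if old_page == ZERO_PAGE */
        return new_page;
}
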
  76 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
  77 
  78 mem_map_t * mem_map = NULL;
  79 
  80 /*
  81  * oom() prints a message (so that the user knows why the process died),
  82  * and gives the process an untrappable SIGKILL.
  83  */
  84 void oom(struct task_struct * task)
  85 {
  86         printk("\nOut of memory for %s.\n", current->comm);
  87         task->sig->action[SIGKILL-1].sa_handler = NULL;
  88         task->blocked &= ~(1<<(SIGKILL-1));
  89         send_sig(SIGKILL,task,1);
  90 }
  91 
  92 /*
  93  * Note: this doesn't free the actual pages themselves. That
  94  * has been handled earlier when unmapping all the memory regions.
  95  */
  96 static inline void free_one_pmd(pmd_t * dir)
  97 {
  98         pte_t * pte;
  99 
 100         if (pmd_none(*dir))
 101                 return;
 102         if (pmd_bad(*dir)) {
 103                 printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
 104                 pmd_clear(dir);
 105                 return;
 106         }
 107         pte = pte_offset(dir, 0);
 108         pmd_clear(dir);
 109         pte_free(pte);
 110 }
 111 
 112 static inline void free_one_pgd(pgd_t * dir)
 113 {
 114         pmd_t * pmd;
 115 
 116         if (pgd_none(*dir))
 117                 return;
 118         if (pgd_bad(*dir)) {
 119                 printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
 120                 pgd_clear(dir);
 121                 return;
 122         }
 123         pmd = pmd_offset(dir, 0);
 124         pgd_clear(dir);
 125         if (!pmd_inuse(pmd)) {
 126                 int j;
 127                 for (j = 0; j < PTRS_PER_PMD ; j++)
 128                         free_one_pmd(pmd+j);
 129         }
 130         pmd_free(pmd);
 131 }
 132         
 133 /*
 134  * This function clears all user-level page tables of a process - this
 135  * is needed by execve(), so that old pages aren't in the way.
 136  */
 137 void clear_page_tables(struct task_struct * tsk)
 138 {
 139         int i;
 140         pgd_t * page_dir;
 141 
 142         page_dir = tsk->mm->pgd;
 143         if (!page_dir || page_dir == swapper_pg_dir) {
 144                 printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
 145                 return;
 146         }
 147         for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
 148                 free_one_pgd(page_dir + i);
 149         invalidate_mm(tsk->mm);
 150 }
 151 
 152 /*
 153  * This function frees up all page tables of a process when it exits. It
 154  * is the same as "clear_page_tables()", except it also changes the process'
 155  * page table directory to the kernel page tables and then frees the old
 156  * page table directory.
 157  */
 158 void free_page_tables(struct task_struct * tsk)
 159 {
 160         int i;
 161         pgd_t * page_dir;
 162 
 163         page_dir = tsk->mm->pgd;
 164         if (!page_dir || page_dir == swapper_pg_dir) {
 165                 printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
 166                 return;
 167         }
 168         invalidate_mm(tsk->mm);
 169         SET_PAGE_DIR(tsk, swapper_pg_dir);
 170         tsk->mm->pgd = swapper_pg_dir;  /* or else... */
 171         for (i = 0 ; i < PTRS_PER_PGD ; i++)
 172                 free_one_pgd(page_dir + i);
 173         pgd_free(page_dir);
 174 }
 175 
 176 int new_page_tables(struct task_struct * tsk)
 177 {
 178         pgd_t * page_dir, * new_pg;
 179         int i;
 180 
 181         if (!(new_pg = pgd_alloc()))
 182                 return -ENOMEM;
 183         page_dir = pgd_offset(&init_mm, 0);
 184         for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
 185                 new_pg[i] = page_dir[i];
 186         invalidate_mm(tsk->mm);
 187         SET_PAGE_DIR(tsk, new_pg);
 188         tsk->mm->pgd = new_pg;
 189         return 0;
 190 }
 191 
 192 static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
 193 {
 194         pte_t pte = *old_pte;
 195 
 196         if (pte_none(pte))
 197                 return;
 198         if (!pte_present(pte)) {
 199                 swap_duplicate(pte_val(pte));
 200                 set_pte(new_pte, pte);
 201                 return;
 202         }
 203         if (pte_page(pte) > high_memory || mem_map[MAP_NR(pte_page(pte))].reserved) {
 204                 set_pte(new_pte, pte);
 205                 return;
 206         }
 207         if (pte_cow(pte))
 208                 pte = pte_wrprotect(pte);
 209         if (delete_from_swap_cache(pte_page(pte)))
 210                 pte = pte_mkdirty(pte);
 211         set_pte(new_pte, pte_mkold(pte));
 212         set_pte(old_pte, pte);
 213         mem_map[MAP_NR(pte_page(pte))].count++;
 214 }
 215 
 216 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size)
 217 {
 218         pte_t * src_pte, * dst_pte;
 219         unsigned long end;
 220 
 221         if (pmd_none(*src_pmd))
 222                 return 0;
 223         if (pmd_bad(*src_pmd)) {
 224                 printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
 225                 pmd_clear(src_pmd);
 226                 return 0;
 227         }
 228         src_pte = pte_offset(src_pmd, address);
 229         if (pmd_none(*dst_pmd)) {
 230                 if (!pte_alloc(dst_pmd, 0))
 231                         return -ENOMEM;
 232         }
 233         dst_pte = pte_offset(dst_pmd, address);
 234         address &= ~PMD_MASK;
 235         end = address + size;
 236         if (end >= PMD_SIZE)
 237                 end = PMD_SIZE;
 238         do {
 239                 /* I would like to switch arguments here, to make it
 240                  * consistent with copy_xxx_range and memcpy syntax.
 241                  */
 242                 copy_one_pte(src_pte++, dst_pte++);
 243                 address += PAGE_SIZE;
 244         } while (address < end);
 245         return 0;
 246 }
 247 
 248 static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size)
 249 {
 250         pmd_t * src_pmd, * dst_pmd;
 251         unsigned long end;
 252         int error = 0;
 253 
 254         if (pgd_none(*src_pgd))
 255                 return 0;
 256         if (pgd_bad(*src_pgd)) {
 257                 printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
 258                 pgd_clear(src_pgd);
 259                 return 0;
 260         }
 261         src_pmd = pmd_offset(src_pgd, address);
 262         if (pgd_none(*dst_pgd)) {
 263                 if (!pmd_alloc(dst_pgd, 0))
 264                         return -ENOMEM;
 265         }
 266         dst_pmd = pmd_offset(dst_pgd, address);
 267         address &= ~PGDIR_MASK;
 268         end = address + size;
 269         if (end > PGDIR_SIZE)
 270                 end = PGDIR_SIZE;
 271         do {
 272                 error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address);
 273                 if (error)
 274                         break;
 275                 address = (address + PMD_SIZE) & PMD_MASK; 
 276         } while (address < end);
 277         return error;
 278 }
 279 
 280 /*
 281  * copy one vm_area from one task to the other. Assumes that the page
 282  * tables already present in the new task have been cleared over the
 283  * whole range covered by this vma.
 284  */
 285 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 286                         struct vm_area_struct *vma)
 287 {
 288         pgd_t * src_pgd, * dst_pgd;
 289         unsigned long address = vma->vm_start;
 290         unsigned long end = vma->vm_end;
 291         int error = 0;
 292 
 293         src_pgd = pgd_offset(src, address);
 294         dst_pgd = pgd_offset(dst, address);
 295         while (address < end) {
 296                 error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address);
 297                 if (error)
 298                         break;
 299                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 300         }
 301         /* Note that the src ptes get c-o-w treatment, so they change too. */
 302         invalidate_range(src, vma->vm_start, vma->vm_end);
 303         invalidate_range(dst, vma->vm_start, vma->vm_end);
 304         return error;
 305 }
 306 
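/*
 * Editorial sketch, not part of the original source: a fork()-style caller,
 * roughly in the manner of dup_mmap() in kernel/fork.c, walks the parent's
 * vma list and copies each range into the child's freshly allocated page
 * tables.  "child_mm" is a hypothetical pointer to the child's mm_struct.
 */
static int sample_dup_mmap(struct mm_struct * child_mm)
{
        struct vm_area_struct * vma;

        for (vma = current->mm->mmap; vma; vma = vma->vm_next)
                if (copy_page_range(child_mm, current->mm, vma))
                        return -ENOMEM;
        return 0;
}
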
 307 static inline void forget_pte(pte_t page)
 308 {
 309         if (pte_none(page))
 310                 return;
 311         if (pte_present(page)) {
 312                 free_page(pte_page(page));
 313                 if (mem_map[MAP_NR(pte_page(page))].reserved)
 314                         return;
 315                 if (current->mm->rss <= 0)
 316                         return;
 317                 current->mm->rss--;
 318                 return;
 319         }
 320         swap_free(pte_val(page));
 321 }
 322 
 323 static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
 324 {
 325         pte_t * pte;
 326         unsigned long end;
 327 
 328         if (pmd_none(*pmd))
 329                 return;
 330         if (pmd_bad(*pmd)) {
 331                 printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 332                 pmd_clear(pmd);
 333                 return;
 334         }
 335         pte = pte_offset(pmd, address);
 336         address &= ~PMD_MASK;
 337         end = address + size;
 338         if (end >= PMD_SIZE)
 339                 end = PMD_SIZE;
 340         do {
 341                 pte_t page = *pte;
 342                 pte_clear(pte);
 343                 forget_pte(page);
 344                 address += PAGE_SIZE;
 345                 pte++;
 346         } while (address < end);
 347 }
 348 
 349 static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
 350 {
 351         pmd_t * pmd;
 352         unsigned long end;
 353 
 354         if (pgd_none(*dir))
 355                 return;
 356         if (pgd_bad(*dir)) {
 357                 printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
 358                 pgd_clear(dir);
 359                 return;
 360         }
 361         pmd = pmd_offset(dir, address);
 362         address &= ~PGDIR_MASK;
 363         end = address + size;
 364         if (end > PGDIR_SIZE)
 365                 end = PGDIR_SIZE;
 366         do {
 367                 zap_pte_range(pmd, address, end - address);
 368                 address = (address + PMD_SIZE) & PMD_MASK; 
 369                 pmd++;
 370         } while (address < end);
 371 }
 372 
 373 /*
 374  * remove user pages in a given range.
 375  */
 376 int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
 377 {
 378         pgd_t * dir;
 379         unsigned long end = address + size;
 380 
 381         dir = pgd_offset(mm, address);
 382         while (address < end) {
 383                 zap_pmd_range(dir, address, end - address);
 384                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 385                 dir++;
 386         }
 387         invalidate_range(mm, end - size, end);
 388         return 0;
 389 }
 390 
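/*
 * Editorial sketch, not part of the original source: munmap()-style callers
 * (and vmtruncate() below) tear down a whole region with one call spanning
 * the vma.
 */
static void sample_unmap(struct vm_area_struct * vma)
{
        zap_page_range(vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start);
}
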
 391 static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte)
 392 {
 393         unsigned long end;
 394 
 395         address &= ~PMD_MASK;
 396         end = address + size;
 397         if (end > PMD_SIZE)
 398                 end = PMD_SIZE;
 399         do {
 400                 pte_t oldpage = *pte;
 401                 set_pte(pte, zero_pte);
 402                 forget_pte(oldpage);
 403                 address += PAGE_SIZE;
 404                 pte++;
 405         } while (address < end);
 406 }
 407 
 408 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte)
 409 {
 410         unsigned long end;
 411 
 412         address &= ~PGDIR_MASK;
 413         end = address + size;
 414         if (end > PGDIR_SIZE)
 415                 end = PGDIR_SIZE;
 416         do {
 417                 pte_t * pte = pte_alloc(pmd, address);
 418                 if (!pte)
 419                         return -ENOMEM;
 420                 zeromap_pte_range(pte, address, end - address, zero_pte);
 421                 address = (address + PMD_SIZE) & PMD_MASK;
 422                 pmd++;
 423         } while (address < end);
 424         return 0;
 425 }
 426 
 427 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
 428 {
 429         int error = 0;
 430         pgd_t * dir;
 431         unsigned long end = address + size;
 432         pte_t zero_pte;
 433 
 434         zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
 435         dir = pgd_offset(current->mm, address);
 436         while (address < end) {
 437                 pmd_t *pmd = pmd_alloc(dir, address);
 438                 error = -ENOMEM;
 439                 if (!pmd)
 440                         break;
 441                 error = zeromap_pmd_range(pmd, address, end - address, zero_pte);
 442                 if (error)
 443                         break;
 444                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 445                 dir++;
 446         }
 447         invalidate_range(current->mm, end - size, end);
 448         return error;
 449 }
 450 
 451 /*
 452  * maps a range of physical memory into the requested pages. the old
 453  * mappings are removed. any references to nonexistent pages result
 454  * in null mappings (currently treated as "copy-on-access")
 455  */
 456 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
 457         unsigned long offset, pgprot_t prot)
 458 {
 459         unsigned long end;
 460 
 461         address &= ~PMD_MASK;
 462         end = address + size;
 463         if (end > PMD_SIZE)
 464                 end = PMD_SIZE;
 465         do {
 466                 pte_t oldpage = *pte;
 467                 pte_clear(pte);
 468                 if (offset >= high_memory || mem_map[MAP_NR(offset)].reserved)
 469                         set_pte(pte, mk_pte(offset, prot));
 470                 forget_pte(oldpage);
 471                 address += PAGE_SIZE;
 472                 offset += PAGE_SIZE;
 473                 pte++;
 474         } while (address < end);
 475 }
 476 
 477 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
 478         unsigned long offset, pgprot_t prot)
 479 {
 480         unsigned long end;
 481 
 482         address &= ~PGDIR_MASK;
 483         end = address + size;
 484         if (end > PGDIR_SIZE)
 485                 end = PGDIR_SIZE;
 486         offset -= address;
 487         do {
 488                 pte_t * pte = pte_alloc(pmd, address);
 489                 if (!pte)
 490                         return -ENOMEM;
 491                 remap_pte_range(pte, address, end - address, address + offset, prot);
 492                 address = (address + PMD_SIZE) & PMD_MASK;
 493                 pmd++;
 494         } while (address < end);
 495         return 0;
 496 }
 497 
 498 int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
 499 {
 500         int error = 0;
 501         pgd_t * dir;
 502         unsigned long end = from + size;
 503 
 504         offset -= from;
 505         dir = pgd_offset(current->mm, from);
 506         while (from < end) {
 507                 pmd_t *pmd = pmd_alloc(dir, from);
 508                 error = -ENOMEM;
 509                 if (!pmd)
 510                         break;
 511                 error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
 512                 if (error)
 513                         break;
 514                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
 515                 dir++;
 516         }
 517         invalidate_range(current->mm, from - size, from);
 518         return error;
 519 }
 520 
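/*
 * Editorial sketch, not part of the original source: a character-device
 * mmap() implementation, in the style of drivers/char/mem.c, hands the whole
 * vma to remap_page_range() to map physical/device memory into user space.
 * "sample_mmap" is a hypothetical driver method.
 */
static int sample_mmap(struct inode * inode, struct file * file,
        struct vm_area_struct * vma)
{
        if (remap_page_range(vma->vm_start, vma->vm_offset,
                             vma->vm_end - vma->vm_start, vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}
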
 521 /*
 522  * sanity-check function..
 523  */
 524 static void put_page(pte_t * page_table, pte_t pte)
 525 {
 526         if (!pte_none(*page_table)) {
 527                 printk("put_page: page already exists %08lx\n", pte_val(*page_table));
 528                 free_page(pte_page(pte));
 529                 return;
 530         }
 531 /* no need for invalidate */
 532         set_pte(page_table, pte);
 533 }
 534 
 535 /*
 536  * This routine is used to map in a page into an address space: needed by
 537  * execve() for the initial stack and environment pages.
 538  */
 539 unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address)
 540 {
 541         pgd_t * pgd;
 542         pmd_t * pmd;
 543         pte_t * pte;
 544 
 545         if (page >= high_memory)
 546                 printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
 547         if (mem_map[MAP_NR(page)].count != 1)
 548                 printk("mem_map disagrees with %08lx at %08lx\n",page,address);
 549         pgd = pgd_offset(tsk->mm,address);
 550         pmd = pmd_alloc(pgd, address);
 551         if (!pmd) {
 552                 free_page(page);
 553                 oom(tsk);
 554                 return 0;
 555         }
 556         pte = pte_alloc(pmd, address);
 557         if (!pte) {
 558                 free_page(page);
 559                 oom(tsk);
 560                 return 0;
 561         }
 562         if (!pte_none(*pte)) {
 563                 printk("put_dirty_page: page already exists\n");
 564                 free_page(page);
 565                 return 0;
 566         }
 567         set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
 568 /* no need for invalidate */
 569         return page;
 570 }
 571 
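/*
 * Editorial sketch, not part of the original source: execve()'s stack setup,
 * roughly in the style of setup_arg_pages() in fs/exec.c, drops each argument
 * and environment page into the new address space.  "page[]" and "stack_base"
 * stand in for the binprm page array and the chosen stack location.
 */
static unsigned long sample_setup_arg_pages(unsigned long * page, unsigned long stack_base)
{
        int i;

        for (i = 0; i < MAX_ARG_PAGES; i++) {
                if (page[i])
                        put_dirty_page(current, page[i], stack_base);
                stack_base += PAGE_SIZE;
        }
        return stack_base;
}
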
 572 /*
 573  * This routine handles present pages, when users try to write
 574  * to a shared page. It is done by copying the page to a new address
 575  * and decrementing the shared-page counter for the old page.
 576  *
 577  * Goto-purists beware: the only reason for goto's here is that it results
 578  * in better assembly code.. The "default" path will see no jumps at all.
 579  *
 580  * Note that this routine assumes that the protection checks have been
 581  * done by the caller (the low-level page fault routine in most cases).
 582  * Thus we can safely just mark it writable once we've done any necessary
 583  * COW.
 584  *
 585  * We also mark the page dirty at this point even though the page will
 586  * change only once the write actually happens. This avoids a few races,
 587  * and potentially makes it more efficient.
 588  */
 589 void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 590         unsigned long address, int write_access)
 591 {
 592         pgd_t *page_dir;
 593         pmd_t *page_middle;
 594         pte_t *page_table, pte;
 595         unsigned long old_page, new_page;
 596 
 597         new_page = __get_free_page(GFP_KERNEL);
 598         page_dir = pgd_offset(vma->vm_mm, address);
 599         if (pgd_none(*page_dir))
 600                 goto end_wp_page;
 601         if (pgd_bad(*page_dir))
 602                 goto bad_wp_pagedir;
 603         page_middle = pmd_offset(page_dir, address);
 604         if (pmd_none(*page_middle))
 605                 goto end_wp_page;
 606         if (pmd_bad(*page_middle))
 607                 goto bad_wp_pagemiddle;
 608         page_table = pte_offset(page_middle, address);
 609         pte = *page_table;
 610         if (!pte_present(pte))
 611                 goto end_wp_page;
 612         if (pte_write(pte))
 613                 goto end_wp_page;
 614         old_page = pte_page(pte);
 615         if (old_page >= high_memory)
 616                 goto bad_wp_page;
 617         tsk->min_flt++;
 618         /*
 619          * Do we need to copy?
 620          */
 621         if (mem_map[MAP_NR(old_page)].count != 1) {
 622                 if (new_page) {
 623                         if (mem_map[MAP_NR(old_page)].reserved)
 624                                 ++vma->vm_mm->rss;
 625                         copy_page(old_page,new_page);
 626                         set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 627                         free_page(old_page);
 628                         invalidate_page(vma, address);
 629                         return;
 630                 }
 631                 set_pte(page_table, BAD_PAGE);
 632                 free_page(old_page);
 633                 oom(tsk);
 634                 invalidate_page(vma, address);
 635                 return;
 636         }
 637         set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
 638         invalidate_page(vma, address);
 639         if (new_page)
 640                 free_page(new_page);
 641         return;
 642 bad_wp_page:
 643         printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
 644         send_sig(SIGKILL, tsk, 1);
 645         goto end_wp_page;
 646 bad_wp_pagemiddle:
 647         printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
 648         send_sig(SIGKILL, tsk, 1);
 649         goto end_wp_page;
 650 bad_wp_pagedir:
 651         printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
 652         send_sig(SIGKILL, tsk, 1);
 653 end_wp_page:
 654         if (new_page)
 655                 free_page(new_page);
 656         return;
 657 }
 658 
 659 /*
 660  * Ugly, ugly, but the goto's result in better assembly..
 661  */
 662 int verify_area(int type, const void * addr, unsigned long size)
 663 {
 664         struct vm_area_struct * vma;
 665         unsigned long start = (unsigned long) addr;
 666 
 667         /* If the current user space is mapped to kernel space (for the
 668          * case where we use a fake user buffer with get_fs/set_fs()) we
 669          * don't expect to find the address in the user vm map.
 670          */
 671         if (get_fs() == get_ds())
 672                 return 0;
 673 
 674         vma = find_vma(current, start);
 675         if (!vma)
 676                 goto bad_area;
 677         if (vma->vm_start <= start)
 678                 goto good_area;
 679         if (!(vma->vm_flags & VM_GROWSDOWN))
 680                 goto bad_area;
 681         if (expand_stack(vma, start))
 682                 goto bad_area;
 683 
 684 good_area:
 685         if (type == VERIFY_WRITE)
 686                 goto check_write;
 687         for (;;) {
 688                 struct vm_area_struct * next;
 689                 if (!(vma->vm_flags & VM_READ))
 690                         goto bad_area;
 691                 if (vma->vm_end - start >= size)
 692                         return 0;
 693                 next = vma->vm_next;
 694                 if (!next || vma->vm_end != next->vm_start)
 695                         goto bad_area;
 696                 vma = next;
 697         }
 698 
 699 check_write:
 700         if (!(vma->vm_flags & VM_WRITE))
 701                 goto bad_area;
 702         if (!wp_works_ok)
 703                 goto check_wp_fault_by_hand;
 704         for (;;) {
 705                 if (vma->vm_end - start >= size)
 706                         break;
 707                 if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
 708                         goto bad_area;
 709                 vma = vma->vm_next;
 710                 if (!(vma->vm_flags & VM_WRITE))
 711                         goto bad_area;
 712         }
 713         return 0;
 714 
 715 check_wp_fault_by_hand:
 716         size--;
 717         size += start & ~PAGE_MASK;
 718         size >>= PAGE_SHIFT;
 719         start &= PAGE_MASK;
 720 
 721         for (;;) {
 722                 do_wp_page(current, vma, start, 1);
 723                 if (!size)
 724                         break;
 725                 size--;
 726                 start += PAGE_SIZE;
 727                 if (start < vma->vm_end)
 728                         continue;
 729                 vma = vma->vm_next;
 730                 if (!vma || vma->vm_start != start)
 731                         goto bad_area;
 732                 if (!(vma->vm_flags & VM_WRITE))
 733                         goto bad_area;
 734         }
 735         return 0;
 736 
 737 bad_area:
 738         return -EFAULT;
 739 }
 740 
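/*
 * Editorial sketch, not part of the original source: a system call validates
 * a user-space buffer with verify_area() before touching it with the
 * fs-segment helpers from <asm/segment.h>.  "buf", "count" and "sample_data"
 * are hypothetical.
 */
static int sample_read(char * buf, int count)
{
        int error = verify_area(VERIFY_WRITE, buf, count);

        if (error)
                return error;
        memcpy_tofs(buf, sample_data, count);   /* range is now known to be writable */
        return count;
}
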
 741 static inline void get_empty_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table)
 742 {
 743         unsigned long tmp;
 744 
 745         if (!(tmp = get_free_page(GFP_KERNEL))) {
 746                 oom(tsk);
 747                 put_page(page_table, BAD_PAGE);
 748                 return;
 749         }
 750         put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
 751 }
 752 
 753 /*
 754  * try_to_share() checks the page at address "address" in the task "p",
 755  * to see if it exists, and if it is clean. If so, share it with the current
 756  * task.
 757  *
 758  * NOTE! This assumes we have checked that p != current, and that they
 759  * share the same inode and can generally otherwise be shared.
 760  */
 761 static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
 762         unsigned long from_address, struct vm_area_struct * from_area,
 763         unsigned long newpage)
 764 {
 765         pgd_t * from_dir, * to_dir;
 766         pmd_t * from_middle, * to_middle;
 767         pte_t * from_table, * to_table;
 768         pte_t from, to;
 769 
 770         from_dir = pgd_offset(from_area->vm_mm,from_address);
 771 /* is there a page-directory at from? */
 772         if (pgd_none(*from_dir))
 773                 return 0;
 774         if (pgd_bad(*from_dir)) {
 775                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
 776                 pgd_clear(from_dir);
 777                 return 0;
 778         }
 779         from_middle = pmd_offset(from_dir, from_address);
 780 /* is there a mid-directory at from? */
 781         if (pmd_none(*from_middle))
 782                 return 0;
 783         if (pmd_bad(*from_middle)) {
 784                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
 785                 pmd_clear(from_middle);
 786                 return 0;
 787         }
 788         from_table = pte_offset(from_middle, from_address);
 789         from = *from_table;
 790 /* is the page present? */
 791         if (!pte_present(from))
 792                 return 0;
 793 /* if it is dirty it must be from a shared mapping to be shared */
 794         if (pte_dirty(from)) {
 795                 if (!(from_area->vm_flags & VM_SHARED))
 796                         return 0;
 797         }
 798 /* is the page reasonable at all? */
 799         if (pte_page(from) >= high_memory)
 800                 return 0;
 801         if (mem_map[MAP_NR(pte_page(from))].reserved)
 802                 return 0;
 803 /* is the destination ok? */
 804         to_dir = pgd_offset(to_area->vm_mm,to_address);
 805 /* is there a page-directory at to? */
 806         if (pgd_none(*to_dir))
 807                 return 0;
 808         if (pgd_bad(*to_dir)) {
 809                 printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
 810                 return 0;
 811         }
 812         to_middle = pmd_offset(to_dir, to_address);
 813 /* is there a mid-directory at to? */
 814         if (pmd_none(*to_middle))
 815                 return 0;
 816         if (pmd_bad(*to_middle)) {
 817                 printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
 818                 return 0;
 819         }
 820         to_table = pte_offset(to_middle, to_address);
 821         to = *to_table;
 822         if (!pte_none(to))
 823                 return 0;
 824 /* do we copy? */
 825         if (newpage) {
 826                 /* if it's in the swap cache, it's dirty by implication */
 827                 /* so we can't use it if it's not from a shared mapping */
 828                 if (in_swap_cache(pte_page(from))) {
 829                         if (!(from_area->vm_flags & VM_SHARED))
 830                                 return 0;
 831                 }
 832                 copy_page(pte_page(from), newpage);
 833                 set_pte(to_table, mk_pte(newpage, to_area->vm_page_prot));
 834                 return 1;
 835         }
 836 /*
 837  * do a final swap-cache test before sharing them: if it's in the swap
 838  * cache, we have to remove it now, as we get two pointers to the same
 839  * physical page and the cache can't handle it. Mark the original dirty.
 840  *
 841  * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
 842  * with a dirty "from", the from-mapping is a shared map, so we can trust
 843  * the page contents to be up-to-date
 844  */
 845         if (in_swap_cache(pte_page(from))) {
 846                 if (!(from_area->vm_flags & VM_SHARED))
 847                         return 0;
 848                 set_pte(from_table, pte_mkdirty(from));
 849                 delete_from_swap_cache(pte_page(from));
 850         }
 851         mem_map[MAP_NR(pte_page(from))].count++;
 852         set_pte(to_table, mk_pte(pte_page(from), to_area->vm_page_prot));
 853 /* Check if we need to do anything at all to the 'from' field */
 854         if (!pte_write(from))
 855                 return 1;
 856         if (from_area->vm_flags & VM_SHARED)
 857                 return 1;
 858 /* ok, need to mark it read-only, so invalidate any possible old TB entry */
 859         set_pte(from_table, pte_wrprotect(from));
 860         invalidate_page(from_area, from_address);
 861         return 1;
 862 }
 863 
 864 /*
 865  * share_page() tries to find a process that could share a page with
 866  * the current one.
 867  *
 868  * We first check if it is at all feasible by checking inode->i_count.
 869  * It should be >1 if there are other tasks sharing this inode.
 870  */
 871 static int share_page(struct vm_area_struct * area, unsigned long address,
 872         int write_access, unsigned long newpage)
 873 {
 874         struct inode * inode;
 875         unsigned long offset;
 876         unsigned long from_address;
 877         unsigned long give_page;
 878         struct vm_area_struct * mpnt;
 879 
 880         if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
 881                 return 0;
 882         /* do we need to copy or can we just share? */
 883         give_page = 0;
 884         if (write_access && !(area->vm_flags & VM_SHARED)) {
 885                 if (!newpage)
 886                         return 0;
 887                 give_page = newpage;
 888         }
 889         offset = address - area->vm_start + area->vm_offset;
 890         /* See if there is something in the VM we can share pages with. */
 891         /* Traverse the entire circular i_mmap list, except `area' itself. */
 892         for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
 893                 /* must be same inode */
 894                 if (mpnt->vm_inode != inode) {
 895                         printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
 896                         break;  
 897                 }
 898                 /* offsets must be mutually page-aligned */
 899                 if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
 900                         continue;
 901                 /* the other area must actually cover the wanted page.. */
 902                 from_address = offset + mpnt->vm_start - mpnt->vm_offset;
 903                 if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
 904                         continue;
 905                 /* .. NOW we can actually try to use the same physical page */
 906                 if (!try_to_share(address, area, from_address, mpnt, give_page))
 907                         continue;
 908                 /* free newpage if we never used it.. */
 909                 if (give_page || !newpage)
 910                         return 1;
 911                 free_page(newpage);
 912                 return 1;
 913         }
 914         return 0;
 915 }
 916 
 917 /*
 918  * This function tries to find a page that is shared with the buffer cache,
 919  * and if so it moves the buffer cache to a new location.
 920  *
 921  * It returns non-zero if we used up the "new_page" page.
 922  */
 923 static int unshare(struct vm_area_struct *vma, unsigned long address, unsigned long new_page)
 924 {
 925         pgd_t *page_dir;
 926         pmd_t *page_middle;
 927         pte_t *page_table, pte;
 928         unsigned long old_page;
 929         struct buffer_head * bh, * tmp;
 930 
 931         page_dir = pgd_offset(vma->vm_mm, address);
 932         if (pgd_none(*page_dir))
 933                 return 0;
 934         if (pgd_bad(*page_dir)) {
 935                 printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
 936                 pgd_clear(page_dir);
 937                 return 0;
 938         }
 939         page_middle = pmd_offset(page_dir, address);
 940         if (pmd_none(*page_middle))
 941                 return 0;
 942         if (pmd_bad(*page_middle)) {
 943                 printk("bad page middle directory entry %p:[%lx]\n", page_middle, pmd_val(*page_middle));
 944                 pmd_clear(page_middle);
 945                 return 0;
 946         }
 947         page_table = pte_offset(page_middle, address);
 948         pte = *page_table;
 949         if (!pte_present(pte))
 950                 return 0;
 951         old_page = pte_page(pte);
 952         if (MAP_NR(old_page) > MAP_NR(high_memory))
 953                 return 0;
 954         address &= ~PAGE_MASK;
 955         memset((void *) (old_page + address), 0, PAGE_SIZE - address);
 956         bh = buffer_pages[MAP_NR(old_page)];
 957         if (!bh)
 958                 return 0;
 959         if (!new_page) {
 960                 printk("Aieee... unshare(): no page available\n");
 961                 return 0;
 962         }
 963         buffer_pages[MAP_NR(old_page)] = NULL;
 964         copy_page(old_page, new_page);
 965         free_page(old_page);
 966         old_page -= new_page;
 967         buffer_pages[MAP_NR(new_page)] = bh;
 968         tmp = bh;
 969         do {
 970                 tmp->b_data -= old_page;
 971                 tmp = tmp->b_this_page;
 972         } while (tmp != bh);
 973         return 1;
 974 }
 975 
 976 /*
 977  * Handle all mappings that got truncated by a "truncate()"
 978  * system call.
 979  *
 980  * NOTE! We have to be ready to update the memory sharing
 981  * between the file and the memory map for a potential last
 982  * incomplete page.  Ugly, but necessary.
 983  */
 984 void vmtruncate(struct inode * inode, unsigned long offset)
 985 {
 986         unsigned long page;
 987         struct vm_area_struct * mpnt;
 988 
 989         if (!inode->i_mmap)
 990                 return;
 991         page = __get_free_page(GFP_KERNEL);
 992         mpnt = inode->i_mmap;
 993         if (!mpnt) {
 994                 free_page(page);
 995                 return;
 996         }
 997         do {
 998                 unsigned long start = mpnt->vm_start;
 999                 unsigned long len = mpnt->vm_end - start;
1000                 unsigned long diff;
1001 
1002                 /* mapping wholly truncated? */
1003                 if (mpnt->vm_offset >= offset) {
1004                         zap_page_range(mpnt->vm_mm, start, len);
1005                         continue;
1006                 }
1007                 /* mapping wholly unaffected? */
1008                 diff = offset - mpnt->vm_offset;
1009                 if (diff >= len)
1010                         continue;
1011                 /* Ok, partially affected.. */
1012                 start += diff;
1013                 len = (len - diff) & PAGE_MASK;
1014                 /* Ugh, here comes the _really_ ugly part.. */
1015                 if (start & ~PAGE_MASK) {
1016                         if (unshare(mpnt, start, page))
1017                                 page = 0;
1018                         start = (start + ~PAGE_MASK) & PAGE_MASK;
1019                 }
1020                 zap_page_range(mpnt->vm_mm, start, len);
1021         } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
1022         free_page(page);
1023 }
1024 
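/*
 * Editorial sketch, not part of the original source: a hypothetical truncate
 * path shrinks the inode and then lets the VM zap (or unshare) pages of any
 * existing mappings that now lie beyond the new end of the file.
 */
static void sample_truncate(struct inode * inode, unsigned long length)
{
        inode->i_size = length;
        vmtruncate(inode, length);
        if (inode->i_op && inode->i_op->truncate)
                inode->i_op->truncate(inode);
}
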
1025 /*
1026  * fill in an empty page-table if none exists.
1027  */
1028 static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
1029 {
1030         pgd_t *pgd;
1031         pmd_t *pmd;
1032         pte_t *pte;
1033 
1034         pgd = pgd_offset(tsk->mm, address);
1035         pmd = pmd_alloc(pgd, address);
1036         if (!pmd) {
1037                 oom(tsk);
1038                 return NULL;
1039         }
1040         pte = pte_alloc(pmd, address);
1041         if (!pte) {
1042                 oom(tsk);
1043                 return NULL;
1044         }
1045         return pte;
1046 }
1047 
1048 static inline void do_swap_page(struct task_struct * tsk, 
1049         struct vm_area_struct * vma, unsigned long address,
1050         pte_t * page_table, pte_t entry, int write_access)
1051 {
1052         pte_t page;
1053 
1054         if (!vma->vm_ops || !vma->vm_ops->swapin) {
1055                 swap_in(tsk, vma, page_table, pte_val(entry), write_access);
1056                 return;
1057         }
1058         page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
1059         if (pte_val(*page_table) != pte_val(entry)) {
1060                 free_page(pte_page(page));
1061                 return;
1062         }
1063         if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
1064                 page = pte_wrprotect(page);
1065         ++vma->vm_mm->rss;
1066         ++tsk->maj_flt;
1067         set_pte(page_table, page);
1068         return;
1069 }
1070 
1071 /*
1072  * do_no_page() tries to create a new page mapping. It aggressively
1073  * tries to share with existing pages, but makes a separate copy if
1074  * the "write_access" parameter is true in order to avoid the next
1075  * page fault.
1076  */
1077 void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
1078         unsigned long address, int write_access)
1079 {
1080         pte_t * page_table;
1081         pte_t entry;
1082         unsigned long page;
1083 
1084         page_table = get_empty_pgtable(tsk, address);
1085         if (!page_table)
1086                 return;
1087         entry = *page_table;
1088         if (pte_present(entry))
1089                 return;
1090         if (!pte_none(entry)) {
1091                 do_swap_page(tsk, vma, address, page_table, entry, write_access);
1092                 return;
1093         }
1094         address &= PAGE_MASK;
1095         if (!vma->vm_ops || !vma->vm_ops->nopage) {
1096                 ++vma->vm_mm->rss;
1097                 ++tsk->min_flt;
1098                 get_empty_page(tsk, vma, page_table);
1099                 return;
1100         }
1101         page = __get_free_page(GFP_KERNEL);
1102         if (share_page(vma, address, write_access, page)) {
1103                 ++vma->vm_mm->rss;
1104                 ++tsk->min_flt;
1105                 return;
1106         }
1107         if (!page) {
1108                 oom(tsk);
1109                 put_page(page_table, BAD_PAGE);
1110                 return;
1111         }
1112         ++tsk->maj_flt;
1113         ++vma->vm_mm->rss;
1114         /*
1115          * The fourth argument is "no_share", which tells the low-level code
1116          * to copy, not share the page even if sharing is possible.  It's
1117          * essentially an early COW detection 
1118          */
1119         page = vma->vm_ops->nopage(vma, address, page,
1120                 write_access && !(vma->vm_flags & VM_SHARED));
1121         if (share_page(vma, address, write_access, 0)) {
1122                 free_page(page);
1123                 return;
1124         }
1125         /*
1126          * This silly early PAGE_DIRTY setting removes a race
1127          * due to the bad i386 page protection. But it's valid
1128          * for other architectures too.
1129          *
1130          * Note that if write_access is true, we either now have
1131  * an exclusive copy of the page, or this is a shared mapping,
1132          * so we can make it writable and dirty to avoid having to
1133          * handle that later.
1134          */
1135         entry = mk_pte(page, vma->vm_page_prot);
1136         if (write_access) {
1137                 entry = pte_mkwrite(pte_mkdirty(entry));
1138         } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
1139                 entry = pte_wrprotect(entry);
1140         put_page(page_table, entry);
1141 }
1142 
1143 /*
1144  * The above separate functions for the no-page and wp-page
1145  * cases will go away (they mostly do the same thing anyway),
1146  * and we'll instead use only a general "handle_mm_fault()".
1147  *
1148  * These routines also need to handle stuff like marking pages dirty
1149  * and/or accessed for architectures that don't do it in hardware (most
1150  * RISC architectures).  The early dirtying is also good on the i386.
1151  *
1152  * There is also a hook called "update_mmu_cache()" that architectures
1153  * with external mmu caches can use to update those (ie the Sparc or
1154  * PowerPC hashed page tables that act as extended TLBs).
1155  */
1156 static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long address,
1157         int write_access, pte_t * pte)
1158 {
1159         if (!pte_present(*pte)) {
1160                 do_no_page(current, vma, address, write_access);
1161                 return;
1162         }
1163         set_pte(pte, pte_mkyoung(*pte));
1164         if (!write_access)
1165                 return;
1166         if (pte_write(*pte)) {
1167                 set_pte(pte, pte_mkdirty(*pte));
1168                 return;
1169         }
1170         do_wp_page(current, vma, address, write_access);
1171 }
1172 
1173 void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
1174         int write_access)
1175 {
1176         pgd_t *pgd;
1177         pmd_t *pmd;
1178         pte_t *pte;
1179 
1180         pgd = pgd_offset(vma->vm_mm, address);
1181         pmd = pmd_alloc(pgd, address);
1182         if (!pmd)
1183                 goto no_memory;
1184         pte = pte_alloc(pmd, address);
1185         if (!pte)
1186                 goto no_memory;
1187         handle_pte_fault(vma, address, write_access, pte);
1188         update_mmu_cache(vma, address, *pte);
1189         return;
1190 no_memory:
1191         oom(current);
1192 }

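/*
 * Editorial sketch, not part of the original source: the architecture's
 * page-fault handler, roughly in the manner of do_page_fault() in
 * arch/i386/mm/fault.c, resolves the faulting vma and checks access rights,
 * then leaves the rest to the generic code.  Stack growth and protection
 * checks are elided; "address" and "write" come from the trap frame.
 */
static void sample_page_fault(unsigned long address, int write)
{
        struct vm_area_struct * vma;

        vma = find_vma(current, address);
        if (!vma || vma->vm_start > address)
                return;
        handle_mm_fault(vma, address, write);
}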