root/mm/filemap.c


DEFINITIONS

This source file includes the following definitions:
  1. invalidate_inode_pages
  2. truncate_inode_pages
  3. shrink_mmap
  4. page_unuse
  5. update_vm_cache
  6. add_to_page_cache
  7. try_to_read_ahead
  8. __wait_on_page
  9. generic_file_readahead
  10. generic_file_read
  11. fill_page
  12. filemap_nopage
  13. do_write_page
  14. filemap_write_page
  15. filemap_swapout
  16. filemap_swapin
  17. filemap_sync_pte
  18. filemap_sync_pte_range
  19. filemap_sync_pmd_range
  20. filemap_sync
  21. filemap_unmap
  22. generic_file_mmap
  23. msync_interval
  24. sys_msync

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994, 1995  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem does this differently, for example)
  11  */
  12 #include <linux/stat.h>
  13 #include <linux/sched.h>
  14 #include <linux/kernel.h>
  15 #include <linux/mm.h>
  16 #include <linux/shm.h>
  17 #include <linux/errno.h>
  18 #include <linux/mman.h>
  19 #include <linux/string.h>
  20 #include <linux/malloc.h>
  21 #include <linux/fs.h>
  22 #include <linux/locks.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/swap.h>
  25 
  26 #include <asm/segment.h>
  27 #include <asm/system.h>
  28 #include <asm/pgtable.h>
  29 
  30 #if 0
  31 #define DEBUG_ASYNC_AHEAD
  32 #endif
  33 
  34 /*
  35  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  36  * though.
  37  *
  38  * Shared mappings now work. 15.8.1995  Bruno.
  39  */
  40 
  41 unsigned long page_cache_size = 0;
  42 struct page * page_hash_table[PAGE_HASH_SIZE];
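/*
 * page_cache_size counts the pages currently in the page cache, and
 * page_hash_table hashes them by (inode, offset) so that find_page()
 * can locate a cached page without walking the whole inode page list.
 */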
  43 
  44 /*
  45  * Simple routines for both non-shared and shared mappings.
  46  */
  47 
  48 /*
  49  * Invalidate the pages of an inode, removing all pages that aren't
  50  * locked down (those are sure to be up-to-date anyway, so we shouldn't
  51  * invalidate them).
  52  */
  53 void invalidate_inode_pages(struct inode * inode)
  54 {
  55         struct page ** p;
  56         struct page * page;
  57 
  58         p = &inode->i_pages;
  59         while ((page = *p) != NULL) {
  60                 if (PageLocked(page)) {
  61                         p = &page->next;
  62                         continue;
  63                 }
  64                 inode->i_nrpages--;
  65                 if ((*p = page->next) != NULL)
  66                         (*p)->prev = page->prev;
  67                 page->dirty = 0;
  68                 page->next = NULL;
  69                 page->prev = NULL;
  70                 remove_page_from_hash_queue(page);
  71                 page->inode = NULL;
  72                 free_page(page_address(page));
  73                 continue;
  74         }
  75 }
  76 
  77 /*
  78  * Truncate the page cache at a set offset, removing the pages
  79  * that are beyond that offset (and zeroing out partial pages).
  80  */
  81 void truncate_inode_pages(struct inode * inode, unsigned long start)
  82 {
  83         struct page ** p;
  84         struct page * page;
  85 
  86 repeat:
  87         p = &inode->i_pages;
  88         while ((page = *p) != NULL) {
  89                 unsigned long offset = page->offset;
  90 
  91                 /* page wholly truncated - free it */
  92                 if (offset >= start) {
  93                         if (PageLocked(page)) {
  94                                 wait_on_page(page);
  95                                 goto repeat;
  96                         }
  97                         inode->i_nrpages--;
  98                         if ((*p = page->next) != NULL)
  99                                 (*p)->prev = page->prev;
 100                         page->dirty = 0;
 101                         page->next = NULL;
 102                         page->prev = NULL;
 103                         remove_page_from_hash_queue(page);
 104                         page->inode = NULL;
 105                         free_page(page_address(page));
 106                         continue;
 107                 }
 108                 p = &page->next;
 109                 offset = start - offset;
 110                 /* partial truncate, clear end of page */
 111                 if (offset < PAGE_SIZE)
 112                         memset((void *) (offset + page_address(page)), 0, PAGE_SIZE - offset);
 113         }
 114 }
 115 
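/*
 * shrink_mmap() is the page-cache / buffer-cache reclaim scan.  It walks
 * mem_map with a static "clock" hand, examining roughly
 * (4 * number-of-page-frames) >> priority frames per call, and frees at
 * most one unreferenced, single-user page (or its buffers).  It returns 1
 * if something was freed, 0 otherwise.
 */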
 116 int shrink_mmap(int priority, int dma)
 117 {
 118         static int clock = 0;
 119         struct page * page;
 120         unsigned long limit = MAP_NR(high_memory);
 121         struct buffer_head *tmp, *bh;
 122 
 123         priority = (limit<<2) >> priority;
 124         page = mem_map + clock;
 125         do {
 126                 priority--;
 127                 if (PageLocked(page))
 128                         goto next;
 129                 if (dma && !PageDMA(page))
 130                         goto next;
 131                 /* First of all, regenerate the page's referenced bit
 132                    from any buffers in the page */
 133                 bh = page->buffers;
 134                 if (bh) {
 135                         tmp = bh;
 136                         do {
 137                                 if (buffer_touched(tmp)) {
 138                                         clear_bit(BH_Touched, &tmp->b_state);
 139                                         set_bit(PG_referenced, &page->flags);
 140                                 }
 141                                 tmp = tmp->b_this_page;
 142                         } while (tmp != bh);
 143                 }
 144 
 145                 /* We can't throw away shared pages, but we do mark
 146                    them as referenced.  This relies on the fact that
 147                    no page is currently in both the page cache and the
 148                    buffer cache; we'd have to modify the following
 149                    test to allow for that case. */
 150 
 151                 switch (page->count) {
 152                         case 1:
 153                                 /* If it has been referenced recently, don't free it */
 154                                 if (clear_bit(PG_referenced, &page->flags))
 155                                         break;
 156 
 157                                 /* is it a page cache page? */
 158                                 if (page->inode) {
 159                                         remove_page_from_hash_queue(page);
 160                                         remove_page_from_inode_queue(page);
 161                                         free_page(page_address(page));
 162                                         return 1;
 163                                 }
 164 
 165                                 /* is it a buffer cache page? */
 166                                 if (bh && try_to_free_buffer(bh, &bh, 6))
 167                                         return 1;
 168                                 break;
 169 
 170                         default:
 171                                 /* more than one user: we can't throw it away */
 172                                 set_bit(PG_referenced, &page->flags);
 173                                 /* fall through */
 174                         case 0:
 175                                 /* nothing */
 176                 }
 177 next:
 178                 page++;
 179                 clock++;
 180                 if (clock >= limit) {
 181                         clock = 0;
 182                         page = mem_map;
 183                 }
 184         } while (priority > 0);
 185         return 0;
 186 }
 187 
 188 /*
 189  * This is called from try_to_swap_out() when we try to get rid of some
 190  * pages..  If we're unmapping the last occurrence of this page, we also
 191  * free it from the page hash-queues etc, as we don't want to keep it
 192  * in-core unnecessarily.
 193  */
 194 unsigned long page_unuse(unsigned long page)
 195 {
 196         struct page * p = mem_map + MAP_NR(page);
 197         int count = p->count;
 198 
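        /*
         * A count of 2 means the only remaining users are the page cache
         * itself and the single mapping that is being torn down, so the
         * page can be dropped from the cache as well.
         */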
 199         if (count != 2)
 200                 return count;
 201         if (!p->inode)
 202                 return count;
 203         remove_page_from_hash_queue(p);
 204         remove_page_from_inode_queue(p);
 205         free_page(page);
 206         return 1;
 207 }
 208 
 209 /*
 210  * Update a page cache copy, when we're doing a "write()" system call
 211  * See also "update_vm_cache()".
 212  */
 213 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 214 {
 215         unsigned long offset, len;
 216 
 217         offset = (pos & ~PAGE_MASK);
 218         pos = pos & PAGE_MASK;
 219         len = PAGE_SIZE - offset;
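        /*
         * Copy into each cached page that overlaps the write.  find_page()
         * takes an extra reference on the page it returns; the free_page()
         * below drops that reference again once the copy is done.
         */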
 220         do {
 221                 struct page * page;
 222 
 223                 if (len > count)
 224                         len = count;
 225                 page = find_page(inode, pos);
 226                 if (page) {
 227                         unsigned long addr;
 228 
 229                         wait_on_page(page);
 230                         addr = page_address(page);
 231                         memcpy((void *) (offset + addr), buf, len);
 232                         free_page(addr);
 233                 }
 234                 count -= len;
 235                 buf += len;
 236                 len = PAGE_SIZE;
 237                 offset = 0;
 238                 pos += PAGE_SIZE;
 239         } while (count);
 240 }
 241 
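/*
 * Insert a page into the page cache for the given inode and offset:
 * take a reference, clear the uptodate/error bits (the caller still has
 * to fill the page), and link it onto the inode and hash queues.
 */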
 242 static inline void add_to_page_cache(struct page * page,
 243         struct inode * inode, unsigned long offset)
 244 {
 245         page->count++;
 246         page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 247         page->offset = offset;
 248         add_page_to_inode_queue(inode, page);
 249         add_page_to_hash_queue(inode, page);
 250 }
 251 
 252 /*
 253  * Try to read ahead in the file. "page_cache" is a potentially free page
 254  * that we could use for the cache (if it is 0 we can try to create one,
 255  * this is all overlapped with the IO on the previous page finishing anyway)
 256  */
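/*
 * Returns the spare page if it is still unused (so the caller can recycle
 * it), or 0 if it was consumed here as the new read-ahead page.
 */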
 257 static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
 258 {
 259         struct page * page;
 260 
 261         offset &= PAGE_MASK;
 262         if (!page_cache) {
 263                 page_cache = __get_free_page(GFP_KERNEL);
 264                 if (!page_cache)
 265                         return 0;
 266         }
 267         if (offset >= inode->i_size)
 268                 return page_cache;
 269 #if 1
 270         page = find_page(inode, offset);
 271         if (page) {
 272                 page->count--;
 273                 return page_cache;
 274         }
 275         /*
 276          * Ok, add the new page to the hash-queues...
 277          */
 278         page = mem_map + MAP_NR(page_cache);
 279         add_to_page_cache(page, inode, offset);
 280         inode->i_op->readpage(inode, page);
 281         free_page(page_cache);
 282         return 0;
 283 #else
 284         return page_cache;
 285 #endif
 286 }
 287 
 288 /* 
 289  * Wait for IO to complete on a locked page.
 290  */
 291 void __wait_on_page(struct page *page)
 292 {
 293         struct wait_queue wait = { current, NULL };
 294 
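        /*
         * Bump the page count so the page cannot be freed while we sleep,
         * and run the tq_disk task queue to unplug the block device so that
         * any queued IO for this page actually gets submitted.
         */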
 295         page->count++;
 296         add_wait_queue(&page->wait, &wait);
 297 repeat:
 298         run_task_queue(&tq_disk);
 299         current->state = TASK_UNINTERRUPTIBLE;
 300         if (PageLocked(page)) {
 301                 schedule();
 302                 goto repeat;
 303         }
 304         remove_wait_queue(&page->wait, &wait);
 305         page->count--;
 306         current->state = TASK_RUNNING;
 307 }
 308 
 309 
 310 /*
 311  * This is a generic file read routine, and uses the
 312  * inode->i_op->readpage() function for the actual low-level
 313  * stuff.
 314  *
 315  * This is really ugly. But the goto's actually try to clarify some
 316  * of the logic when it comes to error handling etc.
 317  */
 318 #define MAX_READAHEAD (PAGE_SIZE*8)
 319 #define MIN_READAHEAD (PAGE_SIZE)
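/*
 * With 4 kB pages (e.g. on i386) this allows between 4 kB and 32 kB of
 * read-ahead per open file.
 */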
 320 
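/*
 * Do synchronous or asynchronous read-ahead around the page at "pos".
 * "page" is the page currently being read and "page_cache" is a spare free
 * page (or 0); the possibly still-unused spare page is returned so the
 * caller can recycle it.
 */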
 321 static inline unsigned long generic_file_readahead(struct file * filp, struct inode * inode,
 322         int try_async, unsigned long pos, struct page * page,
 323         unsigned long page_cache)
 324 {
 325         unsigned long max_ahead, ahead;
 326         unsigned long rapos, ppos;
 327 
 328         ppos = pos & PAGE_MASK;
 329         rapos = filp->f_rapos & PAGE_MASK;
 330         max_ahead = 0;
 331 /*
 332  * If the current page is locked, try some synchronous read-ahead in order
 333  * to avoid too small IO requests.
 334  */
 335         if (PageLocked(page)) {
 336                 max_ahead = filp->f_ramax;
 337                 rapos = ppos;
 338                 filp->f_rawin = 0;
 339                 filp->f_ralen = PAGE_SIZE;
 340         }
 341 /*
 342  * The current page is not locked
 343  * It may be the moment to try asynchronous read-ahead.
 344  * If asynchronous read-ahead is the suggested tactic and the current position is
 345  * inside the previous read-ahead window, check the last read page:
 346  * - if locked, the previous IO request is probably not complete, and
 347  *   we will not try to do another IO request.
 348  * - if not locked, the previous IO request is probably complete, and
 349  *   this is a good moment to try a new asynchronous read-ahead request.
 350  * try_async = 2 means that we have to force an unplug of the device in
 351  * order to force a call to the strategy routine of the disk driver and
 352  * start IO asynchronously.
 353  */
 354         else if (try_async == 1 && rapos >= PAGE_SIZE &&
 355                  ppos <= rapos && ppos + filp->f_ralen >= rapos) {
 356                 struct page *a_page;
 357 /*
 358  * Add ONE page to max_ahead in order to try to get the same maximum IO size as
 359  * synchronous read-ahead: (MAX_READAHEAD + 1)*PAGE_SIZE.
 360  * Compute the position of the last page we have tried to read.
 361  */
 362                 max_ahead = filp->f_ramax + PAGE_SIZE;
 363                 rapos -= PAGE_SIZE;
 364 
 365                 if (rapos < inode->i_size) {
 366                         a_page = find_page(inode, rapos);
 367                         if (a_page) {
 368                                 if (PageLocked(a_page))
 369                                         max_ahead = 0;
 370                                 a_page->count--;
 371                         }
 372                 }
 373                 if (max_ahead) {
 374                         filp->f_rawin = filp->f_ralen;
 375                         filp->f_ralen = 0;
 376                         try_async = 2;
 377                 }
 378         }
 379 /*
 380  * Try to read pages.
 381  * We hope that ll_rw_blk() plugging/unplugging, coalescing and sorting will work
 382  * well enough to avoid overly bad actual IO requests.
 383  */
 384         ahead = 0;
 385         while (ahead < max_ahead) {
 386                 ahead += PAGE_SIZE;
 387                 page_cache = try_to_read_ahead(inode, rapos + ahead, page_cache);
 388         }
 389 /*
 390  * If we tried to read some pages:
 391  * compute the new read-ahead position
 392  * (it is the position of the next byte),
 393  * store the length of the current read-ahead window, and,
 394  * if necessary,
 395  *    try to force an unplug of the device in order to start an
 396  *    asynchronous read IO.
 397  */
 398         if (ahead) {
 399                 filp->f_ralen += ahead;
 400                 filp->f_rawin += filp->f_ralen;
 401                 filp->f_rapos = rapos + ahead + PAGE_SIZE;
 402                 if (try_async == 2) {
 403                         run_task_queue(&tq_disk);
 404                 }
 405         }
 406 /*
 407  * Wait on the page if necessary
 408  */
 409         if (PageLocked(page)) {
 410                 __wait_on_page(page);
 411         }
 412         return page_cache;
 413 }
 414 
 415 
 416 int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
 417 {
 418         int error, read;
 419         unsigned long pos, ppos, page_cache;
 420         int try_async;
 421 
 422 #ifdef DEBUG_ASYNC_AHEAD
 423 static long ccount = 0;
 424 if (count > 0) ccount += count;
 425 #endif
 426         
 427         if (count <= 0)
 428                 return 0;
 429 
 430         error = 0;
 431         read = 0;
 432         page_cache = 0;
 433 
 434         pos = filp->f_pos;
 435         ppos = pos & PAGE_MASK;
 436 /*
 437  * Check if the current position is inside the previous read-ahead window.
 438  * If that's true, I assume that the file accesses are sequential enough to
 439  * continue asynchronous read-ahead.
 440  * Do minimum read-ahead at the beginning of the file since some tools
 441  * only read the beginning of files.
 442  * Break read-ahead if the file position is outside the previous read-ahead
 443  * window or if the read-ahead position is 0.
 444  */
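/*
 * In other words, the previous read-ahead window covers roughly the byte
 * range [f_rapos - f_rawin, f_rapos]; asynchronous read-ahead is continued
 * only while ppos stays inside that range.
 */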
 445 /*
 446  * Will not try asynchronous read-ahead.
 447  * Reset the read-ahead context to zero.
 448  */
 449         if (pos+count < MIN_READAHEAD || !filp->f_rapos ||
 450             ppos > filp->f_rapos || ppos + filp->f_rawin < filp->f_rapos) {
 451                 try_async = 0;
 452 #ifdef DEBUG_ASYNC_AHEAD
 453                 if (ccount > 10000000) {
 454                         ccount = 0;
 455                         printk("XXXXXXXX ppos=%ld rapos=%ld ralen=%ld ramax=%ld rawin=%ld\n",
 456                                 ppos, filp->f_rapos, filp->f_ralen, filp->f_ramax, filp->f_rawin);
 457                 }
 458 #endif
 459                 filp->f_rapos = 0;
 460                 filp->f_ralen = 0;
 461                 filp->f_ramax = 0;
 462                 filp->f_rawin = 0;
 463 /*
 464  * Will try asynchronous read-ahead.
 465  * Double the max read-ahead size each time.
 466  *   This heuristic avoids doing large IO for files that are not really
 467  *   accessed sequentially.
 468  */
 469         } else {
 470                 try_async = 1;
 471 #ifdef DEBUG_ASYNC_AHEAD
 472                 if (ccount > 10000000) {
 473                         ccount = 0;
 474                         printk("XXXXXXXX ppos=%ld rapos=%ld ralen=%ld ramax=%ld rawin=%ld\n",
 475                                 ppos, filp->f_rapos, filp->f_ralen, filp->f_ramax, filp->f_rawin);
 476                 }
 477 #endif
 478                 filp->f_ramax += filp->f_ramax;
 479         }
 480 /*
 481  * Compute a good value for the read-ahead max.
 482  * If the read operation stays within the first half page, force no read-ahead.
 483  * Otherwise, first try some value near count,
 484  *      doing at least MIN_READAHEAD and at most MAX_READAHEAD.
 485  * (Should be reworked a little.)
 486  */
 487         if (pos + count <= (PAGE_SIZE >> 1)) {
 488                 try_async = 0;
 489                 filp->f_ramax = 0;
 490         } else {
 491                 if (filp->f_ramax < count)
 492                         filp->f_ramax = count & PAGE_MASK;
 493 
 494                 if (filp->f_ramax < MIN_READAHEAD)
 495                         filp->f_ramax = MIN_READAHEAD;
 496                 else if (filp->f_ramax > MAX_READAHEAD)
 497                         filp->f_ramax = MAX_READAHEAD;
 498         }
 499 
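        /*
         * Main copy loop: for each page covering [pos, pos+count) we look
         * the page up in the cache, allocate and insert a fresh page if it
         * is missing (re-checking after the allocation, which may sleep),
         * start readpage() and wait if it is not up to date, and then copy
         * the data out to user space with memcpy_tofs().
         */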
 500         for (;;) {
 501                 struct page *page;
 502                 unsigned long offset, addr, nr;
 503 
 504                 if (pos >= inode->i_size)
 505                         break;
 506                 offset = pos & ~PAGE_MASK;
 507                 nr = PAGE_SIZE - offset;
 508                 /*
 509                  * Try to find the data in the page cache..
 510                  */
 511                 page = find_page(inode, pos & PAGE_MASK);
 512                 if (page)
 513                         goto found_page;
 514 
 515                 /*
 516                  * Ok, it wasn't cached, so we need to create a new
 517                  * page..
 518                  */
 519                 if (page_cache)
 520                         goto new_page;
 521 
 522                 error = -ENOMEM;
 523                 page_cache = __get_free_page(GFP_KERNEL);
 524                 if (!page_cache)
 525                         break;
 526                 error = 0;
 527 
 528                 /*
 529                  * That could have slept, so we need to check again..
 530                  */
 531                 if (pos >= inode->i_size)
 532                         break;
 533                 page = find_page(inode, pos & PAGE_MASK);
 534                 if (!page)
 535                         goto new_page;
 536 
 537 found_page:
 538                 addr = page_address(page);
 539                 if (nr > count)
 540                         nr = count;
 541 /*
 542  * Do not try to read ahead unless the current page is already filled or being filled.
 543  * If our goal was to try asynchronous read-ahead, we were quite wrong.
 544  * Set the max read-ahead to a shorter value in order to partially correct
 545  * this mistake.
 546  */
 547                 if (PageUptodate(page) || PageLocked(page))
 548                         page_cache = generic_file_readahead(filp, inode, try_async, pos, page, page_cache);
 549                 else if (try_async) {
 550                         filp->f_ramax = (filp->f_ramax / 2) & PAGE_MASK;
 551                         if (filp->f_ramax < MIN_READAHEAD)
 552                                 filp->f_ramax = MIN_READAHEAD;
 553                 }
 554 
 555                 if (!PageUptodate(page))
 556                         goto read_page;
 557                 if (nr > inode->i_size - pos)
 558                         nr = inode->i_size - pos;
 559                 memcpy_tofs(buf, (void *) (addr + offset), nr);
 560                 free_page(addr);
 561                 buf += nr;
 562                 pos += nr;
 563                 read += nr;
 564                 count -= nr;
 565                 if (count)
 566                         continue;
 567                 break;
 568         
 569 
 570 new_page:
 571                 /*
 572                  * Ok, add the new page to the hash-queues...
 573                  */
 574                 addr = page_cache;
 575                 page = mem_map + MAP_NR(page_cache);
 576                 page_cache = 0;
 577                 add_to_page_cache(page, inode, pos & PAGE_MASK);
 578 
 579                 /*
 580                  * Error handling is tricky. If we get a read error,
 581                  * the cached page stays in the cache (but uptodate=0),
 582                  * and the next process that accesses it will try to
 583                  * re-read it. This is needed for NFS etc, where the
 584                  * identity of the reader can decide if we can read the
 585                  * page or not..
 586                  */
 587 read_page:
 588                 error = inode->i_op->readpage(inode, page);
 589                 if (!error)
 590                         goto found_page;
 591                 free_page(addr);
 592                 break;
 593         }
 594 
 595         filp->f_pos = pos;
 596         filp->f_reada = 1;
 597         if (page_cache)
 598                 free_page(page_cache);
 599         if (!IS_RDONLY(inode)) {
 600                 inode->i_atime = CURRENT_TIME;
 601                 inode->i_dirt = 1;
 602         }
 603         if (!read)
 604                 read = error;
 605         return read;
 606 }
 607 
 608 /*
 609  * Find a cached page and wait for it to become up-to-date, return
 610  * the page address.  Increments the page count.
 611  */
 612 static inline unsigned long fill_page(struct inode * inode, unsigned long offset)
 613 {
 614         struct page * page;
 615         unsigned long new_page;
 616 
 617         page = find_page(inode, offset);
 618         if (page)
 619                 goto found_page_dont_free;
 620         new_page = __get_free_page(GFP_KERNEL);
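        /*
         * __get_free_page() may have slept, so another process could have
         * read the page into the cache in the meantime; look it up again
         * before inserting the freshly allocated page.
         */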
 621         page = find_page(inode, offset);
 622         if (page)
 623                 goto found_page;
 624         if (!new_page)
 625                 return 0;
 626         page = mem_map + MAP_NR(new_page);
 627         new_page = 0;
 628         add_to_page_cache(page, inode, offset);
 629         inode->i_op->readpage(inode, page);
 630         if (PageLocked(page))
 631                 new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
 632 found_page:
 633         if (new_page)
 634                 free_page(new_page);
 635 found_page_dont_free:
 636         wait_on_page(page);
 637         return page_address(page);
 638 }
 639 
 640 /*
 641  * Semantics for shared and private memory areas are different past the end
 642  * of the file. A shared mapping past the last page of the file is an error
 643  * and results in a SIGBUS, while a private mapping just maps in a zero page.
 644  */
 645 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 646 {
 647         unsigned long offset;
 648         struct inode * inode = area->vm_inode;
 649         unsigned long page;
 650 
 651         offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 652         if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 653                 return 0;
 654 
 655         page = fill_page(inode, offset);
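        /*
         * For a private (no_share) fault, hand back a private copy of the
         * cached page so that later writes never touch the shared page-cache
         * page; free_page() drops the reference that fill_page() took.
         */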
 656         if (page && no_share) {
 657                 unsigned long new_page = __get_free_page(GFP_KERNEL);
 658                 if (new_page)
 659                         memcpy((void *) new_page, (void *) page, PAGE_SIZE);
 660                 free_page(page);
 661                 return new_page;
 662         }
 663         return page;
 664 }
 665 
 666 /*
 667  * Tries to write a shared mapped page to its backing store. May return -EIO
 668  * if the disk is full.
 669  */
 670 static inline int do_write_page(struct inode * inode, struct file * file,
 671         const char * page, unsigned long offset)
 672 {
 673         int old_fs, retval;
 674         unsigned long size;
 675 
 676         size = offset + PAGE_SIZE;
 677         /* refuse to extend file size.. */
 678         if (S_ISREG(inode->i_mode)) {
 679                 if (size > inode->i_size)
 680                         size = inode->i_size;
 681                 /* Ho humm.. We should have tested for this earlier */
 682                 if (size < offset)
 683                         return -EIO;
 684         }
 685         size -= offset;
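        /*
         * The file's write() operation expects a user-space buffer, so
         * temporarily widen the address limit to KERNEL_DS in order to pass
         * it a kernel-space page.
         */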
 686         old_fs = get_fs();
 687         set_fs(KERNEL_DS);
 688         retval = -EIO;
 689         if (size == file->f_op->write(inode, file, (const char *) page, size))
 690                 retval = 0;
 691         set_fs(old_fs);
 692         return retval;
 693 }
 694 
 695 static int filemap_write_page(struct vm_area_struct * vma,
 696         unsigned long offset,
 697         unsigned long page)
 698 {
 699         int result;
 700         struct file file;
 701         struct inode * inode;
 702         struct buffer_head * bh;
 703 
 704         bh = mem_map[MAP_NR(page)].buffers;
 705         if (bh) {
 706                 /* whee.. just mark the buffer heads dirty */
 707                 struct buffer_head * tmp = bh;
 708                 do {
 709                         mark_buffer_dirty(tmp, 0);
 710                         tmp = tmp->b_this_page;
 711                 } while (tmp != bh);
 712                 return 0;
 713         }
 714 
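        /*
         * No buffers are attached, so fall back to the filesystem's write()
         * operation through a throw-away struct file set up on the stack.
         */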
 715         inode = vma->vm_inode;
 716         file.f_op = inode->i_op->default_file_ops;
 717         if (!file.f_op->write)
 718                 return -EIO;
 719         file.f_mode = 3;
 720         file.f_flags = 0;
 721         file.f_count = 1;
 722         file.f_inode = inode;
 723         file.f_pos = offset;
 724         file.f_reada = 0;
 725 
 726         down(&inode->i_sem);
 727         result = do_write_page(inode, &file, (const char *) page, offset);
 728         up(&inode->i_sem);
 729         return result;
 730 }
 731 
 732 
 733 /*
 734  * Swapping to a shared file: while we're busy writing out the page
 735  * (and the page still exists in memory), we save the page information
 736  * in the page table, so that "filemap_swapin()" can re-use the page
 737  * immediately if it is called while we're busy swapping it out..
 738  *
 739  * Once we've written it all out, we mark the page entry "empty", which
 740  * will result in a normal page-in (instead of a swap-in) from the now
 741  * up-to-date disk file.
 742  */
 743 int filemap_swapout(struct vm_area_struct * vma,
 744         unsigned long offset,
 745         pte_t *page_table)
 746 {
 747         int error;
 748         unsigned long page = pte_page(*page_table);
 749         unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
 750 
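        /*
         * The fake swap entry encodes the physical page number, so that
         * filemap_swapin() can map the very same page straight back in if
         * it is touched before the write-out has completed.
         */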
 751         flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 752         set_pte(page_table, __pte(entry));
 753         flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 754         error = filemap_write_page(vma, offset, page);
 755         if (pte_val(*page_table) == entry)
 756                 pte_clear(page_table);
 757         return error;
 758 }
 759 
 760 /*
 761  * filemap_swapin() is called only if we have something in the page
 762  * tables that is non-zero (but not present), which we know to be the
 763  * page index of a page that is busy being swapped out (see above).
 764  * So we just use it directly..
 765  */
 766 static pte_t filemap_swapin(struct vm_area_struct * vma,
 767         unsigned long offset,
 768         unsigned long entry)
 769 {
 770         unsigned long page = SWP_OFFSET(entry);
 771 
 772         mem_map[page].count++;
 773         page = (page << PAGE_SHIFT) + PAGE_OFFSET;
 774         return mk_pte(page,vma->vm_page_prot);
 775 }
 776 
 777 
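/*
 * Write back a single pte of a shared file mapping.  Without MS_INVALIDATE,
 * a present and dirty page is cleaned in the page table and written out;
 * with MS_INVALIDATE the pte is cleared as well, and clean pages (or
 * pending swap entries) are simply released.
 */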
 778 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 779         unsigned long address, unsigned int flags)
 780 {
 781         pte_t pte = *ptep;
 782         unsigned long page;
 783         int error;
 784 
 785         if (!(flags & MS_INVALIDATE)) {
 786                 if (!pte_present(pte))
 787                         return 0;
 788                 if (!pte_dirty(pte))
 789                         return 0;
 790                 flush_cache_page(vma, address);
 791                 set_pte(ptep, pte_mkclean(pte));
 792                 flush_tlb_page(vma, address);
 793                 page = pte_page(pte);
 794                 mem_map[MAP_NR(page)].count++;
 795         } else {
 796                 if (pte_none(pte))
 797                         return 0;
 798                 flush_cache_page(vma, address);
 799                 pte_clear(ptep);
 800                 flush_tlb_page(vma, address);
 801                 if (!pte_present(pte)) {
 802                         swap_free(pte_val(pte));
 803                         return 0;
 804                 }
 805                 page = pte_page(pte);
 806                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 807                         free_page(page);
 808                         return 0;
 809                 }
 810         }
 811         error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 812         free_page(page);
 813         return error;
 814 }
 815 
 816 static inline int filemap_sync_pte_range(pmd_t * pmd,
 817         unsigned long address, unsigned long size, 
 818         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
 819 {
 820         pte_t * pte;
 821         unsigned long end;
 822         int error;
 823 
 824         if (pmd_none(*pmd))
 825                 return 0;
 826         if (pmd_bad(*pmd)) {
 827                 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 828                 pmd_clear(pmd);
 829                 return 0;
 830         }
 831         pte = pte_offset(pmd, address);
 832         offset += address & PMD_MASK;
 833         address &= ~PMD_MASK;
 834         end = address + size;
 835         if (end > PMD_SIZE)
 836                 end = PMD_SIZE;
 837         error = 0;
 838         do {
 839                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
 840                 address += PAGE_SIZE;
 841                 pte++;
 842         } while (address < end);
 843         return error;
 844 }
 845 
 846 static inline int filemap_sync_pmd_range(pgd_t * pgd,
 847         unsigned long address, unsigned long size, 
 848         struct vm_area_struct *vma, unsigned int flags)
 849 {
 850         pmd_t * pmd;
 851         unsigned long offset, end;
 852         int error;
 853 
 854         if (pgd_none(*pgd))
 855                 return 0;
 856         if (pgd_bad(*pgd)) {
 857                 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
 858                 pgd_clear(pgd);
 859                 return 0;
 860         }
 861         pmd = pmd_offset(pgd, address);
 862         offset = address & PMD_MASK;
 863         address &= ~PMD_MASK;
 864         end = address + size;
 865         if (end > PGDIR_SIZE)
 866                 end = PGDIR_SIZE;
 867         error = 0;
 868         do {
 869                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
 870                 address = (address + PMD_SIZE) & PMD_MASK;
 871                 pmd++;
 872         } while (address < end);
 873         return error;
 874 }
 875 
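/*
 * Walk the page tables (pgd -> pmd -> pte) for the given range of a shared
 * file mapping, flushing the caches beforehand and the TLB afterwards, and
 * OR together the error codes from the individual page write-backs.
 */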
 876 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
 877         size_t size, unsigned int flags)
 878 {
 879         pgd_t * dir;
 880         unsigned long end = address + size;
 881         int error = 0;
 882 
 883         dir = pgd_offset(current->mm, address);
 884         flush_cache_range(vma->vm_mm, end - size, end);
 885         while (address < end) {
 886                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
 887                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 888                 dir++;
 889         }
 890         flush_tlb_range(vma->vm_mm, end - size, end);
 891         return error;
 892 }
 893 
 894 /*
 895  * This handles (potentially partial) area unmaps..
 896  */
 897 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
 898 {
 899         filemap_sync(vma, start, len, MS_ASYNC);
 900 }
 901 
 902 /*
 903  * Shared mappings need to be able to do the right thing at
 904  * close/unmap/sync. They will also use the private file as
 905  * backing-store for swapping..
 906  */
 907 static struct vm_operations_struct file_shared_mmap = {
 908         NULL,                   /* no special open */
 909         NULL,                   /* no special close */
 910         filemap_unmap,          /* unmap - we need to sync the pages */
 911         NULL,                   /* no special protect */
 912         filemap_sync,           /* sync */
 913         NULL,                   /* advise */
 914         filemap_nopage,         /* nopage */
 915         NULL,                   /* wppage */
 916         filemap_swapout,        /* swapout */
 917         filemap_swapin,         /* swapin */
 918 };
 919 
 920 /*
 921  * Private mappings just need to be able to load in the map.
 922  *
 923  * (This is actually used for shared mappings as well, if we
 924  * know they can't ever get write permissions..)
 925  */
 926 static struct vm_operations_struct file_private_mmap = {
 927         NULL,                   /* open */
 928         NULL,                   /* close */
 929         NULL,                   /* unmap */
 930         NULL,                   /* protect */
 931         NULL,                   /* sync */
 932         NULL,                   /* advise */
 933         filemap_nopage,         /* nopage */
 934         NULL,                   /* wppage */
 935         NULL,                   /* swapout */
 936         NULL,                   /* swapin */
 937 };
 938 
 939 /* This is used for a general mmap of a disk file */
 940 int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
 941 {
 942         struct vm_operations_struct * ops;
 943 
 944         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
 945                 ops = &file_shared_mmap;
 946                 /* share_page() can only guarantee proper page sharing if
 947                  * the offsets are all page aligned. */
 948                 if (vma->vm_offset & (PAGE_SIZE - 1))
 949                         return -EINVAL;
 950         } else {
 951                 ops = &file_private_mmap;
 952                 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
 953                         return -EINVAL;
 954         }
 955         if (!inode->i_sb || !S_ISREG(inode->i_mode))
 956                 return -EACCES;
 957         if (!inode->i_op || !inode->i_op->readpage)
 958                 return -ENOEXEC;
 959         if (!IS_RDONLY(inode)) {
 960                 inode->i_atime = CURRENT_TIME;
 961                 inode->i_dirt = 1;
 962         }
 963         vma->vm_inode = inode;
 964         inode->i_count++;
 965         vma->vm_ops = ops;
 966         return 0;
 967 }
 968 
 969 
 970 /*
 971  * The msync() system call.
 972  */
 973 
 974 static int msync_interval(struct vm_area_struct * vma,
 975         unsigned long start, unsigned long end, int flags)
 976 {
 977         if (!vma->vm_inode)
 978                 return 0;
 979         if (vma->vm_ops->sync) {
 980                 int error;
 981                 error = vma->vm_ops->sync(vma, start, end-start, flags);
 982                 if (error)
 983                         return error;
 984                 if (flags & MS_SYNC)
 985                         return file_fsync(vma->vm_inode, NULL);
 986                 return 0;
 987         }
 988         return 0;
 989 }
 990 
 991 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
 992 {
 993         unsigned long end;
 994         struct vm_area_struct * vma;
 995         int unmapped_error, error;
 996 
 997         if (start & ~PAGE_MASK)
 998                 return -EINVAL;
 999         len = (len + ~PAGE_MASK) & PAGE_MASK;
1000         end = start + len;
1001         if (end < start)
1002                 return -EINVAL;
1003         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1004                 return -EINVAL;
1005         if (end == start)
1006                 return 0;
1007         /*
1008          * If the interval [start,end) covers some unmapped address ranges,
1009          * just ignore them, but return -EFAULT at the end.
1010          */
1011         vma = find_vma(current, start);
1012         unmapped_error = 0;
1013         for (;;) {
1014                 /* Still start < end. */
1015                 if (!vma)
1016                         return -EFAULT;
1017                 /* Here start < vma->vm_end. */
1018                 if (start < vma->vm_start) {
1019                         unmapped_error = -EFAULT;
1020                         start = vma->vm_start;
1021                 }
1022                 /* Here vma->vm_start <= start < vma->vm_end. */
1023                 if (end <= vma->vm_end) {
1024                         if (start < end) {
1025                                 error = msync_interval(vma, start, end, flags);
1026                                 if (error)
1027                                         return error;
1028                         }
1029                         return unmapped_error;
1030                 }
1031                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1032                 error = msync_interval(vma, start, vma->vm_end, flags);
1033                 if (error)
1034                         return error;
1035                 start = vma->vm_end;
1036                 vma = vma->vm_next;
1037         }
1038 }
