root/mm/filemap.c


DEFINITIONS

This source file includes the following definitions:
  1. invalidate_inode_pages
  2. truncate_inode_pages
  3. shrink_mmap
  4. page_unuse
  5. update_vm_cache
  6. add_to_page_cache
  7. try_to_read_ahead
  8. __wait_on_page
  9. profile_readahead
  10. generic_file_readahead
  11. generic_file_read
  12. fill_page
  13. filemap_nopage
  14. do_write_page
  15. filemap_write_page
  16. filemap_swapout
  17. filemap_swapin
  18. filemap_sync_pte
  19. filemap_sync_pte_range
  20. filemap_sync_pmd_range
  21. filemap_sync
  22. filemap_unmap
  23. generic_file_mmap
  24. msync_interval
  25. sys_msync

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994, 1995  Linus Torvalds
   5  */
   6 
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem does this differently, for example)
  11  */
  12 #include <linux/stat.h>
  13 #include <linux/sched.h>
  14 #include <linux/kernel.h>
  15 #include <linux/mm.h>
  16 #include <linux/shm.h>
  17 #include <linux/errno.h>
  18 #include <linux/mman.h>
  19 #include <linux/string.h>
  20 #include <linux/malloc.h>
  21 #include <linux/fs.h>
  22 #include <linux/locks.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/swap.h>
  25 
  26 #include <asm/segment.h>
  27 #include <asm/system.h>
  28 #include <asm/pgtable.h>
  29 
  30 /*
  31  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  32  * though.
  33  *
  34  * Shared mappings now work. 15.8.1995  Bruno.
  35  */
  36 
  37 unsigned long page_cache_size = 0;
  38 struct page * page_hash_table[PAGE_HASH_SIZE];
  39 
  40 /*
  41  * Simple routines for both non-shared and shared mappings.
  42  */
  43 
  44 /*
  45  * Invalidate the pages of an inode, removing all pages that aren't
  46  * locked down (those are sure to be up-to-date anyway, so we shouldn't
  47  * invalidate them).
  48  */
  49 void invalidate_inode_pages(struct inode * inode)
  50 {
  51         struct page ** p;
  52         struct page * page;
  53 
  54         p = &inode->i_pages;
  55         while ((page = *p) != NULL) {
  56                 if (PageLocked(page)) {
  57                         p = &page->next;
  58                         continue;
  59                 }
  60                 inode->i_nrpages--;
  61                 if ((*p = page->next) != NULL)
  62                         (*p)->prev = page->prev;
  63                 page->dirty = 0;
  64                 page->next = NULL;
  65                 page->prev = NULL;
  66                 remove_page_from_hash_queue(page);
  67                 page->inode = NULL;
  68                 free_page(page_address(page));
  69                 continue;
  70         }
  71 }
  72 
  73 /*
  74  * Truncate the page cache at a set offset, removing the pages
  75  * that are beyond that offset (and zeroing out partial pages).
  76  */
  77 void truncate_inode_pages(struct inode * inode, unsigned long start)
  78 {
  79         struct page ** p;
  80         struct page * page;
  81 
  82 repeat:
  83         p = &inode->i_pages;
  84         while ((page = *p) != NULL) {
  85                 unsigned long offset = page->offset;
  86 
  87                 /* page wholly truncated - free it */
  88                 if (offset >= start) {
  89                         if (PageLocked(page)) {
  90                                 wait_on_page(page);
  91                                 goto repeat;
  92                         }
  93                         inode->i_nrpages--;
  94                         if ((*p = page->next) != NULL)
  95                                 (*p)->prev = page->prev;
  96                         page->dirty = 0;
  97                         page->next = NULL;
  98                         page->prev = NULL;
  99                         remove_page_from_hash_queue(page);
 100                         page->inode = NULL;
 101                         free_page(page_address(page));
 102                         continue;
 103                 }
 104                 p = &page->next;
 105                 offset = start - offset;
 106                 /* partial truncate, clear end of page */
 107                 if (offset < PAGE_SIZE) {
 108                         memset((void *) (offset + page_address(page)), 0, PAGE_SIZE - offset);
 109                         flush_page_to_ram(page_address(page));
 110                 }
 111         }
 112 }
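/*
 * Worked example (added for illustration, not part of the original source),
 * assuming PAGE_SIZE == 4096: truncating an inode to start = 10000 frees
 * every cached page at offset 12288 or beyond, leaves the page at offset
 * 4096 untouched (10000 - 4096 >= PAGE_SIZE), and for the page at offset
 * 8192 zeroes bytes 10000 - 8192 = 1808 .. 4095 while keeping bytes
 * 0 .. 1807 intact.
 */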
 113 
 114 int shrink_mmap(int priority, int dma)
 115 {
 116         static int clock = 0;
 117         struct page * page;
 118         unsigned long limit = MAP_NR(high_memory);
 119         struct buffer_head *tmp, *bh;
 120 
 121         priority = (limit<<2) >> priority;
 122         page = mem_map + clock;
 123         do {
 124                 priority--;
 125                 if (PageLocked(page))
 126                         goto next;
 127                 if (dma && !PageDMA(page))
 128                         goto next;
 129                 /* First of all, regenerate the page's referenced bit
 130                    from any buffers in the page */
 131                 bh = page->buffers;
 132                 if (bh) {
 133                         tmp = bh;
 134                         do {
 135                                 if (buffer_touched(tmp)) {
 136                                         clear_bit(BH_Touched, &tmp->b_state);
 137                                         set_bit(PG_referenced, &page->flags);
 138                                 }
 139                                 tmp = tmp->b_this_page;
 140                         } while (tmp != bh);
 141                 }
 142 
 143                 /* We can't throw away shared pages, but we do mark
 144                    them as referenced.  This relies on the fact that
 145                    no page is currently in both the page cache and the
 146                    buffer cache; we'd have to modify the following
 147                    test to allow for that case. */
 148 
 149                 switch (page->count) {
 150                         case 1:
 151                                 /* If it has been referenced recently, don't free it */
 152                                 if (clear_bit(PG_referenced, &page->flags))
 153                                         break;
 154 
 155                                 /* is it a page cache page? */
 156                                 if (page->inode) {
 157                                         remove_page_from_hash_queue(page);
 158                                         remove_page_from_inode_queue(page);
 159                                         free_page(page_address(page));
 160                                         return 1;
 161                                 }
 162 
 163                                 /* is it a buffer cache page? */
 164                                 if (bh && try_to_free_buffer(bh, &bh, 6))
 165                                         return 1;
 166                                 break;
 167 
 168                         default:
 169                                 /* more than one user: we can't throw it away */
 170                                 set_bit(PG_referenced, &page->flags);
 171                                 /* fall through */
 172                         case 0:
 173                                 /* nothing */
 174                 }
 175 next:
 176                 page++;
 177                 clock++;
 178                 if (clock >= limit) {
 179                         clock = 0;
 180                         page = mem_map;
 181                 }
 182         } while (priority > 0);
 183         return 0;
 184 }
 185 
 186 /*
 187  * This is called from try_to_swap_out() when we try to get rid of some
 188  * pages..  If we're unmapping the last occurrence of this page, we also
 189  * free it from the page hash-queues etc, as we don't want to keep it
 190  * in-core unnecessarily.
 191  */
 192 unsigned long page_unuse(unsigned long page)
 193 {
 194         struct page * p = mem_map + MAP_NR(page);
 195         int count = p->count;
 196 
 197         if (count != 2)
 198                 return count;
 199         if (!p->inode)
 200                 return count;
 201         remove_page_from_hash_queue(p);
 202         remove_page_from_inode_queue(p);
 203         free_page(page);
 204         return 1;
 205 }
 206 
 207 /*
 208  * Update a page cache copy, when we're doing a "write()" system call
 209  * See also "update_vm_cache()".
 210  */
 211 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 212 {
 213         unsigned long offset, len;
 214 
 215         offset = (pos & ~PAGE_MASK);
 216         pos = pos & PAGE_MASK;
 217         len = PAGE_SIZE - offset;
 218         do {
 219                 struct page * page;
 220 
 221                 if (len > count)
 222                         len = count;
 223                 page = find_page(inode, pos);
 224                 if (page) {
 225                         unsigned long addr;
 226 
 227                         wait_on_page(page);
 228                         addr = page_address(page);
 229                         memcpy((void *) (offset + addr), buf, len);
 230                         free_page(addr);
 231                 }
 232                 count -= len;
 233                 buf += len;
 234                 len = PAGE_SIZE;
 235                 offset = 0;
 236                 pos += PAGE_SIZE;
 237         } while (count);
 238 }
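/*
 * Illustrative sketch of a caller (a hypothetical filesystem write path,
 * not taken from this file): after copying user data into the buffer-cache
 * block that backs file position "pos", the write path calls
 * update_vm_cache() so that any cached copy of the same file page sees the
 * new data.  "bh", "block_offset", "buf" and "chunk" are made-up names for
 * this sketch.
 */
#if 0
	memcpy_fromfs(bh->b_data + block_offset, buf, chunk);	/* fill the buffer cache */
	mark_buffer_dirty(bh, 0);				/* schedule write-back */
	update_vm_cache(inode, pos, bh->b_data + block_offset, chunk);
	pos += chunk;
	buf += chunk;
#endif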
 239 
 240 static inline void add_to_page_cache(struct page * page,
 241         struct inode * inode, unsigned long offset)
 242 {
 243         page->count++;
 244         page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 245         page->offset = offset;
 246         add_page_to_inode_queue(inode, page);
 247         add_page_to_hash_queue(inode, page);
 248 }
 249 
 250 /*
 251  * Try to read ahead in the file. "page_cache" is a potentially free page
 252  * that we could use for the cache (if it is 0 we can try to create one,
 253  * this is all overlapped with the IO on the previous page finishing anyway)
 254  */
 255 static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
 256 {
 257         struct page * page;
 258 
 259         offset &= PAGE_MASK;
 260         if (!page_cache) {
 261                 page_cache = __get_free_page(GFP_KERNEL);
 262                 if (!page_cache)
 263                         return 0;
 264         }
 265         if (offset >= inode->i_size)
 266                 return page_cache;
 267 #if 1
 268         page = find_page(inode, offset);
 269         if (page) {
 270                 page->count--;
 271                 return page_cache;
 272         }
 273         /*
 274          * Ok, add the new page to the hash-queues...
 275          */
 276         page = mem_map + MAP_NR(page_cache);
 277         add_to_page_cache(page, inode, offset);
 278         inode->i_op->readpage(inode, page);
 279         free_page(page_cache);
 280         return 0;
 281 #else
 282         return page_cache;
 283 #endif
 284 }
 285 
 286 /* 
 287  * Wait for IO to complete on a locked page.
 288  */
 289 void __wait_on_page(struct page *page)
 290 {
 291         struct wait_queue wait = { current, NULL };
 292 
 293         page->count++;
 294         add_wait_queue(&page->wait, &wait);
 295 repeat:
 296         run_task_queue(&tq_disk);
 297         current->state = TASK_UNINTERRUPTIBLE;
 298         if (PageLocked(page)) {
 299                 schedule();
 300                 goto repeat;
 301         }
 302         remove_wait_queue(&page->wait, &wait);
 303         page->count--;
 304         current->state = TASK_RUNNING;
 305 }
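/*
 * For reference: the wait_on_page() calls used throughout this file are a
 * thin wrapper around __wait_on_page(); it is presumably defined along
 * these lines in <linux/pagemap.h> (a sketch, not a copy of that header).
 */
#if 0
extern inline void wait_on_page(struct page * page)
{
	if (PageLocked(page))
		__wait_on_page(page);
}
#endif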
 306 
 307 #if 0
 308 #define PROFILE_READAHEAD
 309 #define DEBUG_READAHEAD
 310 #endif
 311 
 312 /*
 313  * Read-ahead profiling information
 314  * ---------------------------------
 315  * Every PROFILE_MAXREADCOUNT reads, the following information is written
 316  * to the syslog:
 317  *   Percentage of asynchronous read-ahead.
 318  *   Average of the read-ahead context field values.
 319  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 320  * to the syslog.
 321  */
 322 
 323 #ifdef PROFILE_READAHEAD
 324 
 325 #define PROFILE_MAXREADCOUNT 1000
 326 
 327 static unsigned long total_reada;
 328 static unsigned long total_async;
 329 static unsigned long total_ramax;
 330 static unsigned long total_ralen;
 331 static unsigned long total_rawin;
 332 
 333 static void profile_readahead(int async, struct file *filp)
 334 {
 335         unsigned long flags;
 336 
 337         ++total_reada;
 338         if (async)
 339                 ++total_async;
 340 
 341         total_ramax     += filp->f_ramax;
 342         total_ralen     += filp->f_ralen;
 343         total_rawin     += filp->f_rawin;
 344 
 345         if (total_reada > PROFILE_MAXREADCOUNT) {
 346                 save_flags(flags);
 347                 cli();
 348                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 349                         restore_flags(flags);
 350                         return;
 351                 }
 352 
 353                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 354                         total_ramax/total_reada,
 355                         total_ralen/total_reada,
 356                         total_rawin/total_reada,
 357                         (total_async*100)/total_reada);
 358 #ifdef DEBUG_READAHEAD
 359                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
 360                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 361 #endif
 362 
 363                 total_reada     = 0;
 364                 total_async     = 0;
 365                 total_ramax     = 0;
 366                 total_ralen     = 0;
 367                 total_rawin     = 0;
 368 
 369                 restore_flags(flags);
 370         }
 371 }
 372 #endif  /* defined PROFILE_READAHEAD */
 373 
 374 /*
 375  * Read-ahead context:
 376  * -------------------
 377  * The read ahead context fields of the "struct file" are the following:
 378  * - f_raend : position of the first byte after the last page we tried to
 379  *             read ahead.
 380  * - f_ramax : current read-ahead maximum size.
 381  * - f_ralen : length of the current IO read block we tried to read-ahead.
 382  * - f_rawin : length of the current read-ahead window.
 383  *             if last read-ahead was synchronous then
 384  *                  f_rawin = f_ralen
 385  *             otherwise (was asynchronous)
 386  *                  f_rawin = previous value of f_ralen + f_ralen
 387  *
 388  * Read-ahead limits:
 389  * ------------------
 390  * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 391  * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 392  *
 393  * Synchronous read-ahead benefits:
 394  * --------------------------------
 395  * Using a reasonable IO transfer length for peripheral devices increases
 396  * system performance.
 397  * Reasonable means, in this context, not too large but not too small.
 398  * The actual maximum value is:
 399  *      MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 400  *      and 32k if it is defined.
 401  *
 402  * Asynchronous read-ahead benefits:
 403  * ---------------------------------
 404  * Overlapping the next read request with user process execution increases
 405  * system performance.
 406  *
 407  * Read-ahead risks:
 408  * -----------------
 409  * We have to guess which data the user process will need next.
 410  * If that data is often not really needed, it is bad for system
 411  * performance.
 412  * However, we know that files are often accessed sequentially by
 413  * application programs, and it seems possible to have a reasonably good
 414  * strategy for that guessing.
 415  * We only try to read ahead in files that seem to be read sequentially.
 416  *
 417  * Asynchronous read-ahead risks:
 418  * ------------------------------
 419  * In order to maximize overlapping, we must start asynchronous read
 420  * requests from the device as soon as possible.
 421  * We must be very careful about:
 422  * - The number of effective pending IO read requests.
 423  *   ONE seems to be the only reasonable value.
 424  * - The total memory pool usage for the file access stream.
 425  *   This maximum memory usage is implicitly 2 IO read chunks:
 426  *   2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 427  *   64k if defined.
 428  */
 429 
 430 #if 0 /* small readahead */
 431 #define MAX_READAHEAD (PAGE_SIZE*7)
 432 #define MIN_READAHEAD (PAGE_SIZE*2)
 433 #else
 434 #define MAX_READAHEAD (PAGE_SIZE*18)
 435 #define MIN_READAHEAD (PAGE_SIZE*3)
 436 #endif
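/*
 * Worked numbers (added for illustration), assuming PAGE_SIZE == 4096:
 * the large configuration gives MAX_READAHEAD = 18 * 4096 = 73728 bytes,
 * so one synchronous read-ahead chunk is at most MAX_READAHEAD + PAGE_SIZE
 * = 77824 bytes (76k), while the small configuration gives
 * 7 * 4096 + 4096 = 32768 bytes (32k).  The asynchronous scheme keeps at
 * most two such chunks per file stream, i.e. 2 * (MAX_READAHEAD + PAGE_SIZE)
 * bytes, which is the memory pool bound discussed in the comment above.
 */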
 437 
 438 static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
 439         unsigned long pos, struct page * page,
 440         unsigned long page_cache)
 441 {
 442         unsigned long max_ahead, ahead;
 443         unsigned long raend, ppos;
 444 
 445         ppos = pos & PAGE_MASK;
 446         raend = filp->f_raend & PAGE_MASK;
 447         max_ahead = 0;
 448 
 449 /*
 450  * The current page is locked.
 451  * If the current position is inside the previous read IO request, do not
 452  * try to reread previously read ahead pages.
 453  * Otherwise, decide whether or not to read ahead some pages synchronously.
 454  * If we are not going to read ahead, set the read ahead context for this 
 455  * page only.
 456  */
 457         if (PageLocked(page)) {
 458                 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
 459                         raend = ppos;
 460                         if (raend < inode->i_size)
 461                                 max_ahead = filp->f_ramax;
 462                         filp->f_rawin = 0;
 463                         filp->f_ralen = PAGE_SIZE;
 464                         if (!max_ahead) {
 465                                 filp->f_raend  = ppos + filp->f_ralen;
 466                                 filp->f_rawin += filp->f_ralen;
 467                         }
 468                 }
 469         }
 470 /*
 471  * The current page is not locked.
 472  * If we were reading ahead and,
 473  * if the current max read ahead size is not zero and,
 474  * if the current position is inside the last read-ahead IO request,
 475  *   it is the moment to try to read ahead asynchronously.
 476  * We will later force unplug device in order to force asynchronous read IO.
 477  */
 478         else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
 479                  ppos <= raend && ppos + filp->f_ralen >= raend) {
 480 /*
 481  * Add ONE page to max_ahead in order to try to have about the same IO max size
 482  * as synchronous read-ahead (MAX_READAHEAD + PAGE_SIZE).
 483  * Compute the position of the last page we have tried to read in order to 
 484  * begin to read ahead just at the next page.
 485  */
 486                 raend -= PAGE_SIZE;
 487                 if (raend < inode->i_size)
 488                         max_ahead = filp->f_ramax + PAGE_SIZE;
 489 
 490                 if (max_ahead) {
 491                         filp->f_rawin = filp->f_ralen;
 492                         filp->f_ralen = 0;
 493                         reada_ok      = 2;
 494                 }
 495         }
 496 /*
 497  * Try to read ahead pages.
 498  * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
 499  * scheduler will work well enough for us to avoid really bad IO requests.
 500  */
 501         ahead = 0;
 502         while (ahead < max_ahead) {
 503                 ahead += PAGE_SIZE;
 504                 page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
 505         }
 506 /*
 507  * If we tried to read ahead some pages,
 508  * If we tried to read ahead asynchronously,
 509  *   Try to force unplug of the device in order to start an asynchronous
 510  *   read IO request.
 511  * Update the read-ahead context.
 512  * Store the length of the current read-ahead window.
 513  * Double the current max read ahead size.
 514  *   That heuristic avoids doing large IO for files that are not really
 515  *   accessed sequentially.
 516  */
 517         if (ahead) {
 518                 if (reada_ok == 2) {
 519                         run_task_queue(&tq_disk);
 520                 }
 521 
 522                 filp->f_ralen += ahead;
 523                 filp->f_rawin += filp->f_ralen;
 524                 filp->f_raend = raend + ahead + PAGE_SIZE;
 525 
 526                 filp->f_ramax += filp->f_ramax;
 527 
 528                 if (filp->f_ramax > MAX_READAHEAD)
 529                         filp->f_ramax = MAX_READAHEAD;
 530 
 531 #ifdef PROFILE_READAHEAD
 532                 profile_readahead((reada_ok == 2), filp);
 533 #endif
 534         }
 535 
 536         return page_cache;
 537 }
 538 
 539 
 540 /*
 541  * This is a generic file read routine, and uses the
 542  * inode->i_op->readpage() function for the actual low-level
 543  * stuff.
 544  *
 545  * This is really ugly. But the goto's actually try to clarify some
 546  * of the logic when it comes to error handling etc.
 547  */
 548 
 549 int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
 550 {
 551         int error, read;
 552         unsigned long pos, ppos, page_cache;
 553         int reada_ok;
 554 
 555         if (count <= 0)
 556                 return 0;
 557 
 558         error = 0;
 559         read = 0;
 560         page_cache = 0;
 561 
 562         pos = filp->f_pos;
 563         ppos = pos & PAGE_MASK;
 564 /*
 565  * If the current position is outside the previous read-ahead window, 
 566  * we reset the current read-ahead context and set read ahead max to zero
 567  * (it will be set to just the value needed later);
 568  * otherwise, we assume that the file accesses are sequential enough to
 569  * continue read-ahead.
 570  */
 571         if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) {
 572                 reada_ok = 0;
 573                 filp->f_raend = 0;
 574                 filp->f_ralen = 0;
 575                 filp->f_ramax = 0;
 576                 filp->f_rawin = 0;
 577         } else {
 578                 reada_ok = 1;
 579         }
 580 /*
 581  * Adjust the current value of read-ahead max.
 582  * If the read operation stays within the first half page, force no readahead.
 583  * Otherwise, try to increase read ahead max just enough to satisfy the read request.
 584  * Then use at least MIN_READAHEAD if read ahead is ok,
 585  * and at most MAX_READAHEAD in all cases.
 586  */
 587         if (pos + count <= (PAGE_SIZE >> 1)) {
 588                 filp->f_ramax = 0;
 589         } else {
 590                 unsigned long needed;
 591 
 592                 needed = ((pos + count) & PAGE_MASK) - (pos & PAGE_MASK);
 593 
 594                 if (filp->f_ramax < needed)
 595                         filp->f_ramax = needed;
 596 
 597                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
 598                                 filp->f_ramax = MIN_READAHEAD;
 599                 if (filp->f_ramax > MAX_READAHEAD)
 600                         filp->f_ramax = MAX_READAHEAD;
 601         }
 602 
 603         for (;;) {
 604                 struct page *page;
 605                 unsigned long offset, addr, nr;
 606 
 607                 if (pos >= inode->i_size)
 608                         break;
 609                 offset = pos & ~PAGE_MASK;
 610                 nr = PAGE_SIZE - offset;
 611                 /*
 612                  * Try to find the data in the page cache..
 613                  */
 614                 page = find_page(inode, pos & PAGE_MASK);
 615                 if (page)
 616                         goto found_page;
 617 
 618                 /*
 619                  * Ok, it wasn't cached, so we need to create a new
 620                  * page..
 621                  */
 622                 if (page_cache)
 623                         goto new_page;
 624 
 625                 error = -ENOMEM;
 626                 page_cache = __get_free_page(GFP_KERNEL);
 627                 if (!page_cache)
 628                         break;
 629                 error = 0;
 630 
 631                 /*
 632                  * That could have slept, so we need to check again..
 633                  */
 634                 if (pos >= inode->i_size)
 635                         break;
 636                 page = find_page(inode, pos & PAGE_MASK);
 637                 if (!page)
 638                         goto new_page;
 639 
 640 found_page:
 641                 addr = page_address(page);
 642                 if (nr > count)
 643                         nr = count;
 644 /*
 645  * Try to read ahead only if the current page is filled or being filled.
 646  * Otherwise, if we were reading ahead, decrease max read ahead size to
 647  * the minimum value.
 648  * In this context, that seems to happen only after a read error or if
 649  * the page has been rewritten.
 650  */
 651                 if (PageUptodate(page) || PageLocked(page))
 652                         page_cache = generic_file_readahead(reada_ok, filp, inode, pos, page, page_cache);
 653                 else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
 654                                 filp->f_ramax = MIN_READAHEAD;
 655 
 656                 if (PageLocked(page))
 657                         __wait_on_page(page);
 658 
 659                 if (!PageUptodate(page))
 660                         goto read_page;
 661                 if (nr > inode->i_size - pos)
 662                         nr = inode->i_size - pos;
 663                 memcpy_tofs(buf, (void *) (addr + offset), nr);
 664                 free_page(addr);
 665                 buf += nr;
 666                 pos += nr;
 667                 read += nr;
 668                 count -= nr;
 669                 if (count)
 670                         continue;
 671                 break;
 672         
 673 
 674 new_page:
 675                 /*
 676                  * Ok, add the new page to the hash-queues...
 677                  */
 678                 addr = page_cache;
 679                 page = mem_map + MAP_NR(page_cache);
 680                 page_cache = 0;
 681                 add_to_page_cache(page, inode, pos & PAGE_MASK);
 682 
 683                 /*
 684                  * Error handling is tricky. If we get a read error,
 685                  * the cached page stays in the cache (but uptodate=0),
 686                  * and the next process that accesses it will try to
 687                  * re-read it. This is needed for NFS etc, where the
 688                  * identity of the reader can decide if we can read the
 689                  * page or not..
 690                  */
 691 read_page:
 692 /*
 693  * We have to read the page.
 694  * If we were reading ahead, we had previously tried to read this page.
 695  * That means that the page has probably been removed from the cache before
 696  * the application process needed it, or has been rewritten.
 697  * Decrease max readahead size to the minimum value in that situation.
 698  */
 699                 if (reada_ok && filp->f_ramax > MIN_READAHEAD)
 700                         filp->f_ramax = MIN_READAHEAD;
 701 
 702                 error = inode->i_op->readpage(inode, page);
 703                 if (!error) {
 704                         if (!PageError(page))
 705                                 goto found_page;
 706                         error = -EIO;
 707                 }
 708                 free_page(addr);
 709                 break;
 710         }
 711 
 712         filp->f_pos = pos;
 713         filp->f_reada = 1;
 714         if (page_cache)
 715                 free_page(page_cache);
 716         if (!IS_RDONLY(inode)) {
 717                 inode->i_atime = CURRENT_TIME;
 718                 inode->i_dirt = 1;
 719         }
 720         if (!read)
 721                 read = error;
 722         return read;
 723 }
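/*
 * Illustrative sketch (hypothetical "myfs", not from this file): a
 * filesystem that provides inode->i_op->readpage() can use this routine as
 * its read() file operation, either directly or through a trivial wrapper
 * like the one below.
 */
#if 0
static int myfs_file_read(struct inode * inode, struct file * filp,
	char * buf, int count)
{
	return generic_file_read(inode, filp, buf, count);
}
#endif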
 724 
 725 /*
 726  * Find a cached page and wait for it to become up-to-date, return
 727  * the page address.  Increments the page count.
 728  */
 729 static inline unsigned long fill_page(struct inode * inode, unsigned long offset)
 730 {
 731         struct page * page;
 732         unsigned long new_page;
 733 
 734         page = find_page(inode, offset);
 735         if (page)
 736                 goto found_page_dont_free;
 737         new_page = __get_free_page(GFP_KERNEL);
 738         page = find_page(inode, offset);
 739         if (page)
 740                 goto found_page;
 741         if (!new_page)
 742                 return 0;
 743         page = mem_map + MAP_NR(new_page);
 744         new_page = 0;
 745         add_to_page_cache(page, inode, offset);
 746         inode->i_op->readpage(inode, page);
 747         if (PageLocked(page))
 748                 new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
 749 found_page:
 750         if (new_page)
 751                 free_page(new_page);
 752 found_page_dont_free:
 753         wait_on_page(page);
 754         return page_address(page);
 755 }
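/*
 * Usage note (a sketch, not part of the original source): the address
 * returned by fill_page() carries a page reference, so a caller either
 * hands that reference on (as the shared-mapping path of filemap_nopage()
 * below does) or drops it with free_page() once the data has been used,
 * as in this hypothetical fragment ("offset" is a page-aligned file offset).
 */
#if 0
	unsigned long addr = fill_page(inode, offset);
	if (addr) {
		/* ... consume the now up-to-date page contents at "addr" ... */
		free_page(addr);	/* release the reference fill_page() took */
	}
#endif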
 756 
 757 /*
 758  * Semantics for shared and private memory areas are different past the end
 759  * of the file. A shared mapping past the last page of the file is an error
 760  * and results in a SIGBUS, while a private mapping just maps in a zero page.
 761  */
 762 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 763 {
 764         unsigned long offset;
 765         struct inode * inode = area->vm_inode;
 766         unsigned long page;
 767 
 768         offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 769         if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 770                 return 0;
 771 
 772         page = fill_page(inode, offset);
 773         if (page && no_share) {
 774                 unsigned long new_page = __get_free_page(GFP_KERNEL);
 775                 if (new_page) {
 776                         memcpy((void *) new_page, (void *) page, PAGE_SIZE);
 777                         flush_page_to_ram(new_page);
 778                 }
 779                 free_page(page);
 780                 return new_page;
 781         }
 782         if (page)
 783                 flush_page_to_ram(page);
 784         return page;
 785 }
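/*
 * Worked example of the offset computation above (made-up numbers, assuming
 * PAGE_SIZE == 4096): for a mapping with vm_start = 0x08000000 and
 * vm_offset = 0x2000, a fault at user address 0x08003444 asks the file for
 * the page at offset (0x08003000 - 0x08000000) + 0x2000 = 0x5000, i.e.
 * file bytes 0x5000..0x5fff.
 */
#if 0
	unsigned long offset = (0x08003444UL & PAGE_MASK) - 0x08000000UL + 0x2000UL;
	/* offset == 0x5000 */
#endif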
 786 
 787 /*
 788  * Tries to write a shared mapped page to its backing store. May return -EIO
 789  * if the disk is full.
 790  */
 791 static inline int do_write_page(struct inode * inode, struct file * file,
 792         const char * page, unsigned long offset)
 793 {
 794         int old_fs, retval;
 795         unsigned long size;
 796 
 797         size = offset + PAGE_SIZE;
 798         /* refuse to extend file size.. */
 799         if (S_ISREG(inode->i_mode)) {
 800                 if (size > inode->i_size)
 801                         size = inode->i_size;
 802                 /* Ho humm.. We should have tested for this earlier */
 803                 if (size < offset)
 804                         return -EIO;
 805         }
 806         size -= offset;
 807         old_fs = get_fs();
 808         set_fs(KERNEL_DS);
 809         retval = -EIO;
 810         if (size == file->f_op->write(inode, file, (const char *) page, size))
 811                 retval = 0;
 812         set_fs(old_fs);
 813         return retval;
 814 }
 815 
 816 static int filemap_write_page(struct vm_area_struct * vma,
 817         unsigned long offset,
 818         unsigned long page)
 819 {
 820         int result;
 821         struct file file;
 822         struct inode * inode;
 823         struct buffer_head * bh;
 824 
 825         bh = mem_map[MAP_NR(page)].buffers;
 826         if (bh) {
 827                 /* whee.. just mark the buffer heads dirty */
 828                 struct buffer_head * tmp = bh;
 829                 do {
 830                         mark_buffer_dirty(tmp, 0);
 831                         tmp = tmp->b_this_page;
 832                 } while (tmp != bh);
 833                 return 0;
 834         }
 835 
 836         inode = vma->vm_inode;
 837         file.f_op = inode->i_op->default_file_ops;
 838         if (!file.f_op->write)
 839                 return -EIO;
 840         file.f_mode = 3;
 841         file.f_flags = 0;
 842         file.f_count = 1;
 843         file.f_inode = inode;
 844         file.f_pos = offset;
 845         file.f_reada = 0;
 846 
 847         down(&inode->i_sem);
 848         result = do_write_page(inode, &file, (const char *) page, offset);
 849         up(&inode->i_sem);
 850         return result;
 851 }
 852 
 853 
 854 /*
 855  * Swapping to a shared file: while we're busy writing out the page
 856  * (and the page still exists in memory), we save the page information
 857  * in the page table, so that "filemap_swapin()" can re-use the page
 858  * immediately if it is called while we're busy swapping it out..
 859  *
 860  * Once we've written it all out, we mark the page entry "empty", which
 861  * will result in a normal page-in (instead of a swap-in) from the now
 862  * up-to-date disk file.
 863  */
 864 int filemap_swapout(struct vm_area_struct * vma,
 865         unsigned long offset,
 866         pte_t *page_table)
 867 {
 868         int error;
 869         unsigned long page = pte_page(*page_table);
 870         unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
 871 
 872         flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 873         set_pte(page_table, __pte(entry));
 874         flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 875         error = filemap_write_page(vma, offset, page);
 876         if (pte_val(*page_table) == entry)
 877                 pte_clear(page_table);
 878         return error;
 879 }
 880 
 881 /*
 882  * filemap_swapin() is called only if we have something in the page
 883  * tables that is non-zero (but not present), which we know to be the
 884  * page index of a page that is busy being swapped out (see above).
 885  * So we just use it directly..
 886  */
 887 static pte_t filemap_swapin(struct vm_area_struct * vma,
 888         unsigned long offset,
 889         unsigned long entry)
 890 {
 891         unsigned long page = SWP_OFFSET(entry);
 892 
 893         mem_map[page].count++;
 894         page = (page << PAGE_SHIFT) + PAGE_OFFSET;
 895         return mk_pte(page,vma->vm_page_prot);
 896 }
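/*
 * Illustrative sketch (not part of the original file) of the round trip
 * between filemap_swapout() and filemap_swapin() above: the "swap entry"
 * stored in the pte is just the mem_map index of the page being written
 * out, so swapin can recover the very same physical page while the
 * write-out is still in flight.
 */
#if 0
	unsigned long page  = pte_page(*page_table);			/* physical address */
	unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));	/* what swapout stores */
	unsigned long nr    = SWP_OFFSET(entry);			/* == MAP_NR(page)   */
	unsigned long addr  = (nr << PAGE_SHIFT) + PAGE_OFFSET;		/* == page again     */
#endif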
 897 
 898 
 899 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 900         unsigned long address, unsigned int flags)
 901 {
 902         pte_t pte = *ptep;
 903         unsigned long page;
 904         int error;
 905 
 906         if (!(flags & MS_INVALIDATE)) {
 907                 if (!pte_present(pte))
 908                         return 0;
 909                 if (!pte_dirty(pte))
 910                         return 0;
 911                 flush_page_to_ram(pte_page(pte));
 912                 flush_cache_page(vma, address);
 913                 set_pte(ptep, pte_mkclean(pte));
 914                 flush_tlb_page(vma, address);
 915                 page = pte_page(pte);
 916                 mem_map[MAP_NR(page)].count++;
 917         } else {
 918                 if (pte_none(pte))
 919                         return 0;
 920                 flush_cache_page(vma, address);
 921                 pte_clear(ptep);
 922                 flush_tlb_page(vma, address);
 923                 if (!pte_present(pte)) {
 924                         swap_free(pte_val(pte));
 925                         return 0;
 926                 }
 927                 page = pte_page(pte);
 928                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 929                         free_page(page);
 930                         return 0;
 931                 }
 932         }
 933         error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 934         free_page(page);
 935         return error;
 936 }
 937 
 938 static inline int filemap_sync_pte_range(pmd_t * pmd,
 939         unsigned long address, unsigned long size, 
 940         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
 941 {
 942         pte_t * pte;
 943         unsigned long end;
 944         int error;
 945 
 946         if (pmd_none(*pmd))
 947                 return 0;
 948         if (pmd_bad(*pmd)) {
 949                 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
 950                 pmd_clear(pmd);
 951                 return 0;
 952         }
 953         pte = pte_offset(pmd, address);
 954         offset += address & PMD_MASK;
 955         address &= ~PMD_MASK;
 956         end = address + size;
 957         if (end > PMD_SIZE)
 958                 end = PMD_SIZE;
 959         error = 0;
 960         do {
 961                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
 962                 address += PAGE_SIZE;
 963                 pte++;
 964         } while (address < end);
 965         return error;
 966 }
 967 
 968 static inline int filemap_sync_pmd_range(pgd_t * pgd,
 969         unsigned long address, unsigned long size, 
 970         struct vm_area_struct *vma, unsigned int flags)
 971 {
 972         pmd_t * pmd;
 973         unsigned long offset, end;
 974         int error;
 975 
 976         if (pgd_none(*pgd))
 977                 return 0;
 978         if (pgd_bad(*pgd)) {
 979                 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
 980                 pgd_clear(pgd);
 981                 return 0;
 982         }
 983         pmd = pmd_offset(pgd, address);
 984         offset = address & PGDIR_MASK;
 985         address &= ~PGDIR_MASK;
 986         end = address + size;
 987         if (end > PGDIR_SIZE)
 988                 end = PGDIR_SIZE;
 989         error = 0;
 990         do {
 991                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
 992                 address = (address + PMD_SIZE) & PMD_MASK;
 993                 pmd++;
 994         } while (address < end);
 995         return error;
 996 }
 997 
 998 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
 999         size_t size, unsigned int flags)
1000 {
1001         pgd_t * dir;
1002         unsigned long end = address + size;
1003         int error = 0;
1004 
1005         dir = pgd_offset(vma->vm_mm, address);
1006         flush_cache_range(vma->vm_mm, end - size, end);
1007         while (address < end) {
1008                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1009                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1010                 dir++;
1011         }
1012         flush_tlb_range(vma->vm_mm, end - size, end);
1013         return error;
1014 }
1015 
1016 /*
1017  * This handles (potentially partial) area unmaps..
1018  */
1019 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1020 {
1021         filemap_sync(vma, start, len, MS_ASYNC);
1022 }
1023 
1024 /*
1025  * Shared mappings need to be able to do the right thing at
1026  * close/unmap/sync. They will also use the private file as
1027  * backing-store for swapping..
1028  */
1029 static struct vm_operations_struct file_shared_mmap = {
1030         NULL,                   /* no special open */
1031         NULL,                   /* no special close */
1032         filemap_unmap,          /* unmap - we need to sync the pages */
1033         NULL,                   /* no special protect */
1034         filemap_sync,           /* sync */
1035         NULL,                   /* advise */
1036         filemap_nopage,         /* nopage */
1037         NULL,                   /* wppage */
1038         filemap_swapout,        /* swapout */
1039         filemap_swapin,         /* swapin */
1040 };
1041 
1042 /*
1043  * Private mappings just need to be able to load in the map.
1044  *
1045  * (This is actually used for shared mappings as well, if we
1046  * know they can't ever get write permissions..)
1047  */
1048 static struct vm_operations_struct file_private_mmap = {
1049         NULL,                   /* open */
1050         NULL,                   /* close */
1051         NULL,                   /* unmap */
1052         NULL,                   /* protect */
1053         NULL,                   /* sync */
1054         NULL,                   /* advise */
1055         filemap_nopage,         /* nopage */
1056         NULL,                   /* wppage */
1057         NULL,                   /* swapout */
1058         NULL,                   /* swapin */
1059 };
1060 
1061 /* This is used for a general mmap of a disk file */
1062 int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
1063 {
1064         struct vm_operations_struct * ops;
1065 
1066         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1067                 ops = &file_shared_mmap;
1068                 /* share_page() can only guarantee proper page sharing if
1069                  * the offsets are all page aligned. */
1070                 if (vma->vm_offset & (PAGE_SIZE - 1))
1071                         return -EINVAL;
1072         } else {
1073                 ops = &file_private_mmap;
1074                 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1075                         return -EINVAL;
1076         }
1077         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1078                 return -EACCES;
1079         if (!inode->i_op || !inode->i_op->readpage)
1080                 return -ENOEXEC;
1081         if (!IS_RDONLY(inode)) {
1082                 inode->i_atime = CURRENT_TIME;
1083                 inode->i_dirt = 1;
1084         }
1085         vma->vm_inode = inode;
1086         inode->i_count++;
1087         vma->vm_ops = ops;
1088         return 0;
1089 }
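/*
 * Illustrative sketch (hypothetical "myfs", not from this file): a
 * filesystem whose inode operations provide readpage() can use
 * generic_file_mmap() as its mmap file operation, directly or via a
 * wrapper like this one.
 */
#if 0
static int myfs_mmap(struct inode * inode, struct file * file,
	struct vm_area_struct * vma)
{
	return generic_file_mmap(inode, file, vma);
}
#endif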
1090 
1091 
1092 /*
1093  * The msync() system call.
1094  */
1095 
1096 static int msync_interval(struct vm_area_struct * vma,
1097         unsigned long start, unsigned long end, int flags)
1098 {
1099         if (!vma->vm_inode)
1100                 return 0;
1101         if (vma->vm_ops->sync) {
1102                 int error;
1103                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1104                 if (error)
1105                         return error;
1106                 if (flags & MS_SYNC)
1107                         return file_fsync(vma->vm_inode, NULL);
1108                 return 0;
1109         }
1110         return 0;
1111 }
1112 
1113 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1114 {
1115         unsigned long end;
1116         struct vm_area_struct * vma;
1117         int unmapped_error, error;
1118 
1119         if (start & ~PAGE_MASK)
1120                 return -EINVAL;
1121         len = (len + ~PAGE_MASK) & PAGE_MASK;
1122         end = start + len;
1123         if (end < start)
1124                 return -EINVAL;
1125         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1126                 return -EINVAL;
1127         if (end == start)
1128                 return 0;
1129         /*
1130          * If the interval [start,end) covers some unmapped address ranges,
1131          * just ignore them, but return -EFAULT at the end.
1132          */
1133         vma = find_vma(current, start);
1134         unmapped_error = 0;
1135         for (;;) {
1136                 /* Still start < end. */
1137                 if (!vma)
1138                         return -EFAULT;
1139                 /* Here start < vma->vm_end. */
1140                 if (start < vma->vm_start) {
1141                         unmapped_error = -EFAULT;
1142                         start = vma->vm_start;
1143                 }
1144                 /* Here vma->vm_start <= start < vma->vm_end. */
1145                 if (end <= vma->vm_end) {
1146                         if (start < end) {
1147                                 error = msync_interval(vma, start, end, flags);
1148                                 if (error)
1149                                         return error;
1150                         }
1151                         return unmapped_error;
1152                 }
1153                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1154                 error = msync_interval(vma, start, vma->vm_end, flags);
1155                 if (error)
1156                         return error;
1157                 start = vma->vm_end;
1158                 vma = vma->vm_next;
1159         }
1160 }
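/*
 * Illustrative user-space sketch (not part of this file) of how the system
 * call above is typically driven; "fd" and "len" are assumed to describe an
 * already-open file of at least "len" bytes.
 */
#if 0
#include <sys/mman.h>

	char *map = mmap(0, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	/* ... modify the shared mapping ... */
	msync(map, len, MS_SYNC);	/* sync the range, then fsync the file */
	munmap(map, len);
#endif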
