This source file includes the following definitions:
- invalidate_inode_pages
- truncate_inode_pages
- shrink_mmap
- page_unuse
- update_vm_cache
- add_to_page_cache
- try_to_read_ahead
- __wait_on_page
- profile_readahead
- generic_file_readahead
- generic_file_read
- fill_page
- filemap_nopage
- do_write_page
- filemap_write_page
- filemap_swapout
- filemap_swapin
- filemap_sync_pte
- filemap_sync_pte_range
- filemap_sync_pmd_range
- filemap_sync
- filemap_unmap
- generic_file_mmap
- msync_interval
- sys_msync
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/errno.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>

#include <asm/segment.h>
#include <asm/system.h>
#include <asm/pgtable.h>

/*
 * The page cache: page_hash_table gives fast lookup of a cached page
 * by (inode, offset), and page_cache_size counts how many pages the
 * cache currently holds.
 */
unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];

/*
 * Invalidate the pages of an inode, removing all pages that can be
 * dropped immediately.  Locked pages are skipped: they are still in
 * use, typically by I/O in flight.
 */
void invalidate_inode_pages(struct inode * inode)
{
        struct page ** p;
        struct page * page;

        p = &inode->i_pages;
        while ((page = *p) != NULL) {
                if (PageLocked(page)) {
                        p = &page->next;
                        continue;
                }
                inode->i_nrpages--;
                if ((*p = page->next) != NULL)
                        (*p)->prev = page->prev;
                page->dirty = 0;
                page->next = NULL;
                page->prev = NULL;
                remove_page_from_hash_queue(page);
                page->inode = NULL;
                free_page(page_address(page));
                continue;
        }
}

/*
 * Truncate the page cache at a set offset, removing the pages that
 * are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
        struct page ** p;
        struct page * page;

repeat:
        p = &inode->i_pages;
        while ((page = *p) != NULL) {
                unsigned long offset = page->offset;

                /* page wholly truncated - free it */
                if (offset >= start) {
                        if (PageLocked(page)) {
                                wait_on_page(page);
                                goto repeat;
                        }
                        inode->i_nrpages--;
                        if ((*p = page->next) != NULL)
                                (*p)->prev = page->prev;
                        page->dirty = 0;
                        page->next = NULL;
                        page->prev = NULL;
                        remove_page_from_hash_queue(page);
                        page->inode = NULL;
                        free_page(page_address(page));
                        continue;
                }
                p = &page->next;
                offset = start - offset;
                /* partial truncate, clear the end of the page */
                if (offset < PAGE_SIZE) {
                        memset((void *) (offset + page_address(page)), 0, PAGE_SIZE - offset);
                        flush_page_to_ram(page_address(page));
                }
        }
}

/*
 * Try to free a page by shrinking the page/buffer caches.  "clock"
 * implements a clock (second-chance) sweep over mem_map: recently
 * referenced pages get their reference bit cleared and survive one
 * pass, unreferenced single-user pages are freed.  Returns 1 as soon
 * as one page has been freed, 0 if the scan budget ran out.
 */
int shrink_mmap(int priority, int dma)
{
        static int clock = 0;
        struct page * page;
        unsigned long limit = MAP_NR(high_memory);
        struct buffer_head *tmp, *bh;

        priority = (limit<<2) >> priority;
        page = mem_map + clock;
        do {
                priority--;
                if (PageLocked(page))
                        goto next;
                if (dma && !PageDMA(page))
                        goto next;

                /* First of all, regenerate the page's referenced bit
                   from any buffers in the page */
                bh = page->buffers;
                if (bh) {
                        tmp = bh;
                        do {
                                if (buffer_touched(tmp)) {
                                        clear_bit(BH_Touched, &tmp->b_state);
                                        set_bit(PG_referenced, &page->flags);
                                }
                                tmp = tmp->b_this_page;
                        } while (tmp != bh);
                }

                /* We can't throw away shared pages, but we do mark
                   them as referenced.  Only single-user pages can be
                   freed here. */
                switch (page->count) {
                        case 1:
                                /* If it has been referenced recently, don't free it */
                                if (clear_bit(PG_referenced, &page->flags))
                                        break;

                                /* is it a page-cache page? */
                                if (page->inode) {
                                        remove_page_from_hash_queue(page);
                                        remove_page_from_inode_queue(page);
                                        free_page(page_address(page));
                                        return 1;
                                }

                                /* is it a buffer-cache page? */
                                if (bh && try_to_free_buffer(bh, &bh, 6))
                                        return 1;
                                break;

                        default:
                                /* more than one user: we can't throw it away */
                                set_bit(PG_referenced, &page->flags);
                                /* fall through */
                        case 0:
                                /* free page: nothing to do */
                                break;
                }
next:
                page++;
                clock++;
                if (clock >= limit) {
                        clock = 0;
                        page = mem_map;
                }
        } while (priority > 0);
        return 0;
}
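
/*
 * Illustrative caller (a sketch, not part of this file; the function
 * name is an assumption): under memory pressure the swap-out code
 * calls shrink_mmap() with successively lower "priority" values, so
 * each attempt scans a larger slice of mem_map:
 *
 *	static int reclaim_one_page(int dma)
 *	{
 *		int priority;
 *
 *		for (priority = 6; priority >= 0; priority--)
 *			if (shrink_mmap(priority, dma))
 *				return 1;
 *		return 0;
 *	}
 */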

/*
 * Called by the swap-out code when it tries to get rid of a page.
 * count == 2 means the page cache and our caller hold the only
 * references, so we can simply drop the page from the cache instead
 * of writing it to swap.  Returns 1 if the page was freed, otherwise
 * the remaining use count.
 */
unsigned long page_unuse(unsigned long page)
{
        struct page * p = mem_map + MAP_NR(page);
        int count = p->count;

        if (count != 2)
                return count;
        if (!p->inode)
                return count;
        remove_page_from_hash_queue(p);
        remove_page_from_inode_queue(p);
        free_page(page);
        return 1;
}

/*
 * Update a page-cache copy after a write() to the file, so that
 * later reads through the cache see the newly written data.
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
        unsigned long offset, len;

        offset = (pos & ~PAGE_MASK);
        pos = pos & PAGE_MASK;
        len = PAGE_SIZE - offset;
        do {
                struct page * page;

                if (len > count)
                        len = count;
                page = find_page(inode, pos);
                if (page) {
                        unsigned long addr;

                        wait_on_page(page);
                        addr = page_address(page);
                        memcpy((void *) (offset + addr), buf, len);
                        free_page(addr);        /* drop the reference find_page() took */
                }
                count -= len;
                buf += len;
                len = PAGE_SIZE;
                offset = 0;
                pos += PAGE_SIZE;
        } while (count);
}
239
240 static inline void add_to_page_cache(struct page * page,
241 struct inode * inode, unsigned long offset)
242 {
243 page->count++;
244 page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
245 page->offset = offset;
246 add_page_to_inode_queue(inode, page);
247 add_page_to_hash_queue(inode, page);
248 }
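
/*
 * After add_to_page_cache() a page is reachable two ways: through the
 * global page_hash_table, keyed on (inode, offset), which is what
 * find_page() searches, and through the inode's own i_pages list,
 * which is what invalidate_inode_pages() and truncate_inode_pages()
 * above walk.
 */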

/*
 * Try to read ahead in the file.  "page_cache" is a potentially free
 * page that we could use for the read-ahead (or zero, in which case
 * we try to allocate one).  Returns a spare page for the caller to
 * reuse or free, or zero if the page was consumed by the read-ahead.
 */
static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
{
        struct page * page;

        offset &= PAGE_MASK;
        if (!page_cache) {
                page_cache = __get_free_page(GFP_KERNEL);
                if (!page_cache)
                        return 0;
        }
        if (offset >= inode->i_size)
                return page_cache;
#if 1
        page = find_page(inode, offset);
        if (page) {
                /* already cached: drop the reference find_page() took */
                page->count--;
                return page_cache;
        }
        /* not cached: insert the spare page and start the read */
        page = mem_map + MAP_NR(page_cache);
        add_to_page_cache(page, inode, offset);
        inode->i_op->readpage(inode, page);
        free_page(page_cache);
        return 0;
#else
        return page_cache;
#endif
}

/*
 * Wait for a page to become unlocked (ie for the I/O on it to
 * complete).  The extra count protects the page from being freed
 * while we sleep on it.
 */
void __wait_on_page(struct page *page)
{
        struct wait_queue wait = { current, NULL };

        page->count++;
        add_wait_queue(&page->wait, &wait);
repeat:
        run_task_queue(&tq_disk);
        current->state = TASK_UNINTERRUPTIBLE;
        if (PageLocked(page)) {
                schedule();
                goto repeat;
        }
        remove_wait_queue(&page->wait, &wait);
        page->count--;
        current->state = TASK_RUNNING;
}
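
/*
 * Most callers go through the wait_on_page() wrapper (declared in
 * <linux/pagemap.h>), which only drops into the slow path above when
 * the page is actually locked; roughly:
 *
 *	static inline void wait_on_page(struct page * page)
 *	{
 *		if (PageLocked(page))
 *			__wait_on_page(page);
 *	}
 */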

#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif

/*
 * Define PROFILE_READAHEAD above to collect and periodically print
 * averages of the read-ahead state (window size, length, async
 * share); DEBUG_READAHEAD additionally dumps a snapshot of the
 * current file's read-ahead fields.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
        unsigned long flags;

        ++total_reada;
        if (async)
                ++total_async;

        total_ramax += filp->f_ramax;
        total_ralen += filp->f_ralen;
        total_rawin += filp->f_rawin;

        if (total_reada > PROFILE_MAXREADCOUNT) {
                save_flags(flags);
                cli();
                /* re-check with interrupts off: the counters may have
                   just been printed and reset */
                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
                        restore_flags(flags);
                        return;
                }

                printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
                        total_ramax/total_reada,
                        total_ralen/total_reada,
                        total_rawin/total_reada,
                        (total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

                total_reada = 0;
                total_async = 0;
                total_ramax = 0;
                total_ralen = 0;
                total_rawin = 0;

                restore_flags(flags);
        }
}
#endif

/*
 * Read-ahead context, kept in the struct file:
 *
 *	f_raend - position just past the current read-ahead window
 *	f_ramax - current maximum size of a read-ahead request
 *	f_ralen - length of the current read-ahead block
 *	f_rawin - length of the whole window (previous block plus
 *		  current block)
 *
 * generic_file_readahead() below grows f_ramax exponentially while
 * reads stay sequential, and generic_file_read() resets the whole
 * context when the access pattern turns out not to be sequential.
 */

#if 0   /* conservative settings */
#define MAX_READAHEAD (PAGE_SIZE*7)
#define MIN_READAHEAD (PAGE_SIZE*2)
#else   /* aggressive settings (default) */
#define MAX_READAHEAD (PAGE_SIZE*18)
#define MIN_READAHEAD (PAGE_SIZE*3)
#endif

static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
        unsigned long pos, struct page * page,
        unsigned long page_cache)
{
        unsigned long max_ahead, ahead;
        unsigned long raend, ppos;

        ppos = pos & PAGE_MASK;
        raend = filp->f_raend & PAGE_MASK;
        max_ahead = 0;

        /*
         * The current page is locked: a read is in progress at ppos.
         * If the position is still inside the previous read-ahead
         * window, nothing more is needed; otherwise start a new
         * window here (synchronous read-ahead).
         */
        if (PageLocked(page)) {
                if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
                        raend = ppos;
                        if (raend < inode->i_size)
                                max_ahead = filp->f_ramax;
                        filp->f_rawin = 0;
                        filp->f_ralen = PAGE_SIZE;
                        if (!max_ahead) {
                                filp->f_raend = ppos + filp->f_ralen;
                                filp->f_rawin += filp->f_ralen;
                        }
                }
        }
        /*
         * The current page is not locked: it was read ahead earlier
         * and is up to date.  If the reader has nearly consumed the
         * previous window, issue the next block of asynchronous
         * read-ahead while the process keeps computing.
         */
        else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
                 ppos <= raend && ppos + filp->f_ralen >= raend) {
                raend -= PAGE_SIZE;
                if (raend < inode->i_size)
                        max_ahead = filp->f_ramax + PAGE_SIZE;

                if (max_ahead) {
                        filp->f_rawin = filp->f_ralen;
                        filp->f_ralen = 0;
                        reada_ok = 2;   /* asynchronous read-ahead */
                }
        }

        /*
         * Issue the read-ahead requests.  try_to_read_ahead() either
         * consumes the spare page in page_cache or hands it back.
         */
        ahead = 0;
        while (ahead < max_ahead) {
                ahead += PAGE_SIZE;
                page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
        }

        /*
         * If read-ahead was done, update the window bookkeeping and
         * grow the maximum read-ahead size exponentially, up to the
         * MAX_READAHEAD limit.  For asynchronous read-ahead, kick the
         * disk queue so the I/O actually starts now.
         */
        if (ahead) {
                if (reada_ok == 2) {
                        run_task_queue(&tq_disk);
                }

                filp->f_ralen += ahead;
                filp->f_rawin += filp->f_ralen;
                filp->f_raend = raend + ahead + PAGE_SIZE;

                filp->f_ramax += filp->f_ramax;

                if (filp->f_ramax > MAX_READAHEAD)
                        filp->f_ramax = MAX_READAHEAD;

#ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
#endif
        }

        return page_cache;
}
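
/*
 * Worked example: a process reads a file sequentially in page-sized
 * chunks.  The first cache miss starts a window, and every time the
 * read position catches up with the window end the next block is read
 * ahead asynchronously while f_ramax doubles; with the defaults above
 * the window grows 3, 6, 12 pages and is then clamped at
 * MAX_READAHEAD (18 pages), after which the reader rarely has to
 * block on disk I/O.
 */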

/*
 * This is a generic file read routine: it does the page-cache lookup,
 * the read-ahead, and the copy to user space, and uses the
 * inode->i_op->readpage() function for the actual low-level per-page
 * I/O.
 */
int generic_file_read(struct inode * inode, struct file * filp, char * buf, int count)
{
        int error, read;
        unsigned long pos, ppos, page_cache;
        int reada_ok;

        if (count <= 0)
                return 0;

        error = 0;
        read = 0;
        page_cache = 0;

        pos = filp->f_pos;
        ppos = pos & PAGE_MASK;

        /*
         * If the current position is inside the previous read-ahead
         * window, we can keep reading ahead; otherwise the access is
         * not sequential, so reset the read-ahead context.
         */
        if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) {
                reada_ok = 0;
                filp->f_raend = 0;
                filp->f_ralen = 0;
                filp->f_ramax = 0;
                filp->f_rawin = 0;
        } else {
                reada_ok = 1;
        }

        /*
         * Adjust the maximum read-ahead size: small reads near the
         * start of a page get none, otherwise it must at least cover
         * the pages this read needs, bounded by MIN/MAX_READAHEAD.
         */
        if (pos + count <= (PAGE_SIZE >> 1)) {
                filp->f_ramax = 0;
        } else {
                unsigned long needed;

                needed = ((pos + count) & PAGE_MASK) - (pos & PAGE_MASK);

                if (filp->f_ramax < needed)
                        filp->f_ramax = needed;

                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;
                if (filp->f_ramax > MAX_READAHEAD)
                        filp->f_ramax = MAX_READAHEAD;
        }

        for (;;) {
                struct page *page;
                unsigned long offset, addr, nr;

                if (pos >= inode->i_size)
                        break;
                offset = pos & ~PAGE_MASK;
                nr = PAGE_SIZE - offset;

                /* Try to find the data in the page cache. */
                page = find_page(inode, pos & PAGE_MASK);
                if (page)
                        goto found_page;

                /*
                 * Not cached: we need a free page to read into.
                 * Reuse the spare one if we have it, otherwise
                 * allocate one.
                 */
                if (page_cache)
                        goto new_page;

                error = -ENOMEM;
                page_cache = __get_free_page(GFP_KERNEL);
                if (!page_cache)
                        break;
                error = 0;

                /*
                 * The allocation may have slept, so the page may have
                 * appeared in the cache (and the file may have been
                 * truncated) meanwhile - re-check both.
                 */
                if (pos >= inode->i_size)
                        break;
                page = find_page(inode, pos & PAGE_MASK);
                if (!page)
                        goto new_page;

found_page:
                addr = page_address(page);
                if (nr > count)
                        nr = count;

                /*
                 * Try to read ahead while the wanted page is involved
                 * in I/O or already up to date.  If we already know
                 * the page has to be read synchronously, keep the
                 * read-ahead modest instead.
                 */
                if (PageUptodate(page) || PageLocked(page))
                        page_cache = generic_file_readahead(reada_ok, filp, inode, pos, page, page_cache);
                else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;

                if (PageLocked(page))
                        __wait_on_page(page);

                if (!PageUptodate(page))
                        goto read_page;
                if (nr > inode->i_size - pos)
                        nr = inode->i_size - pos;
                memcpy_tofs(buf, (void *) (addr + offset), nr);
                free_page(addr);
                buf += nr;
                pos += nr;
                read += nr;
                count -= nr;
                if (count)
                        continue;
                break;

new_page:
                /*
                 * Insert the spare page into the cache at this
                 * offset; it is filled by readpage() below.
                 */
                addr = page_cache;
                page = mem_map + MAP_NR(page_cache);
                page_cache = 0;
                add_to_page_cache(page, inode, pos & PAGE_MASK);

                /* fall through to read the page */

read_page:
                /*
                 * We have to read the page synchronously, so any
                 * speculative read-ahead should stay small.
                 */
                if (reada_ok && filp->f_ramax > MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;

                error = inode->i_op->readpage(inode, page);
                if (!error) {
                        if (!PageError(page))
                                goto found_page;
                        error = -EIO;
                }
                free_page(addr);
                break;
        }

        filp->f_pos = pos;
        filp->f_reada = 1;
        if (page_cache)
                free_page(page_cache);
        if (!IS_RDONLY(inode)) {
                inode->i_atime = CURRENT_TIME;
                inode->i_dirt = 1;
        }
        if (!read)
                read = error;   /* nothing read: report the error, if any */
        return read;
}
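
/*
 * Illustrative use (a sketch; the "example" names are assumptions,
 * the pattern follows ext2 in this kernel generation): a filesystem
 * reuses generic_file_read directly as its read() file operation and
 * only has to supply a readpage() inode operation for the low-level
 * block I/O, roughly:
 *
 *	static struct file_operations example_file_ops = {
 *		NULL,			lseek - default
 *		generic_file_read,	read through the page cache
 *		example_file_write,	write
 *		...
 *	};
 */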

/*
 * Find a cached page (reading it in and starting read-ahead if it
 * was not cached yet), wait until it is unlocked, and return its
 * address.  The returned page carries an extra reference that the
 * caller must drop.
 */
static inline unsigned long fill_page(struct inode * inode, unsigned long offset)
{
        struct page * page;
        unsigned long new_page;

        page = find_page(inode, offset);
        if (page)
                goto found_page_dont_free;
        new_page = __get_free_page(GFP_KERNEL);
        page = find_page(inode, offset);        /* the allocation may have slept - re-check */
        if (page)
                goto found_page;
        if (!new_page)
                return 0;
        page = mem_map + MAP_NR(new_page);
        new_page = 0;
        add_to_page_cache(page, inode, offset);
        inode->i_op->readpage(inode, page);
        if (PageLocked(page))
                new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
found_page:
        if (new_page)
                free_page(new_page);
found_page_dont_free:
        wait_on_page(page);
        return page_address(page);
}

/*
 * Semantics for shared and private memory areas are different past
 * the end of the file: a shared mapping past the last page of the
 * file returns 0 here (which the fault handler turns into a SIGBUS),
 * while a private mapping just maps in a page of zeroes.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
        unsigned long offset;
        struct inode * inode = area->vm_inode;
        unsigned long page;

        offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
        if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
                return 0;

        page = fill_page(inode, offset);
        if (page && no_share) {
                /* private mapping that must not share the cached
                   page: hand out a copy instead */
                unsigned long new_page = __get_free_page(GFP_KERNEL);
                if (new_page) {
                        memcpy((void *) new_page, (void *) page, PAGE_SIZE);
                        flush_page_to_ram(new_page);
                }
                free_page(page);
                return new_page;
        }
        flush_page_to_ram(page);
        return page;
}

/*
 * Write a page of data back to the inode.  Used by the shared-mapping
 * code: the page was modified through the mapping and is pushed out
 * here through the filesystem's write routine.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
        const char * page, unsigned long offset)
{
        int old_fs, retval;
        unsigned long size;

        size = offset + PAGE_SIZE;
        /* refuse to extend the file: a regular file is only written
           up to its current end */
        if (S_ISREG(inode->i_mode)) {
                if (size > inode->i_size)
                        size = inode->i_size;
                /* the page lies entirely beyond the end of the file */
                if (size < offset)
                        return -EIO;
        }
        size -= offset;
        old_fs = get_fs();
        set_fs(KERNEL_DS);      /* the buffer is in kernel space */
        retval = -EIO;
        if (size == file->f_op->write(inode, file, (const char *) page, size))
                retval = 0;
        set_fs(old_fs);
        return retval;
}

static int filemap_write_page(struct vm_area_struct * vma,
        unsigned long offset,
        unsigned long page)
{
        int result;
        struct file file;
        struct inode * inode;
        struct buffer_head * bh;

        bh = mem_map[MAP_NR(page)].buffers;
        if (bh) {
                /* the page is in the buffer cache: just mark the
                   buffers dirty and let them be written out normally */
                struct buffer_head * tmp = bh;
                do {
                        mark_buffer_dirty(tmp, 0);
                        tmp = tmp->b_this_page;
                } while (tmp != bh);
                return 0;
        }

        /* no buffers: write through the filesystem using a local
           dummy struct file */
        inode = vma->vm_inode;
        file.f_op = inode->i_op->default_file_ops;
        if (!file.f_op->write)
                return -EIO;
        file.f_mode = 3;        /* read/write */
        file.f_flags = 0;
        file.f_count = 1;
        file.f_inode = inode;
        file.f_pos = offset;
        file.f_reada = 0;

        down(&inode->i_sem);
        result = do_write_page(inode, &file, (const char *) page, offset);
        up(&inode->i_sem);
        return result;
}

/*
 * Swapping to a shared file: the page is written back to the file it
 * was mapped from rather than to the swap device.  While the write is
 * in flight, a special entry is kept in the page table so that
 * filemap_swapin() can find the page again immediately.
 */
int filemap_swapout(struct vm_area_struct * vma,
        unsigned long offset,
        pte_t *page_table)
{
        int error;
        unsigned long page = pte_page(*page_table);
        unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));

        flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
        set_pte(page_table, __pte(entry));
        flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
        error = filemap_write_page(vma, offset, page);
        if (pte_val(*page_table) == entry)
                pte_clear(page_table);
        return error;
}
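
/*
 * Note that the entry installed above is not a real swap location:
 * SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page)) just hides the page's mem_map
 * index behind a reserved swap type.  That is what lets
 * filemap_swapin() below recover the very same physical page without
 * doing any I/O.
 */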

/*
 * filemap_swapin() is only called for entries that filemap_swapout()
 * created: decode the mem_map index, take another reference on the
 * page, and rebuild a normal pte for it.
 */
static pte_t filemap_swapin(struct vm_area_struct * vma,
        unsigned long offset,
        unsigned long entry)
{
        unsigned long page = SWP_OFFSET(entry);

        mem_map[page].count++;
        page = (page << PAGE_SHIFT) + PAGE_OFFSET;
        return mk_pte(page,vma->vm_page_prot);
}

/*
 * Sync one pte: for a plain msync, write the page back if it is
 * dirty; for MS_INVALIDATE, additionally remove the page from the
 * mapping.
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
        pte_t pte = *ptep;
        unsigned long page;
        int error;

        if (!(flags & MS_INVALIDATE)) {
                if (!pte_present(pte))
                        return 0;
                if (!pte_dirty(pte))
                        return 0;
                flush_page_to_ram(pte_page(pte));
                flush_cache_page(vma, address);
                set_pte(ptep, pte_mkclean(pte));
                flush_tlb_page(vma, address);
                page = pte_page(pte);
                mem_map[MAP_NR(page)].count++;  /* hold the page across the write */
        } else {
                if (pte_none(pte))
                        return 0;
                flush_cache_page(vma, address);
                pte_clear(ptep);
                flush_tlb_page(vma, address);
                if (!pte_present(pte)) {
                        swap_free(pte_val(pte));
                        return 0;
                }
                page = pte_page(pte);
                if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
                        /* clean, or invalidate-only: no write-back needed */
                        free_page(page);
                        return 0;
                }
        }
        error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
        free_page(page);
        return error;
}

static inline int filemap_sync_pte_range(pmd_t * pmd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
        pte_t * pte;
        unsigned long end;
        int error;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        error = 0;
        do {
                error |= filemap_sync_pte(pte, vma, address + offset, flags);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return error;
}

static inline int filemap_sync_pmd_range(pgd_t * pgd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned int flags)
{
        pmd_t * pmd;
        unsigned long offset, end;
        int error;

        if (pgd_none(*pgd))
                return 0;
        if (pgd_bad(*pgd)) {
                printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
                pgd_clear(pgd);
                return 0;
        }
        pmd = pmd_offset(pgd, address);
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        error = 0;
        do {
                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return error;
}

static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
        size_t size, unsigned int flags)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int error = 0;

        dir = pgd_offset(vma->vm_mm, address);
        flush_cache_range(vma->vm_mm, end - size, end);
        while (address < end) {
                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(vma->vm_mm, end - size, end);
        return error;
}
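
/*
 * The three routines above form the usual three-level page-table walk
 * (pgd -> pmd -> pte), each level clipping the address range to its
 * own span.  Error results are OR-ed together, so one failed page
 * write makes the whole sync report an error while every page still
 * gets visited.
 */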

/*
 * This handles (potentially partial) unmapping of a shared mapping:
 * write the dirty pages back to the file before the area disappears.
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
        filemap_sync(vma, start, len, MS_ASYNC);
}

/*
 * Shared mappings need to be able to do the right thing at
 * unmap/sync, and they use the mapped file as backing store for
 * swapping.
 */
static struct vm_operations_struct file_shared_mmap = {
        NULL,                   /* open */
        NULL,                   /* close */
        filemap_unmap,          /* unmap */
        NULL,                   /* protect */
        filemap_sync,           /* sync */
        NULL,                   /* advise */
        filemap_nopage,         /* nopage */
        NULL,                   /* wppage */
        filemap_swapout,        /* swapout */
        filemap_swapin,         /* swapin */
};

/*
 * Private mappings just need to be able to load in the map (and
 * this is used for shared mappings as well, if we know they can
 * never get write permission).
 */
static struct vm_operations_struct file_private_mmap = {
        NULL,                   /* open */
        NULL,                   /* close */
        NULL,                   /* unmap */
        NULL,                   /* protect */
        NULL,                   /* sync */
        NULL,                   /* advise */
        filemap_nopage,         /* nopage */
        NULL,                   /* wppage */
        NULL,                   /* swapout */
        NULL,                   /* swapin */
};

/*
 * Set up a file-backed mapping: writable shared mappings get the full
 * vm_operations (sync/swapout etc), everything else only needs
 * nopage.  Note that the inode checks come first: the private case
 * below dereferences inode->i_sb.
 */
int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
{
        struct vm_operations_struct * ops;

        if (!inode->i_sb || !S_ISREG(inode->i_mode))
                return -EACCES;
        if (!inode->i_op || !inode->i_op->readpage)
                return -ENOEXEC;
        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                ops = &file_shared_mmap;
                /* writable shared mappings can only be supported if
                   the offsets are all page aligned */
                if (vma->vm_offset & (PAGE_SIZE - 1))
                        return -EINVAL;
        } else {
                ops = &file_private_mmap;
                if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
                        return -EINVAL;
        }
        if (!IS_RDONLY(inode)) {
                inode->i_atime = CURRENT_TIME;
                inode->i_dirt = 1;
        }
        vma->vm_inode = inode;
        inode->i_count++;
        vma->vm_ops = ops;
        return 0;
}
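
/*
 * Call-path sketch (assumed, simplified): sys_mmap -> do_mmap ->
 * file->f_op->mmap(inode, file, vma), which is generic_file_mmap()
 * for filesystems that use it.  After it returns, the vma is
 * installed and the first touch of each page faults through
 * vma->vm_ops->nopage, ie filemap_nopage() above.
 */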

/*
 * Sync one msync interval: let the mapping's sync method write the
 * pages, then, for MS_SYNC, wait for the file itself to reach the
 * disk via file_fsync().
 */
static int msync_interval(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int flags)
{
        if (!vma->vm_inode)
                return 0;
        if (vma->vm_ops->sync) {
                int error;
                error = vma->vm_ops->sync(vma, start, end-start, flags);
                if (error)
                        return error;
                if (flags & MS_SYNC)
                        return file_fsync(vma->vm_inode, NULL);
                return 0;
        }
        return 0;
}

asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
        unsigned long end;
        struct vm_area_struct * vma;
        int unmapped_error, error;

        if (start & ~PAGE_MASK)
                return -EINVAL;
        len = (len + ~PAGE_MASK) & PAGE_MASK;   /* round the length up to a page multiple */
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                return -EINVAL;
        if (end == start)
                return 0;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -EFAULT at the end.
         */
        vma = find_vma(current, start);
        unmapped_error = 0;
        for (;;) {
                /* Still start < end. */
                if (!vma)
                        return -EFAULT;
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -EFAULT;
                        start = vma->vm_start;
                }
                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                        if (start < end) {
                                error = msync_interval(vma, start, end, flags);
                                if (error)
                                        return error;
                        }
                        return unmapped_error;
                }
                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = msync_interval(vma, start, vma->vm_end, flags);
                if (error)
                        return error;
                start = vma->vm_end;
                vma = vma->vm_next;
        }
}
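
/*
 * Illustrative user-space use (a sketch, not kernel code): flush a
 * modification made through a shared file mapping back to the file:
 *
 *	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *	map[0] = 'x';
 *	msync(map, len, MS_SYNC);	wait until the data is written
 *
 * MS_ASYNC starts the write-back without waiting for it, and
 * MS_INVALIDATE additionally drops the mapped pages (clearing the
 * page-table entries) so later references fault them in afresh.
 */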