Merge master.kernel.org:/home/rmk/linux-2.6-arm
[linux-2.6] / fs / xfs / linux-2.6 / xfs_aops.c
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_bit.h"
20 #include "xfs_log.h"
21 #include "xfs_inum.h"
22 #include "xfs_sb.h"
23 #include "xfs_ag.h"
24 #include "xfs_dir.h"
25 #include "xfs_dir2.h"
26 #include "xfs_trans.h"
27 #include "xfs_dmapi.h"
28 #include "xfs_mount.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_alloc_btree.h"
31 #include "xfs_ialloc_btree.h"
32 #include "xfs_dir_sf.h"
33 #include "xfs_dir2_sf.h"
34 #include "xfs_attr_sf.h"
35 #include "xfs_dinode.h"
36 #include "xfs_inode.h"
37 #include "xfs_alloc.h"
38 #include "xfs_btree.h"
39 #include "xfs_error.h"
40 #include "xfs_rw.h"
41 #include "xfs_iomap.h"
42 #include <linux/mpage.h>
43 #include <linux/writeback.h>
44
45 STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
46 STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *,
47                 struct writeback_control *wbc, void *, int, int);
48
#if defined(XFS_RW_TRACE)
/*
 * Log one entry in the inode's i_rwtrace buffer describing @page.
 *
 * The entry holds @tag and @mask, the XFS inode, Linux inode and page
 * pointers, and three 64-bit quantities split into high/low 32-bit
 * halves: ip->i_d.di_size, the Linux inode size and the byte offset of
 * the page within the file.  If the page has buffers the three counters
 * filled in by xfs_count_page_state() are logged as well; otherwise
 * those slots carry -1.  Nothing is logged when the inode has no
 * i_rwtrace buffer.
 */
void
xfs_page_trace(
	int		tag,
	struct inode	*inode,
	struct page	*page,
	int		mask)
{
	vnode_t		*vp = LINVFS_GET_VP(inode);
	loff_t		isize = i_size_read(inode);
	loff_t		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int		delalloc = -1, unmapped = -1, unwritten = -1;
	bhv_desc_t	*bdp;
	xfs_inode_t	*ip;
	unsigned long	dsize_hi, dsize_lo;
	unsigned long	isize_hi, isize_lo;
	unsigned long	off_hi, off_lo;

	if (page_has_buffers(page))
		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);

	bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
	ip = XFS_BHVTOI(bdp);
	if (!ip->i_rwtrace)
		return;

	/* Split each 64-bit value so it fits pointer-sized trace slots. */
	dsize_hi = (unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff);
	dsize_lo = (unsigned long)(ip->i_d.di_size & 0xffffffff);
	isize_hi = (unsigned long)((isize >> 32) & 0xffffffff);
	isize_lo = (unsigned long)(isize & 0xffffffff);
	off_hi = (unsigned long)((offset >> 32) & 0xffffffff);
	off_lo = (unsigned long)(offset & 0xffffffff);

	ktrace_enter(ip->i_rwtrace,
		(void *)((unsigned long)tag),
		(void *)ip,
		(void *)inode,
		(void *)page,
		(void *)((unsigned long)mask),
		(void *)dsize_hi,
		(void *)dsize_lo,
		(void *)isize_hi,
		(void *)isize_lo,
		(void *)off_hi,
		(void *)off_lo,
		(void *)((unsigned long)delalloc),
		(void *)((unsigned long)unmapped),
		(void *)((unsigned long)unwritten),
		(void *)NULL,
		(void *)NULL);
}
#else
/* Tracing compiled out: the call expands to nothing. */
#define xfs_page_trace(tag, inode, page, mask)
#endif
93
94 /*
95  * Schedule IO completion handling on a xfsdatad if this was
96  * the final hold on this ioend.
97  */
98 STATIC void
99 xfs_finish_ioend(
100         xfs_ioend_t             *ioend)
101 {
102         if (atomic_dec_and_test(&ioend->io_remaining))
103                 queue_work(xfsdatad_workqueue, &ioend->io_work);
104 }
105
106 STATIC void
107 xfs_destroy_ioend(
108         xfs_ioend_t             *ioend)
109 {
110         vn_iowake(ioend->io_vnode);
111         mempool_free(ioend, xfs_ioend_pool);
112 }
113
114 /*
115  * Issue transactions to convert a buffer range from unwritten
116  * to written extents.
117  */
118 STATIC void
119 xfs_end_bio_unwritten(
120         void                    *data)
121 {
122         xfs_ioend_t             *ioend = data;
123         vnode_t                 *vp = ioend->io_vnode;
124         xfs_off_t               offset = ioend->io_offset;
125         size_t                  size = ioend->io_size;
126         struct buffer_head      *bh, *next;
127         int                     error;
128
129         if (ioend->io_uptodate)
130                 VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
131
132         /* ioend->io_buffer_head is only non-NULL for buffered I/O */
133         for (bh = ioend->io_buffer_head; bh; bh = next) {
134                 next = bh->b_private;
135
136                 bh->b_end_io = NULL;
137                 clear_buffer_unwritten(bh);
138                 end_buffer_async_write(bh, ioend->io_uptodate);
139         }
140
141         xfs_destroy_ioend(ioend);
142 }
143
144 /*
145  * Allocate and initialise an IO completion structure.
146  * We need to track unwritten extent write completion here initially.
147  * We'll need to extend this for updating the ondisk inode size later
148  * (vs. incore size).
149  */
150 STATIC xfs_ioend_t *
151 xfs_alloc_ioend(
152         struct inode            *inode)
153 {
154         xfs_ioend_t             *ioend;
155
156         ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
157
158         /*
159          * Set the count to 1 initially, which will prevent an I/O
160          * completion callback from happening before we have started
161          * all the I/O from calling the completion routine too early.
162          */
163         atomic_set(&ioend->io_remaining, 1);
164         ioend->io_uptodate = 1; /* cleared if any I/O fails */
165         ioend->io_vnode = LINVFS_GET_VP(inode);
166         ioend->io_buffer_head = NULL;
167         atomic_inc(&ioend->io_vnode->v_iocount);
168         ioend->io_offset = 0;
169         ioend->io_size = 0;
170
171         INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
172
173         return ioend;
174 }
175
176 void
177 linvfs_unwritten_done(
178         struct buffer_head      *bh,
179         int                     uptodate)
180 {
181         xfs_ioend_t             *ioend = bh->b_private;
182         static spinlock_t       unwritten_done_lock = SPIN_LOCK_UNLOCKED;
183         unsigned long           flags;
184
185         ASSERT(buffer_unwritten(bh));
186         bh->b_end_io = NULL;
187
188         if (!uptodate)
189                 ioend->io_uptodate = 0;
190
191         /*
192          * Deep magic here.  We reuse b_private in the buffer_heads to build
193          * a chain for completing the I/O from user context after we've issued
194          * a transaction to convert the unwritten extent.
195          */
196         spin_lock_irqsave(&unwritten_done_lock, flags);
197         bh->b_private = ioend->io_buffer_head;
198         ioend->io_buffer_head = bh;
199         spin_unlock_irqrestore(&unwritten_done_lock, flags);
200
201         xfs_finish_ioend(ioend);
202 }
203
204 STATIC int
205 xfs_map_blocks(
206         struct inode            *inode,
207         loff_t                  offset,
208         ssize_t                 count,
209         xfs_iomap_t             *mapp,
210         int                     flags)
211 {
212         vnode_t                 *vp = LINVFS_GET_VP(inode);
213         int                     error, nmaps = 1;
214
215         VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
216         if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
217                 VMODIFY(vp);
218         return -error;
219 }
220
221 /*
222  * Finds the corresponding mapping in block @map array of the
223  * given @offset within a @page.
224  */
225 STATIC xfs_iomap_t *
226 xfs_offset_to_map(
227         struct page             *page,
228         xfs_iomap_t             *iomapp,
229         unsigned long           offset)
230 {
231         loff_t                  full_offset;    /* offset from start of file */
232
233         ASSERT(offset < PAGE_CACHE_SIZE);
234
235         full_offset = page->index;              /* NB: using 64bit number */
236         full_offset <<= PAGE_CACHE_SHIFT;       /* offset from file start */
237         full_offset += offset;                  /* offset from page start */
238
239         if (full_offset < iomapp->iomap_offset)
240                 return NULL;
241         if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset)
242                 return iomapp;
243         return NULL;
244 }
245
246 STATIC void
247 xfs_map_at_offset(
248         struct page             *page,
249         struct buffer_head      *bh,
250         unsigned long           offset,
251         int                     block_bits,
252         xfs_iomap_t             *iomapp)
253 {
254         xfs_daddr_t             bn;
255         loff_t                  delta;
256         int                     sector_shift;
257
258         ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
259         ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
260         ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);
261
262         delta = page->index;
263         delta <<= PAGE_CACHE_SHIFT;
264         delta += offset;
265         delta -= iomapp->iomap_offset;
266         delta >>= block_bits;
267
268         sector_shift = block_bits - BBSHIFT;
269         bn = iomapp->iomap_bn >> sector_shift;
270         bn += delta;
271         BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME));
272         ASSERT((bn << sector_shift) >= iomapp->iomap_bn);
273
274         lock_buffer(bh);
275         bh->b_blocknr = bn;
276         bh->b_bdev = iomapp->iomap_target->pbr_bdev;
277         set_buffer_mapped(bh);
278         clear_buffer_delay(bh);
279 }
280
281 /*
282  * Look for a page at index which is unlocked and contains our
283  * unwritten extent flagged buffers at its head.  Returns page
284  * locked and with an extra reference count, and length of the
285  * unwritten extent component on this page that we can write,
286  * in units of filesystem blocks.
287  */
288 STATIC struct page *
289 xfs_probe_unwritten_page(
290         struct address_space    *mapping,
291         pgoff_t                 index,
292         xfs_iomap_t             *iomapp,
293         xfs_ioend_t             *ioend,
294         unsigned long           max_offset,
295         unsigned long           *fsbs,
296         unsigned int            bbits)
297 {
298         struct page             *page;
299
300         page = find_trylock_page(mapping, index);
301         if (!page)
302                 return NULL;
303         if (PageWriteback(page))
304                 goto out;
305
306         if (page->mapping && page_has_buffers(page)) {
307                 struct buffer_head      *bh, *head;
308                 unsigned long           p_offset = 0;
309
310                 *fsbs = 0;
311                 bh = head = page_buffers(page);
312                 do {
313                         if (!buffer_unwritten(bh) || !buffer_uptodate(bh))
314                                 break;
315                         if (!xfs_offset_to_map(page, iomapp, p_offset))
316                                 break;
317                         if (p_offset >= max_offset)
318                                 break;
319                         xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
320                         set_buffer_unwritten_io(bh);
321                         bh->b_private = ioend;
322                         p_offset += bh->b_size;
323                         (*fsbs)++;
324                 } while ((bh = bh->b_this_page) != head);
325
326                 if (p_offset)
327                         return page;
328         }
329
330 out:
331         unlock_page(page);
332         return NULL;
333 }
334
335 /*
336  * Look for a page at index which is unlocked and not mapped
337  * yet - clustering for mmap write case.
338  */
339 STATIC unsigned int
340 xfs_probe_unmapped_page(
341         struct address_space    *mapping,
342         pgoff_t                 index,
343         unsigned int            pg_offset)
344 {
345         struct page             *page;
346         int                     ret = 0;
347
348         page = find_trylock_page(mapping, index);
349         if (!page)
350                 return 0;
351         if (PageWriteback(page))
352                 goto out;
353
354         if (page->mapping && PageDirty(page)) {
355                 if (page_has_buffers(page)) {
356                         struct buffer_head      *bh, *head;
357
358                         bh = head = page_buffers(page);
359                         do {
360                                 if (buffer_mapped(bh) || !buffer_uptodate(bh))
361                                         break;
362                                 ret += bh->b_size;
363                                 if (ret >= pg_offset)
364                                         break;
365                         } while ((bh = bh->b_this_page) != head);
366                 } else
367                         ret = PAGE_CACHE_SIZE;
368         }
369
370 out:
371         unlock_page(page);
372         return ret;
373 }
374
375 STATIC unsigned int
376 xfs_probe_unmapped_cluster(
377         struct inode            *inode,
378         struct page             *startpage,
379         struct buffer_head      *bh,
380         struct buffer_head      *head)
381 {
382         pgoff_t                 tindex, tlast, tloff;
383         unsigned int            pg_offset, len, total = 0;
384         struct address_space    *mapping = inode->i_mapping;
385
386         /* First sum forwards in this page */
387         do {
388                 if (buffer_mapped(bh))
389                         break;
390                 total += bh->b_size;
391         } while ((bh = bh->b_this_page) != head);
392
393         /* If we reached the end of the page, sum forwards in
394          * following pages.
395          */
396         if (bh == head) {
397                 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
398                 /* Prune this back to avoid pathological behavior */
399                 tloff = min(tlast, startpage->index + 64);
400                 for (tindex = startpage->index + 1; tindex < tloff; tindex++) {
401                         len = xfs_probe_unmapped_page(mapping, tindex,
402                                                         PAGE_CACHE_SIZE);
403                         if (!len)
404                                 return total;
405                         total += len;
406                 }
407                 if (tindex == tlast &&
408                     (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
409                         total += xfs_probe_unmapped_page(mapping,
410                                                         tindex, pg_offset);
411                 }
412         }
413         return total;
414 }
415
416 /*
417  * Probe for a given page (index) in the inode and test if it is delayed
418  * and without unwritten buffers.  Returns page locked and with an extra
419  * reference count.
420  */
421 STATIC struct page *
422 xfs_probe_delalloc_page(
423         struct inode            *inode,
424         pgoff_t                 index)
425 {
426         struct page             *page;
427
428         page = find_trylock_page(inode->i_mapping, index);
429         if (!page)
430                 return NULL;
431         if (PageWriteback(page))
432                 goto out;
433
434         if (page->mapping && page_has_buffers(page)) {
435                 struct buffer_head      *bh, *head;
436                 int                     acceptable = 0;
437
438                 bh = head = page_buffers(page);
439                 do {
440                         if (buffer_unwritten(bh)) {
441                                 acceptable = 0;
442                                 break;
443                         } else if (buffer_delay(bh)) {
444                                 acceptable = 1;
445                         }
446                 } while ((bh = bh->b_this_page) != head);
447
448                 if (acceptable)
449                         return page;
450         }
451
452 out:
453         unlock_page(page);
454         return NULL;
455 }
456
457 STATIC int
458 xfs_map_unwritten(
459         struct inode            *inode,
460         struct page             *start_page,
461         struct buffer_head      *head,
462         struct buffer_head      *curr,
463         unsigned long           p_offset,
464         int                     block_bits,
465         xfs_iomap_t             *iomapp,
466         struct writeback_control *wbc,
467         int                     startio,
468         int                     all_bh)
469 {
470         struct buffer_head      *bh = curr;
471         xfs_iomap_t             *tmp;
472         xfs_ioend_t             *ioend;
473         loff_t                  offset;
474         unsigned long           nblocks = 0;
475
476         offset = start_page->index;
477         offset <<= PAGE_CACHE_SHIFT;
478         offset += p_offset;
479
480         ioend = xfs_alloc_ioend(inode);
481
482         /* First map forwards in the page consecutive buffers
483          * covering this unwritten extent
484          */
485         do {
486                 if (!buffer_unwritten(bh))
487                         break;
488                 tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
489                 if (!tmp)
490                         break;
491                 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
492                 set_buffer_unwritten_io(bh);
493                 bh->b_private = ioend;
494                 p_offset += bh->b_size;
495                 nblocks++;
496         } while ((bh = bh->b_this_page) != head);
497
498         atomic_add(nblocks, &ioend->io_remaining);
499
500         /* If we reached the end of the page, map forwards in any
501          * following pages which are also covered by this extent.
502          */
503         if (bh == head) {
504                 struct address_space    *mapping = inode->i_mapping;
505                 pgoff_t                 tindex, tloff, tlast;
506                 unsigned long           bs;
507                 unsigned int            pg_offset, bbits = inode->i_blkbits;
508                 struct page             *page;
509
510                 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
511                 tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
512                 tloff = min(tlast, tloff);
513                 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
514                         page = xfs_probe_unwritten_page(mapping,
515                                                 tindex, iomapp, ioend,
516                                                 PAGE_CACHE_SIZE, &bs, bbits);
517                         if (!page)
518                                 break;
519                         nblocks += bs;
520                         atomic_add(bs, &ioend->io_remaining);
521                         xfs_convert_page(inode, page, iomapp, wbc, ioend,
522                                                         startio, all_bh);
523                         /* stop if converting the next page might add
524                          * enough blocks that the corresponding byte
525                          * count won't fit in our ulong page buf length */
526                         if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
527                                 goto enough;
528                 }
529
530                 if (tindex == tlast &&
531                     (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
532                         page = xfs_probe_unwritten_page(mapping,
533                                                         tindex, iomapp, ioend,
534                                                         pg_offset, &bs, bbits);
535                         if (page) {
536                                 nblocks += bs;
537                                 atomic_add(bs, &ioend->io_remaining);
538                                 xfs_convert_page(inode, page, iomapp, wbc, ioend,
539                                                         startio, all_bh);
540                                 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
541                                         goto enough;
542                         }
543                 }
544         }
545
546 enough:
547         ioend->io_size = (xfs_off_t)nblocks << block_bits;
548         ioend->io_offset = offset;
549         xfs_finish_ioend(ioend);
550         return 0;
551 }
552
553 STATIC void
554 xfs_submit_page(
555         struct page             *page,
556         struct writeback_control *wbc,
557         struct buffer_head      *bh_arr[],
558         int                     bh_count,
559         int                     probed_page,
560         int                     clear_dirty)
561 {
562         struct buffer_head      *bh;
563         int                     i;
564
565         BUG_ON(PageWriteback(page));
566         if (bh_count)
567                 set_page_writeback(page);
568         if (clear_dirty)
569                 clear_page_dirty(page);
570         unlock_page(page);
571
572         if (bh_count) {
573                 for (i = 0; i < bh_count; i++) {
574                         bh = bh_arr[i];
575                         mark_buffer_async_write(bh);
576                         if (buffer_unwritten(bh))
577                                 set_buffer_unwritten_io(bh);
578                         set_buffer_uptodate(bh);
579                         clear_buffer_dirty(bh);
580                 }
581
582                 for (i = 0; i < bh_count; i++)
583                         submit_bh(WRITE, bh_arr[i]);
584
585                 if (probed_page && clear_dirty)
586                         wbc->nr_to_write--;     /* Wrote an "extra" page */
587         }
588 }
589
590 /*
591  * Allocate & map buffers for page given the extent map. Write it out.
592  * except for the original page of a writepage, this is called on
593  * delalloc/unwritten pages only, for the original page it is possible
594  * that the page has no mapping at all.
595  */
596 STATIC void
597 xfs_convert_page(
598         struct inode            *inode,
599         struct page             *page,
600         xfs_iomap_t             *iomapp,
601         struct writeback_control *wbc,
602         void                    *private,
603         int                     startio,
604         int                     all_bh)
605 {
606         struct buffer_head      *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
607         xfs_iomap_t             *mp = iomapp, *tmp;
608         unsigned long           offset, end_offset;
609         int                     index = 0;
610         int                     bbits = inode->i_blkbits;
611         int                     len, page_dirty;
612
613         end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1));
614
615         /*
616          * page_dirty is initially a count of buffers on the page before
617          * EOF and is decrememted as we move each into a cleanable state.
618          */
619         len = 1 << inode->i_blkbits;
620         end_offset = max(end_offset, PAGE_CACHE_SIZE);
621         end_offset = roundup(end_offset, len);
622         page_dirty = end_offset / len;
623
624         offset = 0;
625         bh = head = page_buffers(page);
626         do {
627                 if (offset >= end_offset)
628                         break;
629                 if (!(PageUptodate(page) || buffer_uptodate(bh)))
630                         continue;
631                 if (buffer_mapped(bh) && all_bh &&
632                     !(buffer_unwritten(bh) || buffer_delay(bh))) {
633                         if (startio) {
634                                 lock_buffer(bh);
635                                 bh_arr[index++] = bh;
636                                 page_dirty--;
637                         }
638                         continue;
639                 }
640                 tmp = xfs_offset_to_map(page, mp, offset);
641                 if (!tmp)
642                         continue;
643                 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
644                 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));
645
646                 /* If this is a new unwritten extent buffer (i.e. one
647                  * that we haven't passed in private data for, we must
648                  * now map this buffer too.
649                  */
650                 if (buffer_unwritten(bh) && !bh->b_end_io) {
651                         ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
652                         xfs_map_unwritten(inode, page, head, bh, offset,
653                                         bbits, tmp, wbc, startio, all_bh);
654                 } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
655                         xfs_map_at_offset(page, bh, offset, bbits, tmp);
656                         if (buffer_unwritten(bh)) {
657                                 set_buffer_unwritten_io(bh);
658                                 bh->b_private = private;
659                                 ASSERT(private);
660                         }
661                 }
662                 if (startio) {
663                         bh_arr[index++] = bh;
664                 } else {
665                         set_buffer_dirty(bh);
666                         unlock_buffer(bh);
667                         mark_buffer_dirty(bh);
668                 }
669                 page_dirty--;
670         } while (offset += len, (bh = bh->b_this_page) != head);
671
672         if (startio && index) {
673                 xfs_submit_page(page, wbc, bh_arr, index, 1, !page_dirty);
674         } else {
675                 unlock_page(page);
676         }
677 }
678
679 /*
680  * Convert & write out a cluster of pages in the same extent as defined
681  * by mp and following the start page.
682  */
683 STATIC void
684 xfs_cluster_write(
685         struct inode            *inode,
686         pgoff_t                 tindex,
687         xfs_iomap_t             *iomapp,
688         struct writeback_control *wbc,
689         int                     startio,
690         int                     all_bh,
691         pgoff_t                 tlast)
692 {
693         struct page             *page;
694
695         for (; tindex <= tlast; tindex++) {
696                 page = xfs_probe_delalloc_page(inode, tindex);
697                 if (!page)
698                         break;
699                 xfs_convert_page(inode, page, iomapp, wbc, NULL,
700                                 startio, all_bh);
701         }
702 }
703
704 /*
705  * Calling this without startio set means we are being asked to make a dirty
706  * page ready for freeing its buffers.  When called with startio set then
707  * we are coming from writepage.
708  *
709  * When called with startio set it is important that we write the WHOLE
710  * page if possible.
711  * The bh->b_state's cannot know if any of the blocks or which block for
712  * that matter are dirty due to mmap writes, and therefore bh uptodate is
713  * only valid if the page itself isn't completely uptodate.  Some layers
714  * may clear the page dirty flag prior to calling write page, under the
715  * assumption the entire page will be written out; by not writing out the
716  * whole page the page can be reused before all valid dirty data is
717  * written out.  Note: in the case of a page that has been dirtied by
718  * an mmap write but only partially set up by block_prepare_write the
719  * bh->b_state's will not agree and only the ones set up by BPW/BCW will
720  * have valid state, thus the whole page must be written out.
721  */
722
723 STATIC int
724 xfs_page_state_convert(
725         struct inode    *inode,
726         struct page     *page,
727         struct writeback_control *wbc,
728         int             startio,
729         int             unmapped) /* also implies page uptodate */
730 {
731         struct buffer_head      *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
732         xfs_iomap_t             *iomp, iomap;
733         loff_t                  offset;
734         unsigned long           p_offset = 0;
735         __uint64_t              end_offset;
736         pgoff_t                 end_index, last_index, tlast;
737         int                     len, err, i, cnt = 0, uptodate = 1;
738         int                     flags;
739         int                     page_dirty;
740
741         /* wait for other IO threads? */
742         flags = (startio && wbc->sync_mode != WB_SYNC_NONE) ? 0 : BMAPI_TRYLOCK;
743
744         /* Is this page beyond the end of the file? */
745         offset = i_size_read(inode);
746         end_index = offset >> PAGE_CACHE_SHIFT;
747         last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
748         if (page->index >= end_index) {
749                 if ((page->index >= end_index + 1) ||
750                     !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
751                         if (startio)
752                                 unlock_page(page);
753                         return 0;
754                 }
755         }
756
757         end_offset = min_t(unsigned long long,
758                         (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
759         offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
760
761         /*
762          * page_dirty is initially a count of buffers on the page before
763          * EOF and is decrememted as we move each into a cleanable state.
764          */
765         len = 1 << inode->i_blkbits;
766         p_offset = max(p_offset, PAGE_CACHE_SIZE);
767         p_offset = roundup(p_offset, len);
768         page_dirty = p_offset / len;
769
770         iomp = NULL;
771         p_offset = 0;
772         bh = head = page_buffers(page);
773
774         do {
775                 if (offset >= end_offset)
776                         break;
777                 if (!buffer_uptodate(bh))
778                         uptodate = 0;
779                 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio)
780                         continue;
781
782                 if (iomp) {
783                         iomp = xfs_offset_to_map(page, &iomap, p_offset);
784                 }
785
786                 /*
787                  * First case, map an unwritten extent and prepare for
788                  * extent state conversion transaction on completion.
789                  */
790                 if (buffer_unwritten(bh)) {
791                         if (!startio)
792                                 continue;
793                         if (!iomp) {
794                                 err = xfs_map_blocks(inode, offset, len, &iomap,
795                                                 BMAPI_WRITE|BMAPI_IGNSTATE);
796                                 if (err) {
797                                         goto error;
798                                 }
799                                 iomp = xfs_offset_to_map(page, &iomap,
800                                                                 p_offset);
801                         }
802                         if (iomp) {
803                                 if (!bh->b_end_io) {
804                                         err = xfs_map_unwritten(inode, page,
805                                                         head, bh, p_offset,
806                                                         inode->i_blkbits, iomp,
807                                                         wbc, startio, unmapped);
808                                         if (err) {
809                                                 goto error;
810                                         }
811                                 } else {
812                                         set_bit(BH_Lock, &bh->b_state);
813                                 }
814                                 BUG_ON(!buffer_locked(bh));
815                                 bh_arr[cnt++] = bh;
816                                 page_dirty--;
817                         }
818                 /*
819                  * Second case, allocate space for a delalloc buffer.
820                  * We can return EAGAIN here in the release page case.
821                  */
822                 } else if (buffer_delay(bh)) {
823                         if (!iomp) {
824                                 err = xfs_map_blocks(inode, offset, len, &iomap,
825                                                 BMAPI_ALLOCATE | flags);
826                                 if (err) {
827                                         goto error;
828                                 }
829                                 iomp = xfs_offset_to_map(page, &iomap,
830                                                                 p_offset);
831                         }
832                         if (iomp) {
833                                 xfs_map_at_offset(page, bh, p_offset,
834                                                 inode->i_blkbits, iomp);
835                                 if (startio) {
836                                         bh_arr[cnt++] = bh;
837                                 } else {
838                                         set_buffer_dirty(bh);
839                                         unlock_buffer(bh);
840                                         mark_buffer_dirty(bh);
841                                 }
842                                 page_dirty--;
843                         }
844                 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
845                            (unmapped || startio)) {
846
847                         if (!buffer_mapped(bh)) {
848                                 int     size;
849
850                                 /*
851                                  * Getting here implies an unmapped buffer
852                                  * was found, and we are in a path where we
853                                  * need to write the whole page out.
854                                  */
855                                 if (!iomp) {
856                                         size = xfs_probe_unmapped_cluster(
857                                                         inode, page, bh, head);
858                                         err = xfs_map_blocks(inode, offset,
859                                                         size, &iomap,
860                                                         BMAPI_WRITE|BMAPI_MMAP);
861                                         if (err) {
862                                                 goto error;
863                                         }
864                                         iomp = xfs_offset_to_map(page, &iomap,
865                                                                      p_offset);
866                                 }
867                                 if (iomp) {
868                                         xfs_map_at_offset(page,
869                                                         bh, p_offset,
870                                                         inode->i_blkbits, iomp);
871                                         if (startio) {
872                                                 bh_arr[cnt++] = bh;
873                                         } else {
874                                                 set_buffer_dirty(bh);
875                                                 unlock_buffer(bh);
876                                                 mark_buffer_dirty(bh);
877                                         }
878                                         page_dirty--;
879                                 }
880                         } else if (startio) {
881                                 if (buffer_uptodate(bh) &&
882                                     !test_and_set_bit(BH_Lock, &bh->b_state)) {
883                                         bh_arr[cnt++] = bh;
884                                         page_dirty--;
885                                 }
886                         }
887                 }
888         } while (offset += len, p_offset += len,
889                 ((bh = bh->b_this_page) != head));
890
891         if (uptodate && bh == head)
892                 SetPageUptodate(page);
893
894         if (startio) {
895                 xfs_submit_page(page, wbc, bh_arr, cnt, 0, !page_dirty);
896         }
897
898         if (iomp) {
899                 offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >>
900                                         PAGE_CACHE_SHIFT;
901                 tlast = min_t(pgoff_t, offset, last_index);
902                 xfs_cluster_write(inode, page->index + 1, iomp, wbc,
903                                         startio, unmapped, tlast);
904         }
905
906         return page_dirty;
907
908 error:
909         for (i = 0; i < cnt; i++) {
910                 unlock_buffer(bh_arr[i]);
911         }
912
913         /*
914          * If it's delalloc and we have nowhere to put it,
915          * throw it away, unless the lower layers told
916          * us to try again.
917          */
918         if (err != -EAGAIN) {
919                 if (!unmapped) {
920                         block_invalidatepage(page, 0);
921                 }
922                 ClearPageUptodate(page);
923         }
924         return err;
925 }
926
927 STATIC int
928 __linvfs_get_block(
929         struct inode            *inode,
930         sector_t                iblock,
931         unsigned long           blocks,
932         struct buffer_head      *bh_result,
933         int                     create,
934         int                     direct,
935         bmapi_flags_t           flags)
936 {
937         vnode_t                 *vp = LINVFS_GET_VP(inode);
938         xfs_iomap_t             iomap;
939         xfs_off_t               offset;
940         ssize_t                 size;
941         int                     retpbbm = 1;
942         int                     error;
943
944         offset = (xfs_off_t)iblock << inode->i_blkbits;
945         if (blocks)
946                 size = (ssize_t) min_t(xfs_off_t, LONG_MAX,
947                                         (xfs_off_t)blocks << inode->i_blkbits);
948         else
949                 size = 1 << inode->i_blkbits;
950
951         VOP_BMAP(vp, offset, size,
952                 create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
953         if (error)
954                 return -error;
955
956         if (retpbbm == 0)
957                 return 0;
958
959         if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
960                 xfs_daddr_t     bn;
961                 xfs_off_t       delta;
962
963                 /* For unwritten extents do not report a disk address on
964                  * the read case (treat as if we're reading into a hole).
965                  */
966                 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
967                         delta = offset - iomap.iomap_offset;
968                         delta >>= inode->i_blkbits;
969
970                         bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
971                         bn += delta;
972                         BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME));
973                         bh_result->b_blocknr = bn;
974                         set_buffer_mapped(bh_result);
975                 }
976                 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
977                         if (direct)
978                                 bh_result->b_private = inode;
979                         set_buffer_unwritten(bh_result);
980                         set_buffer_delay(bh_result);
981                 }
982         }
983
984         /* If this is a realtime file, data might be on a new device */
985         bh_result->b_bdev = iomap.iomap_target->pbr_bdev;
986
987         /* If we previously allocated a block out beyond eof and
988          * we are now coming back to use it then we will need to
989          * flag it as new even if it has a disk address.
990          */
991         if (create &&
992             ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
993              (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW)))
994                 set_buffer_new(bh_result);
995
996         if (iomap.iomap_flags & IOMAP_DELAY) {
997                 BUG_ON(direct);
998                 if (create) {
999                         set_buffer_uptodate(bh_result);
1000                         set_buffer_mapped(bh_result);
1001                         set_buffer_delay(bh_result);
1002                 }
1003         }
1004
1005         if (blocks) {
1006                 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
1007                 offset = min_t(xfs_off_t,
1008                                 iomap.iomap_bsize - iomap.iomap_delta,
1009                                 (xfs_off_t)blocks << inode->i_blkbits);
1010                 bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset);
1011         }
1012
1013         return 0;
1014 }
1015
1016 int
1017 linvfs_get_block(
1018         struct inode            *inode,
1019         sector_t                iblock,
1020         struct buffer_head      *bh_result,
1021         int                     create)
1022 {
1023         return __linvfs_get_block(inode, iblock, 0, bh_result,
1024                                         create, 0, BMAPI_WRITE);
1025 }
1026
1027 STATIC int
1028 linvfs_get_blocks_direct(
1029         struct inode            *inode,
1030         sector_t                iblock,
1031         unsigned long           max_blocks,
1032         struct buffer_head      *bh_result,
1033         int                     create)
1034 {
1035         return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
1036                                         create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1037 }
1038
1039 STATIC void
1040 linvfs_end_io_direct(
1041         struct kiocb    *iocb,
1042         loff_t          offset,
1043         ssize_t         size,
1044         void            *private)
1045 {
1046         xfs_ioend_t     *ioend = iocb->private;
1047
1048         /*
1049          * Non-NULL private data means we need to issue a transaction to
1050          * convert a range from unwritten to written extents.  This needs
1051          * to happen from process contect but aio+dio I/O completion
1052          * happens from irq context so we need to defer it to a workqueue.
1053          * This is not nessecary for synchronous direct I/O, but we do
1054          * it anyway to keep the code uniform and simpler.
1055          *
1056          * The core direct I/O code might be changed to always call the
1057          * completion handler in the future, in which case all this can
1058          * go away.
1059          */
1060         if (private && size > 0) {
1061                 ioend->io_offset = offset;
1062                 ioend->io_size = size;
1063                 xfs_finish_ioend(ioend);
1064         } else {
1065                 ASSERT(size >= 0);
1066                 xfs_destroy_ioend(ioend);
1067         }
1068
1069         /*
1070          * blockdev_direct_IO can return an error even afer the I/O
1071          * completion handler was called.  Thus we need to protect
1072          * against double-freeing.
1073          */
1074         iocb->private = NULL;
1075 }
1076
1077 STATIC ssize_t
1078 linvfs_direct_IO(
1079         int                     rw,
1080         struct kiocb            *iocb,
1081         const struct iovec      *iov,
1082         loff_t                  offset,
1083         unsigned long           nr_segs)
1084 {
1085         struct file     *file = iocb->ki_filp;
1086         struct inode    *inode = file->f_mapping->host;
1087         vnode_t         *vp = LINVFS_GET_VP(inode);
1088         xfs_iomap_t     iomap;
1089         int             maps = 1;
1090         int             error;
1091         ssize_t         ret;
1092
1093         VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
1094         if (error)
1095                 return -error;
1096
1097         iocb->private = xfs_alloc_ioend(inode);
1098
1099         ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1100                 iomap.iomap_target->pbr_bdev,
1101                 iov, offset, nr_segs,
1102                 linvfs_get_blocks_direct,
1103                 linvfs_end_io_direct);
1104
1105         if (unlikely(ret <= 0 && iocb->private))
1106                 xfs_destroy_ioend(iocb->private);
1107         return ret;
1108 }
1109
1110
1111 STATIC sector_t
1112 linvfs_bmap(
1113         struct address_space    *mapping,
1114         sector_t                block)
1115 {
1116         struct inode            *inode = (struct inode *)mapping->host;
1117         vnode_t                 *vp = LINVFS_GET_VP(inode);
1118         int                     error;
1119
1120         vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
1121
1122         VOP_RWLOCK(vp, VRWLOCK_READ);
1123         VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
1124         VOP_RWUNLOCK(vp, VRWLOCK_READ);
1125         return generic_block_bmap(mapping, block, linvfs_get_block);
1126 }
1127
1128 STATIC int
1129 linvfs_readpage(
1130         struct file             *unused,
1131         struct page             *page)
1132 {
1133         return mpage_readpage(page, linvfs_get_block);
1134 }
1135
1136 STATIC int
1137 linvfs_readpages(
1138         struct file             *unused,
1139         struct address_space    *mapping,
1140         struct list_head        *pages,
1141         unsigned                nr_pages)
1142 {
1143         return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
1144 }
1145
1146 STATIC void
1147 xfs_count_page_state(
1148         struct page             *page,
1149         int                     *delalloc,
1150         int                     *unmapped,
1151         int                     *unwritten)
1152 {
1153         struct buffer_head      *bh, *head;
1154
1155         *delalloc = *unmapped = *unwritten = 0;
1156
1157         bh = head = page_buffers(page);
1158         do {
1159                 if (buffer_uptodate(bh) && !buffer_mapped(bh))
1160                         (*unmapped) = 1;
1161                 else if (buffer_unwritten(bh) && !buffer_delay(bh))
1162                         clear_buffer_unwritten(bh);
1163                 else if (buffer_unwritten(bh))
1164                         (*unwritten) = 1;
1165                 else if (buffer_delay(bh))
1166                         (*delalloc) = 1;
1167         } while ((bh = bh->b_this_page) != head);
1168 }
1169
1170
1171 /*
1172  * writepage: Called from one of two places:
1173  *
1174  * 1. we are flushing a delalloc buffer head.
1175  *
1176  * 2. we are writing out a dirty page. Typically the page dirty
1177  *    state is cleared before we get here. In this case is it
1178  *    conceivable we have no buffer heads.
1179  *
1180  * For delalloc space on the page we need to allocate space and
1181  * flush it. For unmapped buffer heads on the page we should
1182  * allocate space if the page is uptodate. For any other dirty
1183  * buffer heads on the page we should flush them.
1184  *
1185  * If we detect that a transaction would be required to flush
1186  * the page, we have to check the process flags first, if we
1187  * are already in a transaction or disk I/O during allocations
1188  * is off, we need to fail the writepage and redirty the page.
1189  */
1190
1191 STATIC int
1192 linvfs_writepage(
1193         struct page             *page,
1194         struct writeback_control *wbc)
1195 {
1196         int                     error;
1197         int                     need_trans;
1198         int                     delalloc, unmapped, unwritten;
1199         struct inode            *inode = page->mapping->host;
1200
1201         xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
1202
1203         /*
1204          * We need a transaction if:
1205          *  1. There are delalloc buffers on the page
1206          *  2. The page is uptodate and we have unmapped buffers
1207          *  3. The page is uptodate and we have no buffers
1208          *  4. There are unwritten buffers on the page
1209          */
1210
1211         if (!page_has_buffers(page)) {
1212                 unmapped = 1;
1213                 need_trans = 1;
1214         } else {
1215                 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1216                 if (!PageUptodate(page))
1217                         unmapped = 0;
1218                 need_trans = delalloc + unmapped + unwritten;
1219         }
1220
1221         /*
1222          * If we need a transaction and the process flags say
1223          * we are already in a transaction, or no IO is allowed
1224          * then mark the page dirty again and leave the page
1225          * as is.
1226          */
1227         if (PFLAGS_TEST_FSTRANS() && need_trans)
1228                 goto out_fail;
1229
1230         /*
1231          * Delay hooking up buffer heads until we have
1232          * made our go/no-go decision.
1233          */
1234         if (!page_has_buffers(page))
1235                 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1236
1237         /*
1238          * Convert delayed allocate, unwritten or unmapped space
1239          * to real space and flush out to disk.
1240          */
1241         error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1242         if (error == -EAGAIN)
1243                 goto out_fail;
1244         if (unlikely(error < 0))
1245                 goto out_unlock;
1246
1247         return 0;
1248
1249 out_fail:
1250         redirty_page_for_writepage(wbc, page);
1251         unlock_page(page);
1252         return 0;
1253 out_unlock:
1254         unlock_page(page);
1255         return error;
1256 }
1257
1258 STATIC int
1259 linvfs_invalidate_page(
1260         struct page             *page,
1261         unsigned long           offset)
1262 {
1263         xfs_page_trace(XFS_INVALIDPAGE_ENTER,
1264                         page->mapping->host, page, offset);
1265         return block_invalidatepage(page, offset);
1266 }
1267
1268 /*
1269  * Called to move a page into cleanable state - and from there
1270  * to be released. Possibly the page is already clean. We always
1271  * have buffer heads in this call.
1272  *
1273  * Returns 0 if the page is ok to release, 1 otherwise.
1274  *
1275  * Possible scenarios are:
1276  *
1277  * 1. We are being called to release a page which has been written
1278  *    to via regular I/O. buffer heads will be dirty and possibly
1279  *    delalloc. If no delalloc buffer heads in this case then we
1280  *    can just return zero.
1281  *
1282  * 2. We are called to release a page which has been written via
1283  *    mmap, all we need to do is ensure there is no delalloc
1284  *    state in the buffer heads, if not we can let the caller
1285  *    free them and we should come back later via writepage.
1286  */
1287 STATIC int
1288 linvfs_release_page(
1289         struct page             *page,
1290         gfp_t                   gfp_mask)
1291 {
1292         struct inode            *inode = page->mapping->host;
1293         int                     dirty, delalloc, unmapped, unwritten;
1294         struct writeback_control wbc = {
1295                 .sync_mode = WB_SYNC_ALL,
1296                 .nr_to_write = 1,
1297         };
1298
1299         xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
1300
1301         xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1302         if (!delalloc && !unwritten)
1303                 goto free_buffers;
1304
1305         if (!(gfp_mask & __GFP_FS))
1306                 return 0;
1307
1308         /* If we are already inside a transaction or the thread cannot
1309          * do I/O, we cannot release this page.
1310          */
1311         if (PFLAGS_TEST_FSTRANS())
1312                 return 0;
1313
1314         /*
1315          * Convert delalloc space to real space, do not flush the
1316          * data out to disk, that will be done by the caller.
1317          * Never need to allocate space here - we will always
1318          * come back to writepage in that case.
1319          */
1320         dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1321         if (dirty == 0 && !unwritten)
1322                 goto free_buffers;
1323         return 0;
1324
1325 free_buffers:
1326         return try_to_free_buffers(page);
1327 }
1328
1329 STATIC int
1330 linvfs_prepare_write(
1331         struct file             *file,
1332         struct page             *page,
1333         unsigned int            from,
1334         unsigned int            to)
1335 {
1336         return block_prepare_write(page, from, to, linvfs_get_block);
1337 }
1338
1339 struct address_space_operations linvfs_aops = {
1340         .readpage               = linvfs_readpage,
1341         .readpages              = linvfs_readpages,
1342         .writepage              = linvfs_writepage,
1343         .sync_page              = block_sync_page,
1344         .releasepage            = linvfs_release_page,
1345         .invalidatepage         = linvfs_invalidate_page,
1346         .prepare_write          = linvfs_prepare_write,
1347         .commit_write           = generic_commit_write,
1348         .bmap                   = linvfs_bmap,
1349         .direct_IO              = linvfs_direct_IO,
1350 };