[XFS] Fix regression introduced by remount fixup
[linux-2.6] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27
/*
 * I/O completion handler installed on the buffer_heads this file submits
 * for journal writes.
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded
 *
 * Mirrors the I/O outcome into the buffer's uptodate bit and then unlocks
 * the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
40
41 /*
42  * When an ext4 file is truncated, it is possible that some pages are not
43  * successfully freed, because they are attached to a committing transaction.
44  * After the transaction commits, these pages are left on the LRU, with no
45  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
46  * by the VM, but their apparent absence upsets the VM accounting, and it makes
47  * the numbers in /proc/meminfo look odd.
48  *
49  * So here, we have a buffer which has just come off the forget list.  Look to
50  * see if we can strip all buffers from the backing page.
51  *
52  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
53  * caller provided us with a ref against the buffer, and we drop that here.
54  */
55 static void release_buffer_page(struct buffer_head *bh)
56 {
57         struct page *page;
58
59         if (buffer_dirty(bh))
60                 goto nope;
61         if (atomic_read(&bh->b_count) != 1)
62                 goto nope;
63         page = bh->b_page;
64         if (!page)
65                 goto nope;
66         if (page->mapping)
67                 goto nope;
68
69         /* OK, it's a truncated page */
70         if (!trylock_page(page))
71                 goto nope;
72
73         page_cache_get(page);
74         __brelse(bh);
75         try_to_free_buffers(page);
76         unlock_page(page);
77         page_cache_release(page);
78         return;
79
80 nope:
81         __brelse(bh);
82 }
83
84 /*
85  * Done it all: now submit the commit record.  We should have
86  * cleaned up our previous buffers by now, so if we are in abort
87  * mode we can now just skip the rest of the journal write
88  * entirely.
89  *
90  * Returns 1 if the journal needs to be aborted or 0 on success
91  */
92 static int journal_submit_commit_record(journal_t *journal,
93                                         transaction_t *commit_transaction,
94                                         struct buffer_head **cbh,
95                                         __u32 crc32_sum)
96 {
97         struct journal_head *descriptor;
98         struct commit_header *tmp;
99         struct buffer_head *bh;
100         int ret;
101         int barrier_done = 0;
102         struct timespec now = current_kernel_time();
103
104         if (is_journal_aborted(journal))
105                 return 0;
106
107         descriptor = jbd2_journal_get_descriptor_buffer(journal);
108         if (!descriptor)
109                 return 1;
110
111         bh = jh2bh(descriptor);
112
113         tmp = (struct commit_header *)bh->b_data;
114         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
115         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
116         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
117         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
118         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
119
120         if (JBD2_HAS_COMPAT_FEATURE(journal,
121                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
122                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
123                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
124                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
125         }
126
127         JBUFFER_TRACE(descriptor, "submit commit block");
128         lock_buffer(bh);
129         get_bh(bh);
130         set_buffer_dirty(bh);
131         set_buffer_uptodate(bh);
132         bh->b_end_io = journal_end_buffer_io_sync;
133
134         if (journal->j_flags & JBD2_BARRIER &&
135                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
136                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
137                 set_buffer_ordered(bh);
138                 barrier_done = 1;
139         }
140         ret = submit_bh(WRITE, bh);
141         if (barrier_done)
142                 clear_buffer_ordered(bh);
143
144         /* is it possible for another commit to fail at roughly
145          * the same time as this one?  If so, we don't want to
146          * trust the barrier flag in the super, but instead want
147          * to remember if we sent a barrier request
148          */
149         if (ret == -EOPNOTSUPP && barrier_done) {
150                 char b[BDEVNAME_SIZE];
151
152                 printk(KERN_WARNING
153                         "JBD: barrier-based sync failed on %s - "
154                         "disabling barriers\n",
155                         bdevname(journal->j_dev, b));
156                 spin_lock(&journal->j_state_lock);
157                 journal->j_flags &= ~JBD2_BARRIER;
158                 spin_unlock(&journal->j_state_lock);
159
160                 /* And try again, without the barrier */
161                 lock_buffer(bh);
162                 set_buffer_uptodate(bh);
163                 set_buffer_dirty(bh);
164                 ret = submit_bh(WRITE, bh);
165         }
166         *cbh = bh;
167         return ret;
168 }
169
170 /*
171  * This function along with journal_submit_commit_record
172  * allows to write the commit record asynchronously.
173  */
174 static int journal_wait_on_commit_record(struct buffer_head *bh)
175 {
176         int ret = 0;
177
178         clear_buffer_dirty(bh);
179         wait_on_buffer(bh);
180
181         if (unlikely(!buffer_uptodate(bh)))
182                 ret = -EIO;
183         put_bh(bh);            /* One for getblk() */
184         jbd2_journal_put_journal_head(bh2jh(bh));
185
186         return ret;
187 }
188
189 /*
190  * write the filemap data using writepage() address_space_operations.
191  * We don't do block allocation here even for delalloc. We don't
192  * use writepages() because with dealyed allocation we may be doing
193  * block allocation in writepages().
194  */
195 static int journal_submit_inode_data_buffers(struct address_space *mapping)
196 {
197         int ret;
198         struct writeback_control wbc = {
199                 .sync_mode =  WB_SYNC_ALL,
200                 .nr_to_write = mapping->nrpages * 2,
201                 .range_start = 0,
202                 .range_end = i_size_read(mapping->host),
203                 .for_writepages = 1,
204         };
205
206         ret = generic_writepages(mapping, &wbc);
207         return ret;
208 }
209
210 /*
211  * Submit all the data buffers of inode associated with the transaction to
212  * disk.
213  *
214  * We are in a committing transaction. Therefore no new inode can be added to
215  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
216  * operate on from being released while we write out pages.
217  */
218 static int journal_submit_data_buffers(journal_t *journal,
219                 transaction_t *commit_transaction)
220 {
221         struct jbd2_inode *jinode;
222         int err, ret = 0;
223         struct address_space *mapping;
224
225         spin_lock(&journal->j_list_lock);
226         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
227                 mapping = jinode->i_vfs_inode->i_mapping;
228                 jinode->i_flags |= JI_COMMIT_RUNNING;
229                 spin_unlock(&journal->j_list_lock);
230                 /*
231                  * submit the inode data buffers. We use writepage
232                  * instead of writepages. Because writepages can do
233                  * block allocation  with delalloc. We need to write
234                  * only allocated blocks here.
235                  */
236                 err = journal_submit_inode_data_buffers(mapping);
237                 if (!ret)
238                         ret = err;
239                 spin_lock(&journal->j_list_lock);
240                 J_ASSERT(jinode->i_transaction == commit_transaction);
241                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
242                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
243         }
244         spin_unlock(&journal->j_list_lock);
245         return ret;
246 }
247
248 /*
249  * Wait for data submitted for writeout, refile inodes to proper
250  * transaction if needed.
251  *
252  */
253 static int journal_finish_inode_data_buffers(journal_t *journal,
254                 transaction_t *commit_transaction)
255 {
256         struct jbd2_inode *jinode, *next_i;
257         int err, ret = 0;
258
259         /* For locking, see the comment in journal_submit_data_buffers() */
260         spin_lock(&journal->j_list_lock);
261         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
262                 jinode->i_flags |= JI_COMMIT_RUNNING;
263                 spin_unlock(&journal->j_list_lock);
264                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
265                 if (err) {
266                         /*
267                          * Because AS_EIO is cleared by
268                          * wait_on_page_writeback_range(), set it again so
269                          * that user process can get -EIO from fsync().
270                          */
271                         set_bit(AS_EIO,
272                                 &jinode->i_vfs_inode->i_mapping->flags);
273
274                         if (!ret)
275                                 ret = err;
276                 }
277                 spin_lock(&journal->j_list_lock);
278                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
279                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
280         }
281
282         /* Now refile inode to proper lists */
283         list_for_each_entry_safe(jinode, next_i,
284                                  &commit_transaction->t_inode_list, i_list) {
285                 list_del(&jinode->i_list);
286                 if (jinode->i_next_transaction) {
287                         jinode->i_transaction = jinode->i_next_transaction;
288                         jinode->i_next_transaction = NULL;
289                         list_add(&jinode->i_list,
290                                 &jinode->i_transaction->t_inode_list);
291                 } else {
292                         jinode->i_transaction = NULL;
293                 }
294         }
295         spin_unlock(&journal->j_list_lock);
296
297         return ret;
298 }
299
300 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
301 {
302         struct page *page = bh->b_page;
303         char *addr;
304         __u32 checksum;
305
306         addr = kmap_atomic(page, KM_USER0);
307         checksum = crc32_be(crc32_sum,
308                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
309         kunmap_atomic(addr, KM_USER0);
310
311         return checksum;
312 }
313
314 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
315                                    unsigned long long block)
316 {
317         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
318         if (tag_bytes > JBD2_TAG_SIZE32)
319                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
320 }
321
322 /*
323  * jbd2_journal_commit_transaction
324  *
325  * The primary function for committing a transaction to the log.  This
326  * function is called by the journal thread to begin a complete commit.
327  */
328 void jbd2_journal_commit_transaction(journal_t *journal)
329 {
330         struct transaction_stats_s stats;
331         transaction_t *commit_transaction;
332         struct journal_head *jh, *new_jh, *descriptor;
333         struct buffer_head **wbuf = journal->j_wbuf;
334         int bufs;
335         int flags;
336         int err;
337         unsigned long long blocknr;
338         char *tagp = NULL;
339         journal_header_t *header;
340         journal_block_tag_t *tag = NULL;
341         int space_left = 0;
342         int first_tag = 0;
343         int tag_flag;
344         int i;
345         int tag_bytes = journal_tag_bytes(journal);
346         struct buffer_head *cbh = NULL; /* For transactional checksums */
347         __u32 crc32_sum = ~0;
348
349         /*
350          * First job: lock down the current transaction and wait for
351          * all outstanding updates to complete.
352          */
353
354 #ifdef COMMIT_STATS
355         spin_lock(&journal->j_list_lock);
356         summarise_journal_usage(journal);
357         spin_unlock(&journal->j_list_lock);
358 #endif
359
360         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
361         if (journal->j_flags & JBD2_FLUSHED) {
362                 jbd_debug(3, "super block updated\n");
363                 jbd2_journal_update_superblock(journal, 1);
364         } else {
365                 jbd_debug(3, "superblock not updated\n");
366         }
367
368         J_ASSERT(journal->j_running_transaction != NULL);
369         J_ASSERT(journal->j_committing_transaction == NULL);
370
371         commit_transaction = journal->j_running_transaction;
372         J_ASSERT(commit_transaction->t_state == T_RUNNING);
373
374         jbd_debug(1, "JBD: starting commit of transaction %d\n",
375                         commit_transaction->t_tid);
376
377         spin_lock(&journal->j_state_lock);
378         commit_transaction->t_state = T_LOCKED;
379
380         stats.u.run.rs_wait = commit_transaction->t_max_wait;
381         stats.u.run.rs_locked = jiffies;
382         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
383                                                 stats.u.run.rs_locked);
384
385         spin_lock(&commit_transaction->t_handle_lock);
386         while (commit_transaction->t_updates) {
387                 DEFINE_WAIT(wait);
388
389                 prepare_to_wait(&journal->j_wait_updates, &wait,
390                                         TASK_UNINTERRUPTIBLE);
391                 if (commit_transaction->t_updates) {
392                         spin_unlock(&commit_transaction->t_handle_lock);
393                         spin_unlock(&journal->j_state_lock);
394                         schedule();
395                         spin_lock(&journal->j_state_lock);
396                         spin_lock(&commit_transaction->t_handle_lock);
397                 }
398                 finish_wait(&journal->j_wait_updates, &wait);
399         }
400         spin_unlock(&commit_transaction->t_handle_lock);
401
402         J_ASSERT (commit_transaction->t_outstanding_credits <=
403                         journal->j_max_transaction_buffers);
404
405         /*
406          * First thing we are allowed to do is to discard any remaining
407          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
408          * that there are no such buffers: if a large filesystem
409          * operation like a truncate needs to split itself over multiple
410          * transactions, then it may try to do a jbd2_journal_restart() while
411          * there are still BJ_Reserved buffers outstanding.  These must
412          * be released cleanly from the current transaction.
413          *
414          * In this case, the filesystem must still reserve write access
415          * again before modifying the buffer in the new transaction, but
416          * we do not require it to remember exactly which old buffers it
417          * has reserved.  This is consistent with the existing behaviour
418          * that multiple jbd2_journal_get_write_access() calls to the same
419          * buffer are perfectly permissible.
420          */
421         while (commit_transaction->t_reserved_list) {
422                 jh = commit_transaction->t_reserved_list;
423                 JBUFFER_TRACE(jh, "reserved, unused: refile");
424                 /*
425                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
426                  * leave undo-committed data.
427                  */
428                 if (jh->b_committed_data) {
429                         struct buffer_head *bh = jh2bh(jh);
430
431                         jbd_lock_bh_state(bh);
432                         jbd2_free(jh->b_committed_data, bh->b_size);
433                         jh->b_committed_data = NULL;
434                         jbd_unlock_bh_state(bh);
435                 }
436                 jbd2_journal_refile_buffer(journal, jh);
437         }
438
439         /*
440          * Now try to drop any written-back buffers from the journal's
441          * checkpoint lists.  We do this *before* commit because it potentially
442          * frees some memory
443          */
444         spin_lock(&journal->j_list_lock);
445         __jbd2_journal_clean_checkpoint_list(journal);
446         spin_unlock(&journal->j_list_lock);
447
448         jbd_debug (3, "JBD: commit phase 1\n");
449
450         /*
451          * Switch to a new revoke table.
452          */
453         jbd2_journal_switch_revoke_table(journal);
454
455         stats.u.run.rs_flushing = jiffies;
456         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
457                                                stats.u.run.rs_flushing);
458
459         commit_transaction->t_state = T_FLUSH;
460         journal->j_committing_transaction = commit_transaction;
461         journal->j_running_transaction = NULL;
462         commit_transaction->t_log_start = journal->j_head;
463         wake_up(&journal->j_wait_transaction_locked);
464         spin_unlock(&journal->j_state_lock);
465
466         jbd_debug (3, "JBD: commit phase 2\n");
467
468         /*
469          * Now start flushing things to disk, in the order they appear
470          * on the transaction lists.  Data blocks go first.
471          */
472         err = journal_submit_data_buffers(journal, commit_transaction);
473         if (err)
474                 jbd2_journal_abort(journal, err);
475
476         jbd2_journal_write_revoke_records(journal, commit_transaction);
477
478         jbd_debug(3, "JBD: commit phase 2\n");
479
480         /*
481          * Way to go: we have now written out all of the data for a
482          * transaction!  Now comes the tricky part: we need to write out
483          * metadata.  Loop over the transaction's entire buffer list:
484          */
485         spin_lock(&journal->j_state_lock);
486         commit_transaction->t_state = T_COMMIT;
487         spin_unlock(&journal->j_state_lock);
488
489         stats.u.run.rs_logging = jiffies;
490         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
491                                                  stats.u.run.rs_logging);
492         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
493         stats.u.run.rs_blocks_logged = 0;
494
495         J_ASSERT(commit_transaction->t_nr_buffers <=
496                  commit_transaction->t_outstanding_credits);
497
498         err = 0;
499         descriptor = NULL;
500         bufs = 0;
501         while (commit_transaction->t_buffers) {
502
503                 /* Find the next buffer to be journaled... */
504
505                 jh = commit_transaction->t_buffers;
506
507                 /* If we're in abort mode, we just un-journal the buffer and
508                    release it for background writing. */
509
510                 if (is_journal_aborted(journal)) {
511                         JBUFFER_TRACE(jh, "journal is aborting: refile");
512                         jbd2_journal_refile_buffer(journal, jh);
513                         /* If that was the last one, we need to clean up
514                          * any descriptor buffers which may have been
515                          * already allocated, even if we are now
516                          * aborting. */
517                         if (!commit_transaction->t_buffers)
518                                 goto start_journal_io;
519                         continue;
520                 }
521
522                 /* Make sure we have a descriptor block in which to
523                    record the metadata buffer. */
524
525                 if (!descriptor) {
526                         struct buffer_head *bh;
527
528                         J_ASSERT (bufs == 0);
529
530                         jbd_debug(4, "JBD: get descriptor\n");
531
532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
533                         if (!descriptor) {
534                                 jbd2_journal_abort(journal, -EIO);
535                                 continue;
536                         }
537
538                         bh = jh2bh(descriptor);
539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
541                         header = (journal_header_t *)&bh->b_data[0];
542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
545
546                         tagp = &bh->b_data[sizeof(journal_header_t)];
547                         space_left = bh->b_size - sizeof(journal_header_t);
548                         first_tag = 1;
549                         set_buffer_jwrite(bh);
550                         set_buffer_dirty(bh);
551                         wbuf[bufs++] = bh;
552
553                         /* Record it so that we can wait for IO
554                            completion later */
555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
557                                         BJ_LogCtl);
558                 }
559
560                 /* Where is the buffer to be written? */
561
562                 err = jbd2_journal_next_log_block(journal, &blocknr);
563                 /* If the block mapping failed, just abandon the buffer
564                    and repeat this loop: we'll fall into the
565                    refile-on-abort condition above. */
566                 if (err) {
567                         jbd2_journal_abort(journal, err);
568                         continue;
569                 }
570
571                 /*
572                  * start_this_handle() uses t_outstanding_credits to determine
573                  * the free space in the log, but this counter is changed
574                  * by jbd2_journal_next_log_block() also.
575                  */
576                 commit_transaction->t_outstanding_credits--;
577
578                 /* Bump b_count to prevent truncate from stumbling over
579                    the shadowed buffer!  @@@ This can go if we ever get
580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
581                 atomic_inc(&jh2bh(jh)->b_count);
582
583                 /* Make a temporary IO buffer with which to write it out
584                    (this will requeue both the metadata buffer and the
585                    temporary IO buffer). new_bh goes on BJ_IO*/
586
587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
588                 /*
589                  * akpm: jbd2_journal_write_metadata_buffer() sets
590                  * new_bh->b_transaction to commit_transaction.
591                  * We need to clean this up before we release new_bh
592                  * (which is of type BJ_IO)
593                  */
594                 JBUFFER_TRACE(jh, "ph3: write metadata");
595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
596                                                       jh, &new_jh, blocknr);
597                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
598                 wbuf[bufs++] = jh2bh(new_jh);
599
600                 /* Record the new block's tag in the current descriptor
601                    buffer */
602
603                 tag_flag = 0;
604                 if (flags & 1)
605                         tag_flag |= JBD2_FLAG_ESCAPE;
606                 if (!first_tag)
607                         tag_flag |= JBD2_FLAG_SAME_UUID;
608
609                 tag = (journal_block_tag_t *) tagp;
610                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
611                 tag->t_flags = cpu_to_be32(tag_flag);
612                 tagp += tag_bytes;
613                 space_left -= tag_bytes;
614
615                 if (first_tag) {
616                         memcpy (tagp, journal->j_uuid, 16);
617                         tagp += 16;
618                         space_left -= 16;
619                         first_tag = 0;
620                 }
621
622                 /* If there's no more to do, or if the descriptor is full,
623                    let the IO rip! */
624
625                 if (bufs == journal->j_wbufsize ||
626                     commit_transaction->t_buffers == NULL ||
627                     space_left < tag_bytes + 16) {
628
629                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
630
631                         /* Write an end-of-descriptor marker before
632                            submitting the IOs.  "tag" still points to
633                            the last tag we set up. */
634
635                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
636
637 start_journal_io:
638                         for (i = 0; i < bufs; i++) {
639                                 struct buffer_head *bh = wbuf[i];
640                                 /*
641                                  * Compute checksum.
642                                  */
643                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
644                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
645                                         crc32_sum =
646                                             jbd2_checksum_data(crc32_sum, bh);
647                                 }
648
649                                 lock_buffer(bh);
650                                 clear_buffer_dirty(bh);
651                                 set_buffer_uptodate(bh);
652                                 bh->b_end_io = journal_end_buffer_io_sync;
653                                 submit_bh(WRITE, bh);
654                         }
655                         cond_resched();
656                         stats.u.run.rs_blocks_logged += bufs;
657
658                         /* Force a new descriptor to be generated next
659                            time round the loop. */
660                         descriptor = NULL;
661                         bufs = 0;
662                 }
663         }
664
665         /* Done it all: now write the commit record asynchronously. */
666
667         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
668                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
669                 err = journal_submit_commit_record(journal, commit_transaction,
670                                                  &cbh, crc32_sum);
671                 if (err)
672                         __jbd2_journal_abort_hard(journal);
673         }
674
675         /*
676          * This is the right place to wait for data buffers both for ASYNC
677          * and !ASYNC commit. If commit is ASYNC, we need to wait only after
678          * the commit block went to disk (which happens above). If commit is
679          * SYNC, we need to wait for data buffers before we start writing
680          * commit block, which happens below in such setting.
681          */
682         err = journal_finish_inode_data_buffers(journal, commit_transaction);
683         if (err) {
684                 char b[BDEVNAME_SIZE];
685
686                 printk(KERN_WARNING
687                         "JBD2: Detected IO errors while flushing file data "
688                         "on %s\n", bdevname(journal->j_fs_dev, b));
689                 err = 0;
690         }
691
692         /* Lo and behold: we have just managed to send a transaction to
693            the log.  Before we can commit it, wait for the IO so far to
694            complete.  Control buffers being written are on the
695            transaction's t_log_list queue, and metadata buffers are on
696            the t_iobuf_list queue.
697
698            Wait for the buffers in reverse order.  That way we are
699            less likely to be woken up until all IOs have completed, and
700            so we incur less scheduling load.
701         */
702
703         jbd_debug(3, "JBD: commit phase 3\n");
704
705         /*
706          * akpm: these are BJ_IO, and j_list_lock is not needed.
707          * See __journal_try_to_free_buffer.
708          */
709 wait_for_iobuf:
710         while (commit_transaction->t_iobuf_list != NULL) {
711                 struct buffer_head *bh;
712
713                 jh = commit_transaction->t_iobuf_list->b_tprev;
714                 bh = jh2bh(jh);
715                 if (buffer_locked(bh)) {
716                         wait_on_buffer(bh);
717                         goto wait_for_iobuf;
718                 }
719                 if (cond_resched())
720                         goto wait_for_iobuf;
721
722                 if (unlikely(!buffer_uptodate(bh)))
723                         err = -EIO;
724
725                 clear_buffer_jwrite(bh);
726
727                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
728                 jbd2_journal_unfile_buffer(journal, jh);
729
730                 /*
731                  * ->t_iobuf_list should contain only dummy buffer_heads
732                  * which were created by jbd2_journal_write_metadata_buffer().
733                  */
734                 BUFFER_TRACE(bh, "dumping temporary bh");
735                 jbd2_journal_put_journal_head(jh);
736                 __brelse(bh);
737                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
738                 free_buffer_head(bh);
739
740                 /* We also have to unlock and free the corresponding
741                    shadowed buffer */
742                 jh = commit_transaction->t_shadow_list->b_tprev;
743                 bh = jh2bh(jh);
744                 clear_bit(BH_JWrite, &bh->b_state);
745                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
746
747                 /* The metadata is now released for reuse, but we need
748                    to remember it against this transaction so that when
749                    we finally commit, we can do any checkpointing
750                    required. */
751                 JBUFFER_TRACE(jh, "file as BJ_Forget");
752                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
753                 /* Wake up any transactions which were waiting for this
754                    IO to complete */
755                 wake_up_bit(&bh->b_state, BH_Unshadow);
756                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
757                 __brelse(bh);
758         }
759
760         J_ASSERT (commit_transaction->t_shadow_list == NULL);
761
762         jbd_debug(3, "JBD: commit phase 4\n");
763
764         /* Here we wait for the revoke record and descriptor record buffers */
765  wait_for_ctlbuf:
766         while (commit_transaction->t_log_list != NULL) {
767                 struct buffer_head *bh;
768
769                 jh = commit_transaction->t_log_list->b_tprev;
770                 bh = jh2bh(jh);
771                 if (buffer_locked(bh)) {
772                         wait_on_buffer(bh);
773                         goto wait_for_ctlbuf;
774                 }
775                 if (cond_resched())
776                         goto wait_for_ctlbuf;
777
778                 if (unlikely(!buffer_uptodate(bh)))
779                         err = -EIO;
780
781                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
782                 clear_buffer_jwrite(bh);
783                 jbd2_journal_unfile_buffer(journal, jh);
784                 jbd2_journal_put_journal_head(jh);
785                 __brelse(bh);           /* One for getblk */
786                 /* AKPM: bforget here */
787         }
788
789         jbd_debug(3, "JBD: commit phase 5\n");
790
791         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
792                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
793                 err = journal_submit_commit_record(journal, commit_transaction,
794                                                 &cbh, crc32_sum);
795                 if (err)
796                         __jbd2_journal_abort_hard(journal);
797         }
798         if (!err && !is_journal_aborted(journal))
799                 err = journal_wait_on_commit_record(cbh);
800
801         if (err)
802                 jbd2_journal_abort(journal, err);
803
804         /* End of a transaction!  Finally, we can do checkpoint
805            processing: any buffers committed as a result of this
806            transaction can be removed from any checkpoint list they
807            were on before. */
808
809         jbd_debug(3, "JBD: commit phase 6\n");
810
811         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
812         J_ASSERT(commit_transaction->t_buffers == NULL);
813         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
814         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
815         J_ASSERT(commit_transaction->t_shadow_list == NULL);
816         J_ASSERT(commit_transaction->t_log_list == NULL);
817
818 restart_loop:
819         /*
820          * As there are other places (journal_unmap_buffer()) adding buffers
821          * to this list we have to be careful and hold the j_list_lock.
822          */
823         spin_lock(&journal->j_list_lock);
824         while (commit_transaction->t_forget) {
825                 transaction_t *cp_transaction;
826                 struct buffer_head *bh;
827
828                 jh = commit_transaction->t_forget;
829                 spin_unlock(&journal->j_list_lock);
830                 bh = jh2bh(jh);
831                 jbd_lock_bh_state(bh);
832                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
833                         jh->b_transaction == journal->j_running_transaction);
834
835                 /*
836                  * If there is undo-protected committed data against
837                  * this buffer, then we can remove it now.  If it is a
838                  * buffer needing such protection, the old frozen_data
839                  * field now points to a committed version of the
840                  * buffer, so rotate that field to the new committed
841                  * data.
842                  *
843                  * Otherwise, we can just throw away the frozen data now.
844                  */
845                 if (jh->b_committed_data) {
846                         jbd2_free(jh->b_committed_data, bh->b_size);
847                         jh->b_committed_data = NULL;
848                         if (jh->b_frozen_data) {
849                                 jh->b_committed_data = jh->b_frozen_data;
850                                 jh->b_frozen_data = NULL;
851                         }
852                 } else if (jh->b_frozen_data) {
853                         jbd2_free(jh->b_frozen_data, bh->b_size);
854                         jh->b_frozen_data = NULL;
855                 }
856
857                 spin_lock(&journal->j_list_lock);
858                 cp_transaction = jh->b_cp_transaction;
859                 if (cp_transaction) {
860                         JBUFFER_TRACE(jh, "remove from old cp transaction");
861                         cp_transaction->t_chp_stats.cs_dropped++;
862                         __jbd2_journal_remove_checkpoint(jh);
863                 }
864
865                 /* Only re-checkpoint the buffer_head if it is marked
866                  * dirty.  If the buffer was added to the BJ_Forget list
867                  * by jbd2_journal_forget, it may no longer be dirty and
868                  * there's no point in keeping a checkpoint record for
869                  * it. */
870
871                 /* A buffer which has been freed while still being
872                  * journaled by a previous transaction may end up still
873                  * being dirty here, but we want to avoid writing back
874                  * that buffer in the future now that the last use has
875                  * been committed.  That's not only a performance gain,
876                  * it also stops aliasing problems if the buffer is left
877                  * behind for writeback and gets reallocated for another
878                  * use in a different page. */
879                 if (buffer_freed(bh)) {
880                         clear_buffer_freed(bh);
881                         clear_buffer_jbddirty(bh);
882                 }
883
884                 if (buffer_jbddirty(bh)) {
885                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
886                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
887                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
888                         __jbd2_journal_refile_buffer(jh);
889                         jbd_unlock_bh_state(bh);
890                 } else {
891                         J_ASSERT_BH(bh, !buffer_dirty(bh));
892                         /* The buffer on BJ_Forget list and not jbddirty means
893                          * it has been freed by this transaction and hence it
894                          * could not have been reallocated until this
895                          * transaction has committed. *BUT* it could be
896                          * reallocated once we have written all the data to
897                          * disk and before we process the buffer on BJ_Forget
898                          * list. */
899                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
900                         __jbd2_journal_refile_buffer(jh);
901                         if (!jh->b_transaction) {
902                                 jbd_unlock_bh_state(bh);
903                                  /* needs a brelse */
904                                 jbd2_journal_remove_journal_head(bh);
905                                 release_buffer_page(bh);
906                         } else
907                                 jbd_unlock_bh_state(bh);
908                 }
909                 cond_resched_lock(&journal->j_list_lock);
910         }
911         spin_unlock(&journal->j_list_lock);
912         /*
913          * This is a bit sleazy.  We use j_list_lock to protect transition
914          * of a transaction into T_FINISHED state and calling
915          * __jbd2_journal_drop_transaction(). Otherwise we could race with
916          * other checkpointing code processing the transaction...
917          */
918         spin_lock(&journal->j_state_lock);
919         spin_lock(&journal->j_list_lock);
920         /*
921          * Now recheck whether any buffers got attached to the transaction's
922          * forget list while the lock was dropped; if so, process them too.
923          */
924         if (commit_transaction->t_forget) {
925                 spin_unlock(&journal->j_list_lock);
926                 spin_unlock(&journal->j_state_lock);
927                 goto restart_loop;
928         }
929
930         /* Done with this transaction! */
931
932         jbd_debug(3, "JBD: commit phase 7\n");
933
934         J_ASSERT(commit_transaction->t_state == T_COMMIT);
935
936         commit_transaction->t_start = jiffies;
937         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
938                                                 commit_transaction->t_start);
939
940         /*
941          * File the transaction for history
942          */
943         stats.ts_type = JBD2_STATS_RUN;
944         stats.ts_tid = commit_transaction->t_tid;
945         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
946         spin_lock(&journal->j_history_lock);
947         memcpy(journal->j_history + journal->j_history_cur, &stats,
948                         sizeof(stats));
949         if (++journal->j_history_cur == journal->j_history_max)
950                 journal->j_history_cur = 0;
951
952         /*
953          * Calculate overall stats
954          */
955         journal->j_stats.ts_tid++;
956         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
957         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
958         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
959         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
960         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
961         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
962         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
963         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
964         spin_unlock(&journal->j_history_lock);
965
966         commit_transaction->t_state = T_FINISHED;
967         J_ASSERT(commit_transaction == journal->j_committing_transaction);
968         journal->j_commit_sequence = commit_transaction->t_tid;
969         journal->j_committing_transaction = NULL;
970         spin_unlock(&journal->j_state_lock);
971
972         if (commit_transaction->t_checkpoint_list == NULL &&
973             commit_transaction->t_checkpoint_io_list == NULL) {
974                 __jbd2_journal_drop_transaction(journal, commit_transaction);
975         } else {
976                 if (journal->j_checkpoint_transactions == NULL) {
977                         journal->j_checkpoint_transactions = commit_transaction;
978                         commit_transaction->t_cpnext = commit_transaction;
979                         commit_transaction->t_cpprev = commit_transaction;
980                 } else {
981                         commit_transaction->t_cpnext =
982                                 journal->j_checkpoint_transactions;
983                         commit_transaction->t_cpprev =
984                                 commit_transaction->t_cpnext->t_cpprev;
985                         commit_transaction->t_cpnext->t_cpprev =
986                                 commit_transaction;
987                         commit_transaction->t_cpprev->t_cpnext =
988                                 commit_transaction;
989                 }
990         }
991         spin_unlock(&journal->j_list_lock);
992
993         jbd_debug(1, "JBD: commit %d complete, head %d\n",
994                   journal->j_commit_sequence, journal->j_tail_sequence);
995
996         wake_up(&journal->j_wait_done_commit);
997 }