ext4: Avoid corrupting the uninitialized bit in the extent during truncate
[linux-2.6] / fs / jbd / commit.c
1 /*
2  * linux/fs/jbd/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/bio.h>
24
/*
 * Default I/O completion handler for the temporary BJ_IO buffer_heads.
 *
 * Mirrors the outcome of the I/O (@uptodate non-zero means success) into
 * the buffer's uptodate flag and then unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);

        unlock_buffer(bh);
}
37
38 /*
39  * When an ext3-ordered file is truncated, it is possible that many pages are
40  * not successfully freed, because they are attached to a committing transaction.
41  * After the transaction commits, these pages are left on the LRU, with no
42  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
43  * by the VM, but their apparent absence upsets the VM accounting, and it makes
44  * the numbers in /proc/meminfo look odd.
45  *
46  * So here, we have a buffer which has just come off the forget list.  Look to
47  * see if we can strip all buffers from the backing page.
48  *
49  * Called under journal->j_list_lock.  The caller provided us with a ref
50  * against the buffer, and we drop that here.
51  */
52 static void release_buffer_page(struct buffer_head *bh)
53 {
54         struct page *page;
55
56         if (buffer_dirty(bh))
57                 goto nope;
58         if (atomic_read(&bh->b_count) != 1)
59                 goto nope;
60         page = bh->b_page;
61         if (!page)
62                 goto nope;
63         if (page->mapping)
64                 goto nope;
65
66         /* OK, it's a truncated page */
67         if (!trylock_page(page))
68                 goto nope;
69
70         page_cache_get(page);
71         __brelse(bh);
72         try_to_free_buffers(page);
73         unlock_page(page);
74         page_cache_release(page);
75         return;
76
77 nope:
78         __brelse(bh);
79 }
80
/*
 * Drop one reference on a data buffer.
 *
 * A buffer flagged 'BH_Freed' has the flag cleared and is passed to
 * release_buffer_page(), which drops the reference and, when possible,
 * frees the buffers of the page it belongs to.  Any other buffer simply
 * has its reference count decremented.
 */
static void release_data_buffer(struct buffer_head *bh)
{
        if (!buffer_freed(bh)) {
                put_bh(bh);
                return;
        }

        clear_buffer_freed(bh);
        release_buffer_page(bh);
}
93
94 /*
95  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
96  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
97  * return 0.  j_list_lock is dropped in this case.
98  */
99 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
100 {
101         if (!jbd_trylock_bh_state(bh)) {
102                 spin_unlock(&journal->j_list_lock);
103                 schedule();
104                 return 0;
105         }
106         return 1;
107 }
108
109 /* Done it all: now write the commit record.  We should have
110  * cleaned up our previous buffers by now, so if we are in abort
111  * mode we can now just skip the rest of the journal write
112  * entirely.
113  *
114  * Returns 1 if the journal needs to be aborted or 0 on success
115  */
116 static int journal_write_commit_record(journal_t *journal,
117                                         transaction_t *commit_transaction)
118 {
119         struct journal_head *descriptor;
120         struct buffer_head *bh;
121         journal_header_t *header;
122         int ret;
123         int barrier_done = 0;
124
125         if (is_journal_aborted(journal))
126                 return 0;
127
128         descriptor = journal_get_descriptor_buffer(journal);
129         if (!descriptor)
130                 return 1;
131
132         bh = jh2bh(descriptor);
133
134         header = (journal_header_t *)(bh->b_data);
135         header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
136         header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
137         header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
138
139         JBUFFER_TRACE(descriptor, "write commit block");
140         set_buffer_dirty(bh);
141         if (journal->j_flags & JFS_BARRIER) {
142                 set_buffer_ordered(bh);
143                 barrier_done = 1;
144         }
145         ret = sync_dirty_buffer(bh);
146         if (barrier_done)
147                 clear_buffer_ordered(bh);
148         /* is it possible for another commit to fail at roughly
149          * the same time as this one?  If so, we don't want to
150          * trust the barrier flag in the super, but instead want
151          * to remember if we sent a barrier request
152          */
153         if (ret == -EOPNOTSUPP && barrier_done) {
154                 char b[BDEVNAME_SIZE];
155
156                 printk(KERN_WARNING
157                         "JBD: barrier-based sync failed on %s - "
158                         "disabling barriers\n",
159                         bdevname(journal->j_dev, b));
160                 spin_lock(&journal->j_state_lock);
161                 journal->j_flags &= ~JFS_BARRIER;
162                 spin_unlock(&journal->j_state_lock);
163
164                 /* And try again, without the barrier */
165                 set_buffer_uptodate(bh);
166                 set_buffer_dirty(bh);
167                 ret = sync_dirty_buffer(bh);
168         }
169         put_bh(bh);             /* One for getblk() */
170         journal_put_journal_head(descriptor);
171
172         return (ret == -EIO);
173 }
174
175 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
176                                    int write_op)
177 {
178         int i;
179
180         for (i = 0; i < bufs; i++) {
181                 wbuf[i]->b_end_io = end_buffer_write_sync;
182                 /* We use-up our safety reference in submit_bh() */
183                 submit_bh(write_op, wbuf[i]);
184         }
185 }
186
187 /*
188  *  Submit all the data buffers to disk
189  */
190 static int journal_submit_data_buffers(journal_t *journal,
191                                        transaction_t *commit_transaction,
192                                        int write_op)
193 {
194         struct journal_head *jh;
195         struct buffer_head *bh;
196         int locked;
197         int bufs = 0;
198         struct buffer_head **wbuf = journal->j_wbuf;
199         int err = 0;
200
201         /*
202          * Whenever we unlock the journal and sleep, things can get added
203          * onto ->t_sync_datalist, so we have to keep looping back to
204          * write_out_data until we *know* that the list is empty.
205          *
206          * Cleanup any flushed data buffers from the data list.  Even in
207          * abort mode, we want to flush this out as soon as possible.
208          */
209 write_out_data:
210         cond_resched();
211         spin_lock(&journal->j_list_lock);
212
213         while (commit_transaction->t_sync_datalist) {
214                 jh = commit_transaction->t_sync_datalist;
215                 bh = jh2bh(jh);
216                 locked = 0;
217
218                 /* Get reference just to make sure buffer does not disappear
219                  * when we are forced to drop various locks */
220                 get_bh(bh);
221                 /* If the buffer is dirty, we need to submit IO and hence
222                  * we need the buffer lock. We try to lock the buffer without
223                  * blocking. If we fail, we need to drop j_list_lock and do
224                  * blocking lock_buffer().
225                  */
226                 if (buffer_dirty(bh)) {
227                         if (!trylock_buffer(bh)) {
228                                 BUFFER_TRACE(bh, "needs blocking lock");
229                                 spin_unlock(&journal->j_list_lock);
230                                 /* Write out all data to prevent deadlocks */
231                                 journal_do_submit_data(wbuf, bufs, write_op);
232                                 bufs = 0;
233                                 lock_buffer(bh);
234                                 spin_lock(&journal->j_list_lock);
235                         }
236                         locked = 1;
237                 }
238                 /* We have to get bh_state lock. Again out of order, sigh. */
239                 if (!inverted_lock(journal, bh)) {
240                         jbd_lock_bh_state(bh);
241                         spin_lock(&journal->j_list_lock);
242                 }
243                 /* Someone already cleaned up the buffer? */
244                 if (!buffer_jbd(bh)
245                         || jh->b_transaction != commit_transaction
246                         || jh->b_jlist != BJ_SyncData) {
247                         jbd_unlock_bh_state(bh);
248                         if (locked)
249                                 unlock_buffer(bh);
250                         BUFFER_TRACE(bh, "already cleaned up");
251                         release_data_buffer(bh);
252                         continue;
253                 }
254                 if (locked && test_clear_buffer_dirty(bh)) {
255                         BUFFER_TRACE(bh, "needs writeout, adding to array");
256                         wbuf[bufs++] = bh;
257                         __journal_file_buffer(jh, commit_transaction,
258                                                 BJ_Locked);
259                         jbd_unlock_bh_state(bh);
260                         if (bufs == journal->j_wbufsize) {
261                                 spin_unlock(&journal->j_list_lock);
262                                 journal_do_submit_data(wbuf, bufs, write_op);
263                                 bufs = 0;
264                                 goto write_out_data;
265                         }
266                 } else if (!locked && buffer_locked(bh)) {
267                         __journal_file_buffer(jh, commit_transaction,
268                                                 BJ_Locked);
269                         jbd_unlock_bh_state(bh);
270                         put_bh(bh);
271                 } else {
272                         BUFFER_TRACE(bh, "writeout complete: unfile");
273                         if (unlikely(!buffer_uptodate(bh)))
274                                 err = -EIO;
275                         __journal_unfile_buffer(jh);
276                         jbd_unlock_bh_state(bh);
277                         if (locked)
278                                 unlock_buffer(bh);
279                         journal_remove_journal_head(bh);
280                         /* One for our safety reference, other for
281                          * journal_remove_journal_head() */
282                         put_bh(bh);
283                         release_data_buffer(bh);
284                 }
285
286                 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
287                         spin_unlock(&journal->j_list_lock);
288                         goto write_out_data;
289                 }
290         }
291         spin_unlock(&journal->j_list_lock);
292         journal_do_submit_data(wbuf, bufs, write_op);
293
294         return err;
295 }
296
297 /*
298  * journal_commit_transaction
299  *
300  * The primary function for committing a transaction to the log.  This
301  * function is called by the journal thread to begin a complete commit.
302  */
303 void journal_commit_transaction(journal_t *journal)
304 {
305         transaction_t *commit_transaction;
306         struct journal_head *jh, *new_jh, *descriptor;
307         struct buffer_head **wbuf = journal->j_wbuf;
308         int bufs;
309         int flags;
310         int err;
311         unsigned long blocknr;
312         ktime_t start_time;
313         u64 commit_time;
314         char *tagp = NULL;
315         journal_header_t *header;
316         journal_block_tag_t *tag = NULL;
317         int space_left = 0;
318         int first_tag = 0;
319         int tag_flag;
320         int i;
321         int write_op = WRITE;
322
323         /*
324          * First job: lock down the current transaction and wait for
325          * all outstanding updates to complete.
326          */
327
328 #ifdef COMMIT_STATS
329         spin_lock(&journal->j_list_lock);
330         summarise_journal_usage(journal);
331         spin_unlock(&journal->j_list_lock);
332 #endif
333
334         /* Do we need to erase the effects of a prior journal_flush? */
335         if (journal->j_flags & JFS_FLUSHED) {
336                 jbd_debug(3, "super block updated\n");
337                 journal_update_superblock(journal, 1);
338         } else {
339                 jbd_debug(3, "superblock not updated\n");
340         }
341
342         J_ASSERT(journal->j_running_transaction != NULL);
343         J_ASSERT(journal->j_committing_transaction == NULL);
344
345         commit_transaction = journal->j_running_transaction;
346         J_ASSERT(commit_transaction->t_state == T_RUNNING);
347
348         jbd_debug(1, "JBD: starting commit of transaction %d\n",
349                         commit_transaction->t_tid);
350
351         spin_lock(&journal->j_state_lock);
352         commit_transaction->t_state = T_LOCKED;
353
354         /*
355          * Use plugged writes here, since we want to submit several before
356          * we unplug the device. We don't do explicit unplugging in here,
357          * instead we rely on sync_buffer() doing the unplug for us.
358          */
359         if (commit_transaction->t_synchronous_commit)
360                 write_op = WRITE_SYNC_PLUG;
361         spin_lock(&commit_transaction->t_handle_lock);
362         while (commit_transaction->t_updates) {
363                 DEFINE_WAIT(wait);
364
365                 prepare_to_wait(&journal->j_wait_updates, &wait,
366                                         TASK_UNINTERRUPTIBLE);
367                 if (commit_transaction->t_updates) {
368                         spin_unlock(&commit_transaction->t_handle_lock);
369                         spin_unlock(&journal->j_state_lock);
370                         schedule();
371                         spin_lock(&journal->j_state_lock);
372                         spin_lock(&commit_transaction->t_handle_lock);
373                 }
374                 finish_wait(&journal->j_wait_updates, &wait);
375         }
376         spin_unlock(&commit_transaction->t_handle_lock);
377
378         J_ASSERT (commit_transaction->t_outstanding_credits <=
379                         journal->j_max_transaction_buffers);
380
381         /*
382          * First thing we are allowed to do is to discard any remaining
383          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
384          * that there are no such buffers: if a large filesystem
385          * operation like a truncate needs to split itself over multiple
386          * transactions, then it may try to do a journal_restart() while
387          * there are still BJ_Reserved buffers outstanding.  These must
388          * be released cleanly from the current transaction.
389          *
390          * In this case, the filesystem must still reserve write access
391          * again before modifying the buffer in the new transaction, but
392          * we do not require it to remember exactly which old buffers it
393          * has reserved.  This is consistent with the existing behaviour
394          * that multiple journal_get_write_access() calls to the same
395          * buffer are perfectly permissible.
396          */
397         while (commit_transaction->t_reserved_list) {
398                 jh = commit_transaction->t_reserved_list;
399                 JBUFFER_TRACE(jh, "reserved, unused: refile");
400                 /*
401                  * A journal_get_undo_access()+journal_release_buffer() may
402                  * leave undo-committed data.
403                  */
404                 if (jh->b_committed_data) {
405                         struct buffer_head *bh = jh2bh(jh);
406
407                         jbd_lock_bh_state(bh);
408                         jbd_free(jh->b_committed_data, bh->b_size);
409                         jh->b_committed_data = NULL;
410                         jbd_unlock_bh_state(bh);
411                 }
412                 journal_refile_buffer(journal, jh);
413         }
414
415         /*
416          * Now try to drop any written-back buffers from the journal's
417          * checkpoint lists.  We do this *before* commit because it potentially
418          * frees some memory
419          */
420         spin_lock(&journal->j_list_lock);
421         __journal_clean_checkpoint_list(journal);
422         spin_unlock(&journal->j_list_lock);
423
424         jbd_debug (3, "JBD: commit phase 1\n");
425
426         /*
427          * Switch to a new revoke table.
428          */
429         journal_switch_revoke_table(journal);
430
431         commit_transaction->t_state = T_FLUSH;
432         journal->j_committing_transaction = commit_transaction;
433         journal->j_running_transaction = NULL;
434         start_time = ktime_get();
435         commit_transaction->t_log_start = journal->j_head;
436         wake_up(&journal->j_wait_transaction_locked);
437         spin_unlock(&journal->j_state_lock);
438
439         jbd_debug (3, "JBD: commit phase 2\n");
440
441         /*
442          * Now start flushing things to disk, in the order they appear
443          * on the transaction lists.  Data blocks go first.
444          */
445         err = journal_submit_data_buffers(journal, commit_transaction,
446                                           write_op);
447
448         /*
449          * Wait for all previously submitted IO to complete.
450          */
451         spin_lock(&journal->j_list_lock);
452         while (commit_transaction->t_locked_list) {
453                 struct buffer_head *bh;
454
455                 jh = commit_transaction->t_locked_list->b_tprev;
456                 bh = jh2bh(jh);
457                 get_bh(bh);
458                 if (buffer_locked(bh)) {
459                         spin_unlock(&journal->j_list_lock);
460                         wait_on_buffer(bh);
461                         spin_lock(&journal->j_list_lock);
462                 }
463                 if (unlikely(!buffer_uptodate(bh))) {
464                         if (!trylock_page(bh->b_page)) {
465                                 spin_unlock(&journal->j_list_lock);
466                                 lock_page(bh->b_page);
467                                 spin_lock(&journal->j_list_lock);
468                         }
469                         if (bh->b_page->mapping)
470                                 set_bit(AS_EIO, &bh->b_page->mapping->flags);
471
472                         unlock_page(bh->b_page);
473                         SetPageError(bh->b_page);
474                         err = -EIO;
475                 }
476                 if (!inverted_lock(journal, bh)) {
477                         put_bh(bh);
478                         spin_lock(&journal->j_list_lock);
479                         continue;
480                 }
481                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
482                         __journal_unfile_buffer(jh);
483                         jbd_unlock_bh_state(bh);
484                         journal_remove_journal_head(bh);
485                         put_bh(bh);
486                 } else {
487                         jbd_unlock_bh_state(bh);
488                 }
489                 release_data_buffer(bh);
490                 cond_resched_lock(&journal->j_list_lock);
491         }
492         spin_unlock(&journal->j_list_lock);
493
494         if (err) {
495                 char b[BDEVNAME_SIZE];
496
497                 printk(KERN_WARNING
498                         "JBD: Detected IO errors while flushing file data "
499                         "on %s\n", bdevname(journal->j_fs_dev, b));
500                 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
501                         journal_abort(journal, err);
502                 err = 0;
503         }
504
505         journal_write_revoke_records(journal, commit_transaction, write_op);
506
507         /*
508          * If we found any dirty or locked buffers, then we should have
509          * looped back up to the write_out_data label.  If there weren't
510          * any then journal_clean_data_list should have wiped the list
511          * clean by now, so check that it is in fact empty.
512          */
513         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
514
515         jbd_debug (3, "JBD: commit phase 3\n");
516
517         /*
518          * Way to go: we have now written out all of the data for a
519          * transaction!  Now comes the tricky part: we need to write out
520          * metadata.  Loop over the transaction's entire buffer list:
521          */
522         spin_lock(&journal->j_state_lock);
523         commit_transaction->t_state = T_COMMIT;
524         spin_unlock(&journal->j_state_lock);
525
526         J_ASSERT(commit_transaction->t_nr_buffers <=
527                  commit_transaction->t_outstanding_credits);
528
529         descriptor = NULL;
530         bufs = 0;
531         while (commit_transaction->t_buffers) {
532
533                 /* Find the next buffer to be journaled... */
534
535                 jh = commit_transaction->t_buffers;
536
537                 /* If we're in abort mode, we just un-journal the buffer and
538                    release it. */
539
540                 if (is_journal_aborted(journal)) {
541                         clear_buffer_jbddirty(jh2bh(jh));
542                         JBUFFER_TRACE(jh, "journal is aborting: refile");
543                         journal_refile_buffer(journal, jh);
544                         /* If that was the last one, we need to clean up
545                          * any descriptor buffers which may have been
546                          * already allocated, even if we are now
547                          * aborting. */
548                         if (!commit_transaction->t_buffers)
549                                 goto start_journal_io;
550                         continue;
551                 }
552
553                 /* Make sure we have a descriptor block in which to
554                    record the metadata buffer. */
555
556                 if (!descriptor) {
557                         struct buffer_head *bh;
558
559                         J_ASSERT (bufs == 0);
560
561                         jbd_debug(4, "JBD: get descriptor\n");
562
563                         descriptor = journal_get_descriptor_buffer(journal);
564                         if (!descriptor) {
565                                 journal_abort(journal, -EIO);
566                                 continue;
567                         }
568
569                         bh = jh2bh(descriptor);
570                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
571                                 (unsigned long long)bh->b_blocknr, bh->b_data);
572                         header = (journal_header_t *)&bh->b_data[0];
573                         header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
574                         header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
575                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
576
577                         tagp = &bh->b_data[sizeof(journal_header_t)];
578                         space_left = bh->b_size - sizeof(journal_header_t);
579                         first_tag = 1;
580                         set_buffer_jwrite(bh);
581                         set_buffer_dirty(bh);
582                         wbuf[bufs++] = bh;
583
584                         /* Record it so that we can wait for IO
585                            completion later */
586                         BUFFER_TRACE(bh, "ph3: file as descriptor");
587                         journal_file_buffer(descriptor, commit_transaction,
588                                         BJ_LogCtl);
589                 }
590
591                 /* Where is the buffer to be written? */
592
593                 err = journal_next_log_block(journal, &blocknr);
594                 /* If the block mapping failed, just abandon the buffer
595                    and repeat this loop: we'll fall into the
596                    refile-on-abort condition above. */
597                 if (err) {
598                         journal_abort(journal, err);
599                         continue;
600                 }
601
602                 /*
603                  * start_this_handle() uses t_outstanding_credits to determine
604                  * the free space in the log, but this counter is changed
605                  * by journal_next_log_block() also.
606                  */
607                 commit_transaction->t_outstanding_credits--;
608
609                 /* Bump b_count to prevent truncate from stumbling over
610                    the shadowed buffer!  @@@ This can go if we ever get
611                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
612                 atomic_inc(&jh2bh(jh)->b_count);
613
614                 /* Make a temporary IO buffer with which to write it out
615                    (this will requeue both the metadata buffer and the
616                    temporary IO buffer). new_bh goes on BJ_IO*/
617
618                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
619                 /*
620                  * akpm: journal_write_metadata_buffer() sets
621                  * new_bh->b_transaction to commit_transaction.
622                  * We need to clean this up before we release new_bh
623                  * (which is of type BJ_IO)
624                  */
625                 JBUFFER_TRACE(jh, "ph3: write metadata");
626                 flags = journal_write_metadata_buffer(commit_transaction,
627                                                       jh, &new_jh, blocknr);
628                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
629                 wbuf[bufs++] = jh2bh(new_jh);
630
631                 /* Record the new block's tag in the current descriptor
632                    buffer */
633
634                 tag_flag = 0;
635                 if (flags & 1)
636                         tag_flag |= JFS_FLAG_ESCAPE;
637                 if (!first_tag)
638                         tag_flag |= JFS_FLAG_SAME_UUID;
639
640                 tag = (journal_block_tag_t *) tagp;
641                 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
642                 tag->t_flags = cpu_to_be32(tag_flag);
643                 tagp += sizeof(journal_block_tag_t);
644                 space_left -= sizeof(journal_block_tag_t);
645
646                 if (first_tag) {
647                         memcpy (tagp, journal->j_uuid, 16);
648                         tagp += 16;
649                         space_left -= 16;
650                         first_tag = 0;
651                 }
652
653                 /* If there's no more to do, or if the descriptor is full,
654                    let the IO rip! */
655
656                 if (bufs == journal->j_wbufsize ||
657                     commit_transaction->t_buffers == NULL ||
658                     space_left < sizeof(journal_block_tag_t) + 16) {
659
660                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
661
662                         /* Write an end-of-descriptor marker before
663                            submitting the IOs.  "tag" still points to
664                            the last tag we set up. */
665
666                         tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
667
668 start_journal_io:
669                         for (i = 0; i < bufs; i++) {
670                                 struct buffer_head *bh = wbuf[i];
671                                 lock_buffer(bh);
672                                 clear_buffer_dirty(bh);
673                                 set_buffer_uptodate(bh);
674                                 bh->b_end_io = journal_end_buffer_io_sync;
675                                 submit_bh(write_op, bh);
676                         }
677                         cond_resched();
678
679                         /* Force a new descriptor to be generated next
680                            time round the loop. */
681                         descriptor = NULL;
682                         bufs = 0;
683                 }
684         }
685
686         /* Lo and behold: we have just managed to send a transaction to
687            the log.  Before we can commit it, wait for the IO so far to
688            complete.  Control buffers being written are on the
689            transaction's t_log_list queue, and metadata buffers are on
690            the t_iobuf_list queue.
691
692            Wait for the buffers in reverse order.  That way we are
693            less likely to be woken up until all IOs have completed, and
694            so we incur less scheduling load.
695         */
696
697         jbd_debug(3, "JBD: commit phase 4\n");
698
699         /*
700          * akpm: these are BJ_IO, and j_list_lock is not needed.
701          * See __journal_try_to_free_buffer.
702          */
703 wait_for_iobuf:
704         while (commit_transaction->t_iobuf_list != NULL) {
705                 struct buffer_head *bh;
706
707                 jh = commit_transaction->t_iobuf_list->b_tprev;
708                 bh = jh2bh(jh);
709                 if (buffer_locked(bh)) {
710                         wait_on_buffer(bh);
711                         goto wait_for_iobuf;
712                 }
713                 if (cond_resched())
714                         goto wait_for_iobuf;
715
716                 if (unlikely(!buffer_uptodate(bh)))
717                         err = -EIO;
718
719                 clear_buffer_jwrite(bh);
720
721                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
722                 journal_unfile_buffer(journal, jh);
723
724                 /*
725                  * ->t_iobuf_list should contain only dummy buffer_heads
726                  * which were created by journal_write_metadata_buffer().
727                  */
728                 BUFFER_TRACE(bh, "dumping temporary bh");
729                 journal_put_journal_head(jh);
730                 __brelse(bh);
731                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
732                 free_buffer_head(bh);
733
734                 /* We also have to unlock and free the corresponding
735                    shadowed buffer */
736                 jh = commit_transaction->t_shadow_list->b_tprev;
737                 bh = jh2bh(jh);
738                 clear_bit(BH_JWrite, &bh->b_state);
739                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
740
741                 /* The metadata is now released for reuse, but we need
742                    to remember it against this transaction so that when
743                    we finally commit, we can do any checkpointing
744                    required. */
745                 JBUFFER_TRACE(jh, "file as BJ_Forget");
746                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
747                 /* Wake up any transactions which were waiting for this
748                    IO to complete */
749                 wake_up_bit(&bh->b_state, BH_Unshadow);
750                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
751                 __brelse(bh);
752         }
753
754         J_ASSERT (commit_transaction->t_shadow_list == NULL);
755
756         jbd_debug(3, "JBD: commit phase 5\n");
757
758         /* Here we wait for the revoke record and descriptor record buffers */
759  wait_for_ctlbuf:
760         while (commit_transaction->t_log_list != NULL) {
761                 struct buffer_head *bh;
762
763                 jh = commit_transaction->t_log_list->b_tprev;
764                 bh = jh2bh(jh);
765                 if (buffer_locked(bh)) {
766                         wait_on_buffer(bh);
767                         goto wait_for_ctlbuf;
768                 }
769                 if (cond_resched())
770                         goto wait_for_ctlbuf;
771
772                 if (unlikely(!buffer_uptodate(bh)))
773                         err = -EIO;
774
775                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
776                 clear_buffer_jwrite(bh);
777                 journal_unfile_buffer(journal, jh);
778                 journal_put_journal_head(jh);
779                 __brelse(bh);           /* One for getblk */
780                 /* AKPM: bforget here */
781         }
782
783         if (err)
784                 journal_abort(journal, err);
785
786         jbd_debug(3, "JBD: commit phase 6\n");
787
788         if (journal_write_commit_record(journal, commit_transaction))
789                 err = -EIO;
790
791         if (err)
792                 journal_abort(journal, err);
793
794         /* End of a transaction!  Finally, we can do checkpoint
795            processing: any buffers committed as a result of this
796            transaction can be removed from any checkpoint list it was on
797            before. */
798
799         jbd_debug(3, "JBD: commit phase 7\n");
800
801         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
802         J_ASSERT(commit_transaction->t_buffers == NULL);
803         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
804         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
805         J_ASSERT(commit_transaction->t_shadow_list == NULL);
806         J_ASSERT(commit_transaction->t_log_list == NULL);
807
808 restart_loop:
809         /*
810          * As there are other places (journal_unmap_buffer()) adding buffers
811          * to this list we have to be careful and hold the j_list_lock.
812          */
813         spin_lock(&journal->j_list_lock);
814         while (commit_transaction->t_forget) {
815                 transaction_t *cp_transaction;
816                 struct buffer_head *bh;
817
818                 jh = commit_transaction->t_forget;
819                 spin_unlock(&journal->j_list_lock);
820                 bh = jh2bh(jh);
821                 jbd_lock_bh_state(bh);
822                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
823                         jh->b_transaction == journal->j_running_transaction);
824
825                 /*
826                  * If there is undo-protected committed data against
827                  * this buffer, then we can remove it now.  If it is a
828                  * buffer needing such protection, the old frozen_data
829                  * field now points to a committed version of the
830                  * buffer, so rotate that field to the new committed
831                  * data.
832                  *
833                  * Otherwise, we can just throw away the frozen data now.
834                  */
835                 if (jh->b_committed_data) {
836                         jbd_free(jh->b_committed_data, bh->b_size);
837                         jh->b_committed_data = NULL;
838                         if (jh->b_frozen_data) {
839                                 jh->b_committed_data = jh->b_frozen_data;
840                                 jh->b_frozen_data = NULL;
841                         }
842                 } else if (jh->b_frozen_data) {
843                         jbd_free(jh->b_frozen_data, bh->b_size);
844                         jh->b_frozen_data = NULL;
845                 }
846
847                 spin_lock(&journal->j_list_lock);
848                 cp_transaction = jh->b_cp_transaction;
849                 if (cp_transaction) {
850                         JBUFFER_TRACE(jh, "remove from old cp transaction");
851                         __journal_remove_checkpoint(jh);
852                 }
853
854                 /* Only re-checkpoint the buffer_head if it is marked
855                  * dirty.  If the buffer was added to the BJ_Forget list
856                  * by journal_forget, it may no longer be dirty and
857                  * there's no point in keeping a checkpoint record for
858                  * it. */
859
860                 /* A buffer which has been freed while still being
861                  * journaled by a previous transaction may end up still
862                  * being dirty here, but we want to avoid writing back
863                  * that buffer in the future now that the last use has
864                  * been committed.  That's not only a performance gain,
865                  * it also stops aliasing problems if the buffer is left
866                  * behind for writeback and gets reallocated for another
867                  * use in a different page. */
868                 if (buffer_freed(bh)) {
869                         clear_buffer_freed(bh);
870                         clear_buffer_jbddirty(bh);
871                 }
872
873                 if (buffer_jbddirty(bh)) {
874                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
875                         __journal_insert_checkpoint(jh, commit_transaction);
876                         if (is_journal_aborted(journal))
877                                 clear_buffer_jbddirty(bh);
878                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
879                         __journal_refile_buffer(jh);
880                         jbd_unlock_bh_state(bh);
881                 } else {
882                         J_ASSERT_BH(bh, !buffer_dirty(bh));
883                         /* The buffer on BJ_Forget list and not jbddirty means
884                          * it has been freed by this transaction and hence it
885                          * could not have been reallocated until this
886                          * transaction has committed. *BUT* it could be
887                          * reallocated once we have written all the data to
888                          * disk and before we process the buffer on BJ_Forget
889                          * list. */
890                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
891                         __journal_refile_buffer(jh);
892                         if (!jh->b_transaction) {
893                                 jbd_unlock_bh_state(bh);
894                                  /* needs a brelse */
895                                 journal_remove_journal_head(bh);
896                                 release_buffer_page(bh);
897                         } else
898                                 jbd_unlock_bh_state(bh);
899                 }
900                 cond_resched_lock(&journal->j_list_lock);
901         }
902         spin_unlock(&journal->j_list_lock);
903         /*
904          * This is a bit sleazy.  We use j_list_lock to protect transition
905          * of a transaction into T_FINISHED state and calling
906          * __journal_drop_transaction(). Otherwise we could race with
907          * other checkpointing code processing the transaction...
908          */
909         spin_lock(&journal->j_state_lock);
910         spin_lock(&journal->j_list_lock);
911         /*
912          * Now recheck whether any buffers got attached to the transaction
913          * while the lock was dropped; if so, go back and process them...
914          */
915         if (commit_transaction->t_forget) {
916                 spin_unlock(&journal->j_list_lock);
917                 spin_unlock(&journal->j_state_lock);
918                 goto restart_loop;
919         }
920
921         /* Done with this transaction! */
922
923         jbd_debug(3, "JBD: commit phase 8\n");
924
925         J_ASSERT(commit_transaction->t_state == T_COMMIT);
926
927         commit_transaction->t_state = T_FINISHED;
928         J_ASSERT(commit_transaction == journal->j_committing_transaction);
929         journal->j_commit_sequence = commit_transaction->t_tid;
930         journal->j_committing_transaction = NULL;
931         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
932
933         /*
934          * weight the commit time higher than the average time so we don't
935          * react too strongly to vast changes in commit time
936          */
937         if (likely(journal->j_average_commit_time))
938                 journal->j_average_commit_time = (commit_time*3 +
939                                 journal->j_average_commit_time) / 4;
940         else
941                 journal->j_average_commit_time = commit_time;
942
943         spin_unlock(&journal->j_state_lock);
944
945         if (commit_transaction->t_checkpoint_list == NULL &&
946             commit_transaction->t_checkpoint_io_list == NULL) {
947                 __journal_drop_transaction(journal, commit_transaction);
948         } else {
949                 if (journal->j_checkpoint_transactions == NULL) {
950                         journal->j_checkpoint_transactions = commit_transaction;
951                         commit_transaction->t_cpnext = commit_transaction;
952                         commit_transaction->t_cpprev = commit_transaction;
953                 } else {
954                         commit_transaction->t_cpnext =
955                                 journal->j_checkpoint_transactions;
956                         commit_transaction->t_cpprev =
957                                 commit_transaction->t_cpnext->t_cpprev;
958                         commit_transaction->t_cpnext->t_cpprev =
959                                 commit_transaction;
960                         commit_transaction->t_cpprev->t_cpnext =
961                                 commit_transaction;
962                 }
963         }
964         spin_unlock(&journal->j_list_lock);
965
966         jbd_debug(1, "JBD: commit %d complete, head %d\n",
967                   journal->j_commit_sequence, journal->j_tail_sequence);
968
969         wake_up(&journal->j_wait_done_commit);
970 }