git.oblomov.eu Git - linux-2.6/blob - fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25
  26 /*
  27  * Default IO end handler for temporary BJ_IO buffer_heads.
  28  */
  29 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  30 {
  31         BUFFER_TRACE(bh, "");
  32         if (uptodate)
  33                 set_buffer_uptodate(bh);
  34         else
  35                 clear_buffer_uptodate(bh);
  36         unlock_buffer(bh);
  37 }
  38
  39 /*
  40  * When an ext3-ordered file is truncated, it is possible that many pages are
  41  * not sucessfully freed, because they are attached to a committing transaction.
  42  * After the transaction commits, these pages are left on the LRU, with no
  43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  45  * the numbers in /proc/meminfo look odd.
  46  *
  47  * So here, we have a buffer which has just come off the forget list.  Look to
  48  * see if we can strip all buffers from the backing page.
  49  *
  50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  51  * caller provided us with a ref against the buffer, and we drop that here.
  52  */
  53 static void release_buffer_page(struct buffer_head *bh)
  54 {
  55         struct page *page;
  56
  57         if (buffer_dirty(bh))
  58                 goto nope;
  59         if (atomic_read(&bh->b_count) != 1)
  60                 goto nope;
  61         page = bh->b_page;
  62         if (!page)
  63                 goto nope;
  64         if (page->mapping)
  65                 goto nope;
  66
  67         /* OK, it's a truncated page */
  68         if (TestSetPageLocked(page))
  69                 goto nope;
  70
  71         page_cache_get(page);
  72         __brelse(bh);
  73         try_to_free_buffers(page);
  74         unlock_page(page);
  75         page_cache_release(page);
  76         return;
  77
  78 nope:
  79         __brelse(bh);
  80 }
  81
  82 /*
  83  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  84  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  85  * return 0.  j_list_lock is dropped in this case.
  86  */
  87 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  88 {
  89         if (!jbd_trylock_bh_state(bh)) {
  90                 spin_unlock(&journal->j_list_lock);
  91                 schedule();
  92                 return 0;
  93         }
  94         return 1;
  95 }
  96
  97 /*
  98  * Done it all: now submit the commit record.  We should have
  99  * cleaned up our previous buffers by now, so if we are in abort
 100  * mode we can now just skip the rest of the journal write
 101  * entirely.
 102  *
 103  * Returns 1 if the journal needs to be aborted or 0 on success
 104  */
 105 static int journal_submit_commit_record(journal_t *journal,
 106                                         transaction_t *commit_transaction,
 107                                         struct buffer_head **cbh,
 108                                         __u32 crc32_sum)
 109 {
 110         struct journal_head *descriptor;
 111         struct commit_header *tmp;
 112         struct buffer_head *bh;
 113         int ret;
 114         int barrier_done = 0;
 115
 116         if (is_journal_aborted(journal))
 117                 return 0;
 118
 119         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 120         if (!descriptor)
 121                 return 1;
 122
 123         bh = jh2bh(descriptor);
 124
 125         tmp = (struct commit_header *)bh->b_data;
 126         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 127         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 128         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 129
 130         if (JBD2_HAS_COMPAT_FEATURE(journal,
 131                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 132                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 133                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 134                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 135         }
 136
 137         JBUFFER_TRACE(descriptor, "submit commit block");
 138         lock_buffer(bh);
 139         get_bh(bh);
 140         set_buffer_dirty(bh);
 141         set_buffer_uptodate(bh);
 142         bh->b_end_io = journal_end_buffer_io_sync;
 143
 144         if (journal->j_flags & JBD2_BARRIER &&
 145                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
 146                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 147                 set_buffer_ordered(bh);
 148                 barrier_done = 1;
 149         }
 150         ret = submit_bh(WRITE, bh);
 151         if (barrier_done)
 152                 clear_buffer_ordered(bh);
 153
 154         /* is it possible for another commit to fail at roughly
 155          * the same time as this one?  If so, we don't want to
 156          * trust the barrier flag in the super, but instead want
 157          * to remember if we sent a barrier request
 158          */
 159         if (ret == -EOPNOTSUPP && barrier_done) {
 160                 char b[BDEVNAME_SIZE];
 161
 162                 printk(KERN_WARNING
 163                         "JBD: barrier-based sync failed on %s - "
 164                         "disabling barriers\n",
 165                         bdevname(journal->j_dev, b));
 166                 spin_lock(&journal->j_state_lock);
 167                 journal->j_flags &= ~JBD2_BARRIER;
 168                 spin_unlock(&journal->j_state_lock);
 169
 170                 /* And try again, without the barrier */
 171                 set_buffer_uptodate(bh);
 172                 set_buffer_dirty(bh);
 173                 ret = submit_bh(WRITE, bh);
 174         }
 175         *cbh = bh;
 176         return ret;
 177 }
 178
 179 /*
 180  * This function along with journal_submit_commit_record
 181  * allows to write the commit record asynchronously.
 182  */
 183 static int journal_wait_on_commit_record(struct buffer_head *bh)
 184 {
 185         int ret = 0;
 186
 187         clear_buffer_dirty(bh);
 188         wait_on_buffer(bh);
 189
 190         if (unlikely(!buffer_uptodate(bh)))
 191                 ret = -EIO;
 192         put_bh(bh);            /* One for getblk() */
 193         jbd2_journal_put_journal_head(bh2jh(bh));
 194
 195         return ret;
 196 }
 197
 198 /*
 199  * Wait for all submitted IO to complete.
 200  */
 201 static int journal_wait_on_locked_list(journal_t *journal,
 202                                        transaction_t *commit_transaction)
 203 {
 204         int ret = 0;
 205         struct journal_head *jh;
 206
 207         while (commit_transaction->t_locked_list) {
 208                 struct buffer_head *bh;
 209
 210                 jh = commit_transaction->t_locked_list->b_tprev;
 211                 bh = jh2bh(jh);
 212                 get_bh(bh);
 213                 if (buffer_locked(bh)) {
 214                         spin_unlock(&journal->j_list_lock);
 215                         wait_on_buffer(bh);
 216                         if (unlikely(!buffer_uptodate(bh)))
 217                                 ret = -EIO;
 218                         spin_lock(&journal->j_list_lock);
 219                 }
 220                 if (!inverted_lock(journal, bh)) {
 221                         put_bh(bh);
 222                         spin_lock(&journal->j_list_lock);
 223                         continue;
 224                 }
 225                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 226                         __jbd2_journal_unfile_buffer(jh);
 227                         jbd_unlock_bh_state(bh);
 228                         jbd2_journal_remove_journal_head(bh);
 229                         put_bh(bh);
 230                 } else {
 231                         jbd_unlock_bh_state(bh);
 232                 }
 233                 put_bh(bh);
 234                 cond_resched_lock(&journal->j_list_lock);
 235         }
 236         return ret;
 237   }
 238
 239 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 240 {
 241         int i;
 242
 243         for (i = 0; i < bufs; i++) {
 244                 wbuf[i]->b_end_io = end_buffer_write_sync;
 245                 /* We use-up our safety reference in submit_bh() */
 246                 submit_bh(WRITE, wbuf[i]);
 247         }
 248 }
 249
 250 /*
 251  *  Submit all the data buffers to disk
 252  */
 253 static void journal_submit_data_buffers(journal_t *journal,
 254                                 transaction_t *commit_transaction)
 255 {
 256         struct journal_head *jh;
 257         struct buffer_head *bh;
 258         int locked;
 259         int bufs = 0;
 260         struct buffer_head **wbuf = journal->j_wbuf;
 261
 262         /*
 263          * Whenever we unlock the journal and sleep, things can get added
 264          * onto ->t_sync_datalist, so we have to keep looping back to
 265          * write_out_data until we *know* that the list is empty.
 266          *
 267          * Cleanup any flushed data buffers from the data list.  Even in
 268          * abort mode, we want to flush this out as soon as possible.
 269          */
 270 write_out_data:
 271         cond_resched();
 272         spin_lock(&journal->j_list_lock);
 273
 274         while (commit_transaction->t_sync_datalist) {
 275                 jh = commit_transaction->t_sync_datalist;
 276                 bh = jh2bh(jh);
 277                 locked = 0;
 278
 279                 /* Get reference just to make sure buffer does not disappear
 280                  * when we are forced to drop various locks */
 281                 get_bh(bh);
 282                 /* If the buffer is dirty, we need to submit IO and hence
 283                  * we need the buffer lock. We try to lock the buffer without
 284                  * blocking. If we fail, we need to drop j_list_lock and do
 285                  * blocking lock_buffer().
 286                  */
 287                 if (buffer_dirty(bh)) {
 288                         if (test_set_buffer_locked(bh)) {
 289                                 BUFFER_TRACE(bh, "needs blocking lock");
 290                                 spin_unlock(&journal->j_list_lock);
 291                                 /* Write out all data to prevent deadlocks */
 292                                 journal_do_submit_data(wbuf, bufs);
 293                                 bufs = 0;
 294                                 lock_buffer(bh);
 295                                 spin_lock(&journal->j_list_lock);
 296                         }
 297                         locked = 1;
 298                 }
 299                 /* We have to get bh_state lock. Again out of order, sigh. */
 300                 if (!inverted_lock(journal, bh)) {
 301                         jbd_lock_bh_state(bh);
 302                         spin_lock(&journal->j_list_lock);
 303                 }
 304                 /* Someone already cleaned up the buffer? */
 305                 if (!buffer_jbd(bh)
 306                         || jh->b_transaction != commit_transaction
 307                         || jh->b_jlist != BJ_SyncData) {
 308                         jbd_unlock_bh_state(bh);
 309                         if (locked)
 310                                 unlock_buffer(bh);
 311                         BUFFER_TRACE(bh, "already cleaned up");
 312                         put_bh(bh);
 313                         continue;
 314                 }
 315                 if (locked && test_clear_buffer_dirty(bh)) {
 316                         BUFFER_TRACE(bh, "needs writeout, adding to array");
 317                         wbuf[bufs++] = bh;
 318                         __jbd2_journal_file_buffer(jh, commit_transaction,
 319                                                 BJ_Locked);
 320                         jbd_unlock_bh_state(bh);
 321                         if (bufs == journal->j_wbufsize) {
 322                                 spin_unlock(&journal->j_list_lock);
 323                                 journal_do_submit_data(wbuf, bufs);
 324                                 bufs = 0;
 325                                 goto write_out_data;
 326                         }
 327                 } else if (!locked && buffer_locked(bh)) {
 328                         __jbd2_journal_file_buffer(jh, commit_transaction,
 329                                                 BJ_Locked);
 330                         jbd_unlock_bh_state(bh);
 331                         put_bh(bh);
 332                 } else {
 333                         BUFFER_TRACE(bh, "writeout complete: unfile");
 334                         __jbd2_journal_unfile_buffer(jh);
 335                         jbd_unlock_bh_state(bh);
 336                         if (locked)
 337                                 unlock_buffer(bh);
 338                         jbd2_journal_remove_journal_head(bh);
 339                         /* Once for our safety reference, once for
 340                          * jbd2_journal_remove_journal_head() */
 341                         put_bh(bh);
 342                         put_bh(bh);
 343                 }
 344
 345                 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 346                         spin_unlock(&journal->j_list_lock);
 347                         goto write_out_data;
 348                 }
 349         }
 350         spin_unlock(&journal->j_list_lock);
 351         journal_do_submit_data(wbuf, bufs);
 352 }
 353
 354 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 355 {
 356         struct page *page = bh->b_page;
 357         char *addr;
 358         __u32 checksum;
 359
 360         addr = kmap_atomic(page, KM_USER0);
 361         checksum = crc32_be(crc32_sum,
 362                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 363         kunmap_atomic(addr, KM_USER0);
 364
 365         return checksum;
 366 }
 367
 368 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 369                                    unsigned long long block)
 370 {
 371         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 372         if (tag_bytes > JBD2_TAG_SIZE32)
 373                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 374 }
 375
 376 /*
 377  * jbd2_journal_commit_transaction
 378  *
 379  * The primary function for committing a transaction to the log.  This
 380  * function is called by the journal thread to begin a complete commit.
 381  */
 382 void jbd2_journal_commit_transaction(journal_t *journal)
 383 {
 384         struct transaction_stats_s stats;
 385         transaction_t *commit_transaction;
 386         struct journal_head *jh, *new_jh, *descriptor;
 387         struct buffer_head **wbuf = journal->j_wbuf;
 388         int bufs;
 389         int flags;
 390         int err;
 391         unsigned long long blocknr;
 392         char *tagp = NULL;
 393         journal_header_t *header;
 394         journal_block_tag_t *tag = NULL;
 395         int space_left = 0;
 396         int first_tag = 0;
 397         int tag_flag;
 398         int i;
 399         int tag_bytes = journal_tag_bytes(journal);
 400         struct buffer_head *cbh = NULL; /* For transactional checksums */
 401         __u32 crc32_sum = ~0;
 402
 403         /*
 404          * First job: lock down the current transaction and wait for
 405          * all outstanding updates to complete.
 406          */
 407
 408 #ifdef COMMIT_STATS
 409         spin_lock(&journal->j_list_lock);
 410         summarise_journal_usage(journal);
 411         spin_unlock(&journal->j_list_lock);
 412 #endif
 413
 414         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 415         if (journal->j_flags & JBD2_FLUSHED) {
 416                 jbd_debug(3, "super block updated\n");
 417                 jbd2_journal_update_superblock(journal, 1);
 418         } else {
 419                 jbd_debug(3, "superblock not updated\n");
 420         }
 421
 422         J_ASSERT(journal->j_running_transaction != NULL);
 423         J_ASSERT(journal->j_committing_transaction == NULL);
 424
 425         commit_transaction = journal->j_running_transaction;
 426         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 427
 428         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 429                         commit_transaction->t_tid);
 430
 431         spin_lock(&journal->j_state_lock);
 432         commit_transaction->t_state = T_LOCKED;
 433
 434         stats.u.run.rs_wait = commit_transaction->t_max_wait;
 435         stats.u.run.rs_locked = jiffies;
 436         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 437                                                 stats.u.run.rs_locked);
 438
 439         spin_lock(&commit_transaction->t_handle_lock);
 440         while (commit_transaction->t_updates) {
 441                 DEFINE_WAIT(wait);
 442
 443                 prepare_to_wait(&journal->j_wait_updates, &wait,
 444                                         TASK_UNINTERRUPTIBLE);
 445                 if (commit_transaction->t_updates) {
 446                         spin_unlock(&commit_transaction->t_handle_lock);
 447                         spin_unlock(&journal->j_state_lock);
 448                         schedule();
 449                         spin_lock(&journal->j_state_lock);
 450                         spin_lock(&commit_transaction->t_handle_lock);
 451                 }
 452                 finish_wait(&journal->j_wait_updates, &wait);
 453         }
 454         spin_unlock(&commit_transaction->t_handle_lock);
 455
 456         J_ASSERT (commit_transaction->t_outstanding_credits <=
 457                         journal->j_max_transaction_buffers);
 458
 459         /*
 460          * First thing we are allowed to do is to discard any remaining
 461          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 462          * that there are no such buffers: if a large filesystem
 463          * operation like a truncate needs to split itself over multiple
 464          * transactions, then it may try to do a jbd2_journal_restart() while
 465          * there are still BJ_Reserved buffers outstanding.  These must
 466          * be released cleanly from the current transaction.
 467          *
 468          * In this case, the filesystem must still reserve write access
 469          * again before modifying the buffer in the new transaction, but
 470          * we do not require it to remember exactly which old buffers it
 471          * has reserved.  This is consistent with the existing behaviour
 472          * that multiple jbd2_journal_get_write_access() calls to the same
 473          * buffer are perfectly permissible.
 474          */
 475         while (commit_transaction->t_reserved_list) {
 476                 jh = commit_transaction->t_reserved_list;
 477                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 478                 /*
 479                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 480                  * leave undo-committed data.
 481                  */
 482                 if (jh->b_committed_data) {
 483                         struct buffer_head *bh = jh2bh(jh);
 484
 485                         jbd_lock_bh_state(bh);
 486                         jbd2_free(jh->b_committed_data, bh->b_size);
 487                         jh->b_committed_data = NULL;
 488                         jbd_unlock_bh_state(bh);
 489                 }
 490                 jbd2_journal_refile_buffer(journal, jh);
 491         }
 492
 493         /*
 494          * Now try to drop any written-back buffers from the journal's
 495          * checkpoint lists.  We do this *before* commit because it potentially
 496          * frees some memory
 497          */
 498         spin_lock(&journal->j_list_lock);
 499         __jbd2_journal_clean_checkpoint_list(journal);
 500         spin_unlock(&journal->j_list_lock);
 501
 502         jbd_debug (3, "JBD: commit phase 1\n");
 503
 504         /*
 505          * Switch to a new revoke table.
 506          */
 507         jbd2_journal_switch_revoke_table(journal);
 508
 509         stats.u.run.rs_flushing = jiffies;
 510         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 511                                                stats.u.run.rs_flushing);
 512
 513         commit_transaction->t_state = T_FLUSH;
 514         journal->j_committing_transaction = commit_transaction;
 515         journal->j_running_transaction = NULL;
 516         commit_transaction->t_log_start = journal->j_head;
 517         wake_up(&journal->j_wait_transaction_locked);
 518         spin_unlock(&journal->j_state_lock);
 519
 520         jbd_debug (3, "JBD: commit phase 2\n");
 521
 522         /*
 523          * First, drop modified flag: all accesses to the buffers
 524          * will be tracked for a new transaction only -bzzz
 525          */
 526         spin_lock(&journal->j_list_lock);
 527         if (commit_transaction->t_buffers) {
 528                 new_jh = jh = commit_transaction->t_buffers->b_tnext;
 529                 do {
 530                         J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
 531                                         new_jh->b_modified == 0);
 532                         new_jh->b_modified = 0;
 533                         new_jh = new_jh->b_tnext;
 534                 } while (new_jh != jh);
 535         }
 536         spin_unlock(&journal->j_list_lock);
 537
 538         /*
 539          * Now start flushing things to disk, in the order they appear
 540          * on the transaction lists.  Data blocks go first.
 541          */
 542         err = 0;
 543         journal_submit_data_buffers(journal, commit_transaction);
 544
 545         /*
 546          * Wait for all previously submitted IO to complete if commit
 547          * record is to be written synchronously.
 548          */
 549         spin_lock(&journal->j_list_lock);
 550         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 551                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 552                 err = journal_wait_on_locked_list(journal,
 553                                                 commit_transaction);
 554
 555         spin_unlock(&journal->j_list_lock);
 556
 557         if (err)
 558                 jbd2_journal_abort(journal, err);
 559
 560         jbd2_journal_write_revoke_records(journal, commit_transaction);
 561
 562         jbd_debug(3, "JBD: commit phase 2\n");
 563
 564         /*
 565          * If we found any dirty or locked buffers, then we should have
 566          * looped back up to the write_out_data label.  If there weren't
 567          * any then journal_clean_data_list should have wiped the list
 568          * clean by now, so check that it is in fact empty.
 569          */
 570         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 571
 572         jbd_debug (3, "JBD: commit phase 3\n");
 573
 574         /*
 575          * Way to go: we have now written out all of the data for a
 576          * transaction!  Now comes the tricky part: we need to write out
 577          * metadata.  Loop over the transaction's entire buffer list:
 578          */
 579         commit_transaction->t_state = T_COMMIT;
 580
 581         stats.u.run.rs_logging = jiffies;
 582         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 583                                                  stats.u.run.rs_logging);
 584         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 585         stats.u.run.rs_blocks_logged = 0;
 586
 587         descriptor = NULL;
 588         bufs = 0;
 589         while (commit_transaction->t_buffers) {
 590
 591                 /* Find the next buffer to be journaled... */
 592
 593                 jh = commit_transaction->t_buffers;
 594
 595                 /* If we're in abort mode, we just un-journal the buffer and
 596                    release it for background writing. */
 597
 598                 if (is_journal_aborted(journal)) {
 599                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 600                         jbd2_journal_refile_buffer(journal, jh);
 601                         /* If that was the last one, we need to clean up
 602                          * any descriptor buffers which may have been
 603                          * already allocated, even if we are now
 604                          * aborting. */
 605                         if (!commit_transaction->t_buffers)
 606                                 goto start_journal_io;
 607                         continue;
 608                 }
 609
 610                 /* Make sure we have a descriptor block in which to
 611                    record the metadata buffer. */
 612
 613                 if (!descriptor) {
 614                         struct buffer_head *bh;
 615
 616                         J_ASSERT (bufs == 0);
 617
 618                         jbd_debug(4, "JBD: get descriptor\n");
 619
 620                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 621                         if (!descriptor) {
 622                                 jbd2_journal_abort(journal, -EIO);
 623                                 continue;
 624                         }
 625
 626                         bh = jh2bh(descriptor);
 627                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 628                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 629                         header = (journal_header_t *)&bh->b_data[0];
 630                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 631                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 632                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 633
 634                         tagp = &bh->b_data[sizeof(journal_header_t)];
 635                         space_left = bh->b_size - sizeof(journal_header_t);
 636                         first_tag = 1;
 637                         set_buffer_jwrite(bh);
 638                         set_buffer_dirty(bh);
 639                         wbuf[bufs++] = bh;
 640
 641                         /* Record it so that we can wait for IO
 642                            completion later */
 643                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 644                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 645                                         BJ_LogCtl);
 646                 }
 647
 648                 /* Where is the buffer to be written? */
 649
 650                 err = jbd2_journal_next_log_block(journal, &blocknr);
 651                 /* If the block mapping failed, just abandon the buffer
 652                    and repeat this loop: we'll fall into the
 653                    refile-on-abort condition above. */
 654                 if (err) {
 655                         jbd2_journal_abort(journal, err);
 656                         continue;
 657                 }
 658
 659                 /*
 660                  * start_this_handle() uses t_outstanding_credits to determine
 661                  * the free space in the log, but this counter is changed
 662                  * by jbd2_journal_next_log_block() also.
 663                  */
 664                 commit_transaction->t_outstanding_credits--;
 665
 666                 /* Bump b_count to prevent truncate from stumbling over
 667                    the shadowed buffer!  @@@ This can go if we ever get
 668                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 669                 atomic_inc(&jh2bh(jh)->b_count);
 670
 671                 /* Make a temporary IO buffer with which to write it out
 672                    (this will requeue both the metadata buffer and the
 673                    temporary IO buffer). new_bh goes on BJ_IO*/
 674
 675                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 676                 /*
 677                  * akpm: jbd2_journal_write_metadata_buffer() sets
 678                  * new_bh->b_transaction to commit_transaction.
 679                  * We need to clean this up before we release new_bh
 680                  * (which is of type BJ_IO)
 681                  */
 682                 JBUFFER_TRACE(jh, "ph3: write metadata");
 683                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 684                                                       jh, &new_jh, blocknr);
 685                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 686                 wbuf[bufs++] = jh2bh(new_jh);
 687
 688                 /* Record the new block's tag in the current descriptor
 689                    buffer */
 690
 691                 tag_flag = 0;
 692                 if (flags & 1)
 693                         tag_flag |= JBD2_FLAG_ESCAPE;
 694                 if (!first_tag)
 695                         tag_flag |= JBD2_FLAG_SAME_UUID;
 696
 697                 tag = (journal_block_tag_t *) tagp;
 698                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 699                 tag->t_flags = cpu_to_be32(tag_flag);
 700                 tagp += tag_bytes;
 701                 space_left -= tag_bytes;
 702
 703                 if (first_tag) {
 704                         memcpy (tagp, journal->j_uuid, 16);
 705                         tagp += 16;
 706                         space_left -= 16;
 707                         first_tag = 0;
 708                 }
 709
 710                 /* If there's no more to do, or if the descriptor is full,
 711                    let the IO rip! */
 712
 713                 if (bufs == journal->j_wbufsize ||
 714                     commit_transaction->t_buffers == NULL ||
 715                     space_left < tag_bytes + 16) {
 716
 717                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 718
 719                         /* Write an end-of-descriptor marker before
 720                            submitting the IOs.  "tag" still points to
 721                            the last tag we set up. */
 722
 723                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 724
 725 start_journal_io:
 726                         for (i = 0; i < bufs; i++) {
 727                                 struct buffer_head *bh = wbuf[i];
 728                                 /*
 729                                  * Compute checksum.
 730                                  */
 731                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 732                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 733                                         crc32_sum =
 734                                             jbd2_checksum_data(crc32_sum, bh);
 735                                 }
 736
 737                                 lock_buffer(bh);
 738                                 clear_buffer_dirty(bh);
 739                                 set_buffer_uptodate(bh);
 740                                 bh->b_end_io = journal_end_buffer_io_sync;
 741                                 submit_bh(WRITE, bh);
 742                         }
 743                         cond_resched();
 744                         stats.u.run.rs_blocks_logged += bufs;
 745
 746                         /* Force a new descriptor to be generated next
 747                            time round the loop. */
 748                         descriptor = NULL;
 749                         bufs = 0;
 750                 }
 751         }
 752
 753         /* Done it all: now write the commit record asynchronously. */
 754
 755         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 756                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 757                 err = journal_submit_commit_record(journal, commit_transaction,
 758                                                  &cbh, crc32_sum);
 759                 if (err)
 760                         __jbd2_journal_abort_hard(journal);
 761
 762                 spin_lock(&journal->j_list_lock);
 763                 err = journal_wait_on_locked_list(journal,
 764                                                 commit_transaction);
 765                 spin_unlock(&journal->j_list_lock);
 766                 if (err)
 767                         __jbd2_journal_abort_hard(journal);
 768         }
 769
 770         /* Lo and behold: we have just managed to send a transaction to
 771            the log.  Before we can commit it, wait for the IO so far to
 772            complete.  Control buffers being written are on the
 773            transaction's t_log_list queue, and metadata buffers are on
 774            the t_iobuf_list queue.
 775
 776            Wait for the buffers in reverse order.  That way we are
 777            less likely to be woken up until all IOs have completed, and
 778            so we incur less scheduling load.
 779         */
 780
 781         jbd_debug(3, "JBD: commit phase 4\n");
 782
 783         /*
 784          * akpm: these are BJ_IO, and j_list_lock is not needed.
 785          * See __journal_try_to_free_buffer.
 786          */
 787 wait_for_iobuf:
 788         while (commit_transaction->t_iobuf_list != NULL) {
 789                 struct buffer_head *bh;
 790
 791                 jh = commit_transaction->t_iobuf_list->b_tprev;
 792                 bh = jh2bh(jh);
 793                 if (buffer_locked(bh)) {
 794                         wait_on_buffer(bh);
 795                         goto wait_for_iobuf;
 796                 }
 797                 if (cond_resched())
 798                         goto wait_for_iobuf;
 799
 800                 if (unlikely(!buffer_uptodate(bh)))
 801                         err = -EIO;
 802
 803                 clear_buffer_jwrite(bh);
 804
 805                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 806                 jbd2_journal_unfile_buffer(journal, jh);
 807
 808                 /*
 809                  * ->t_iobuf_list should contain only dummy buffer_heads
 810                  * which were created by jbd2_journal_write_metadata_buffer().
 811                  */
 812                 BUFFER_TRACE(bh, "dumping temporary bh");
 813                 jbd2_journal_put_journal_head(jh);
 814                 __brelse(bh);
 815                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 816                 free_buffer_head(bh);
 817
 818                 /* We also have to unlock and free the corresponding
 819                    shadowed buffer */
 820                 jh = commit_transaction->t_shadow_list->b_tprev;
 821                 bh = jh2bh(jh);
 822                 clear_bit(BH_JWrite, &bh->b_state);
 823                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 824
 825                 /* The metadata is now released for reuse, but we need
 826                    to remember it against this transaction so that when
 827                    we finally commit, we can do any checkpointing
 828                    required. */
 829                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 830                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 831                 /* Wake up any transactions which were waiting for this
 832                    IO to complete */
 833                 wake_up_bit(&bh->b_state, BH_Unshadow);
 834                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 835                 __brelse(bh);
 836         }
 837
 838         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 839
 840         jbd_debug(3, "JBD: commit phase 5\n");
 841
 842         /* Here we wait for the revoke record and descriptor record buffers */
 843  wait_for_ctlbuf:
 844         while (commit_transaction->t_log_list != NULL) {
 845                 struct buffer_head *bh;
 846
 847                 jh = commit_transaction->t_log_list->b_tprev;
 848                 bh = jh2bh(jh);
 849                 if (buffer_locked(bh)) {
 850                         wait_on_buffer(bh);
 851                         goto wait_for_ctlbuf;
 852                 }
 853                 if (cond_resched())
 854                         goto wait_for_ctlbuf;
 855
 856                 if (unlikely(!buffer_uptodate(bh)))
 857                         err = -EIO;
 858
 859                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 860                 clear_buffer_jwrite(bh);
 861                 jbd2_journal_unfile_buffer(journal, jh);
 862                 jbd2_journal_put_journal_head(jh);
 863                 __brelse(bh);           /* One for getblk */
 864                 /* AKPM: bforget here */
 865         }
 866
 867         jbd_debug(3, "JBD: commit phase 6\n");
 868
 869         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 870                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 871                 err = journal_submit_commit_record(journal, commit_transaction,
 872                                                 &cbh, crc32_sum);
 873                 if (err)
 874                         __jbd2_journal_abort_hard(journal);
 875         }
 876         if (!err && !is_journal_aborted(journal))
 877                 err = journal_wait_on_commit_record(cbh);
 878
 879         if (err)
 880                 jbd2_journal_abort(journal, err);
 881
 882         /* End of a transaction!  Finally, we can do checkpoint
 883            processing: any buffers committed as a result of this
 884            transaction can be removed from any checkpoint list it was on
 885            before. */
 886
 887         jbd_debug(3, "JBD: commit phase 7\n");
 888
 889         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 890         J_ASSERT(commit_transaction->t_buffers == NULL);
 891         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 892         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 893         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 894         J_ASSERT(commit_transaction->t_log_list == NULL);
 895
 896 restart_loop:
 897         /*
 898          * As there are other places (journal_unmap_buffer()) adding buffers
 899          * to this list we have to be careful and hold the j_list_lock.
 900          */
 901         spin_lock(&journal->j_list_lock);
 902         while (commit_transaction->t_forget) {
 903                 transaction_t *cp_transaction;
 904                 struct buffer_head *bh;
 905
 906                 jh = commit_transaction->t_forget;
 907                 spin_unlock(&journal->j_list_lock);
 908                 bh = jh2bh(jh);
 909                 jbd_lock_bh_state(bh);
 910                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 911                         jh->b_transaction == journal->j_running_transaction);
 912
 913                 /*
 914                  * If there is undo-protected committed data against
 915                  * this buffer, then we can remove it now.  If it is a
 916                  * buffer needing such protection, the old frozen_data
 917                  * field now points to a committed version of the
 918                  * buffer, so rotate that field to the new committed
 919                  * data.
 920                  *
 921                  * Otherwise, we can just throw away the frozen data now.
 922                  */
 923                 if (jh->b_committed_data) {
 924                         jbd2_free(jh->b_committed_data, bh->b_size);
 925                         jh->b_committed_data = NULL;
 926                         if (jh->b_frozen_data) {
 927                                 jh->b_committed_data = jh->b_frozen_data;
 928                                 jh->b_frozen_data = NULL;
 929                         }
 930                 } else if (jh->b_frozen_data) {
 931                         jbd2_free(jh->b_frozen_data, bh->b_size);
 932                         jh->b_frozen_data = NULL;
 933                 }
 934
 935                 spin_lock(&journal->j_list_lock);
 936                 cp_transaction = jh->b_cp_transaction;
 937                 if (cp_transaction) {
 938                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 939                         cp_transaction->t_chp_stats.cs_dropped++;
 940                         __jbd2_journal_remove_checkpoint(jh);
 941                 }
 942
 943                 /* Only re-checkpoint the buffer_head if it is marked
 944                  * dirty.  If the buffer was added to the BJ_Forget list
 945                  * by jbd2_journal_forget, it may no longer be dirty and
 946                  * there's no point in keeping a checkpoint record for
 947                  * it. */
 948
 949                 /* A buffer which has been freed while still being
 950                  * journaled by a previous transaction may end up still
 951                  * being dirty here, but we want to avoid writing back
 952                  * that buffer in the future now that the last use has
 953                  * been committed.  That's not only a performance gain,
 954                  * it also stops aliasing problems if the buffer is left
 955                  * behind for writeback and gets reallocated for another
 956                  * use in a different page. */
 957                 if (buffer_freed(bh)) {
 958                         clear_buffer_freed(bh);
 959                         clear_buffer_jbddirty(bh);
 960                 }
 961
 962                 if (buffer_jbddirty(bh)) {
 963                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 964                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 965                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 966                         __jbd2_journal_refile_buffer(jh);
 967                         jbd_unlock_bh_state(bh);
 968                 } else {
 969                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 970                         /* The buffer on BJ_Forget list and not jbddirty means
 971                          * it has been freed by this transaction and hence it
 972                          * could not have been reallocated until this
 973                          * transaction has committed. *BUT* it could be
 974                          * reallocated once we have written all the data to
 975                          * disk and before we process the buffer on BJ_Forget
 976                          * list. */
 977                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 978                         __jbd2_journal_refile_buffer(jh);
 979                         if (!jh->b_transaction) {
 980                                 jbd_unlock_bh_state(bh);
 981                                  /* needs a brelse */
 982                                 jbd2_journal_remove_journal_head(bh);
 983                                 release_buffer_page(bh);
 984                         } else
 985                                 jbd_unlock_bh_state(bh);
 986                 }
 987                 cond_resched_lock(&journal->j_list_lock);
 988         }
 989         spin_unlock(&journal->j_list_lock);
 990         /*
 991          * This is a bit sleazy.  We use j_list_lock to protect transition
 992          * of a transaction into T_FINISHED state and calling
 993          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 994          * other checkpointing code processing the transaction...
 995          */
 996         spin_lock(&journal->j_state_lock);
 997         spin_lock(&journal->j_list_lock);
 998         /*
 999          * Now recheck if some buffers did not get attached to the transaction
1000          * while the lock was dropped...
1001          */
1002         if (commit_transaction->t_forget) {
1003                 spin_unlock(&journal->j_list_lock);
1004                 spin_unlock(&journal->j_state_lock);
1005                 goto restart_loop;
1006         }
1007
1008         /* Done with this transaction! */
1009
1010         jbd_debug(3, "JBD: commit phase 8\n");
1011
1012         J_ASSERT(commit_transaction->t_state == T_COMMIT);
1013
1014         commit_transaction->t_start = jiffies;
1015         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1016                                                 commit_transaction->t_start);
1017
1018         /*
1019          * File the transaction for history
1020          */
1021         stats.ts_type = JBD2_STATS_RUN;
1022         stats.ts_tid = commit_transaction->t_tid;
1023         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1024         spin_lock(&journal->j_history_lock);
1025         memcpy(journal->j_history + journal->j_history_cur, &stats,
1026                         sizeof(stats));
1027         if (++journal->j_history_cur == journal->j_history_max)
1028                 journal->j_history_cur = 0;
1029
1030         /*
1031          * Calculate overall stats
1032          */
1033         journal->j_stats.ts_tid++;
1034         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1035         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1036         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1037         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1038         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1039         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1040         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1041         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1042         spin_unlock(&journal->j_history_lock);
1043
1044         commit_transaction->t_state = T_FINISHED;
1045         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1046         journal->j_commit_sequence = commit_transaction->t_tid;
1047         journal->j_committing_transaction = NULL;
1048         spin_unlock(&journal->j_state_lock);
1049
1050         if (commit_transaction->t_checkpoint_list == NULL &&
1051             commit_transaction->t_checkpoint_io_list == NULL) {
1052                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1053         } else {
1054                 if (journal->j_checkpoint_transactions == NULL) {
1055                         journal->j_checkpoint_transactions = commit_transaction;
1056                         commit_transaction->t_cpnext = commit_transaction;
1057                         commit_transaction->t_cpprev = commit_transaction;
1058                 } else {
1059                         commit_transaction->t_cpnext =
1060                                 journal->j_checkpoint_transactions;
1061                         commit_transaction->t_cpprev =
1062                                 commit_transaction->t_cpnext->t_cpprev;
1063                         commit_transaction->t_cpnext->t_cpprev =
1064                                 commit_transaction;
1065                         commit_transaction->t_cpprev->t_cpnext =
1066                                 commit_transaction;
1067                 }
1068         }
1069         spin_unlock(&journal->j_list_lock);
1070
1071         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1072                   journal->j_commit_sequence, journal->j_tail_sequence);
1073
1074         wake_up(&journal->j_wait_done_commit);
1075 }