Merge master.kernel.org:/home/rmk/linux-2.6-arm
[linux-2.6] / fs / reiserfs / fix_node.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /**
6  ** old_item_num
7  ** old_entry_num
8  ** set_entry_sizes
9  ** create_virtual_node
10  ** check_left
11  ** check_right
12  ** directory_part_size
13  ** get_num_ver
14  ** set_parameters
15  ** is_leaf_removable
16  ** are_leaves_removable
17  ** get_empty_nodes
18  ** get_lfree
19  ** get_rfree
20  ** is_left_neighbor_in_cache
21  ** decrement_key
22  ** get_far_parent
23  ** get_parents
24  ** can_node_be_removed
25  ** ip_check_balance
26  ** dc_check_balance_internal
27  ** dc_check_balance_leaf
28  ** dc_check_balance
29  ** check_balance
30  ** get_direct_parent
31  ** get_neighbors
32  ** fix_nodes
33  ** 
34  ** 
35  **/
36
37 #include <linux/config.h>
38 #include <linux/time.h>
39 #include <linux/string.h>
40 #include <linux/reiserfs_fs.h>
41 #include <linux/buffer_head.h>
42
43 /* To make any changes in the tree we find a node, that contains item
44    to be changed/deleted or position in the node we insert a new item
45    to. We call this node S. To do balancing we need to decide what we
46    will shift to left/right neighbor, or to a new node, where new item
47    will be etc. To make this analysis simpler we build virtual
48    node. Virtual node is an array of items, that will replace items of
49    node S. (For instance if we are going to delete an item, virtual
50    node does not contain it). Virtual node keeps information about
51    item sizes and types, mergeability of first and last items, sizes
52    of all entries in directory item. We use this array of items when
53    calculating what we can shift to neighbors and how many nodes we
54    have to have if we do not any shiftings, if we shift to left/right
55    neighbor or to both. */
56
57 /* taking item number in virtual node, returns number of item, that it has in source buffer */
58 static inline int old_item_num(int new_num, int affected_item_num, int mode)
59 {
60         if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
61                 return new_num;
62
63         if (mode == M_INSERT) {
64
65                 RFALSE(new_num == 0,
66                        "vs-8005: for INSERT mode and item number of inserted item");
67
68                 return new_num - 1;
69         }
70
71         RFALSE(mode != M_DELETE,
72                "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
73                mode);
74         /* delete mode */
75         return new_num + 1;
76 }
77
78 static void create_virtual_node(struct tree_balance *tb, int h)
79 {
80         struct item_head *ih;
81         struct virtual_node *vn = tb->tb_vn;
82         int new_num;
83         struct buffer_head *Sh; /* this comes from tb->S[h] */
84
85         Sh = PATH_H_PBUFFER(tb->tb_path, h);
86
87         /* size of changed node */
88         vn->vn_size =
89             MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
90
91         /* for internal nodes array if virtual items is not created */
92         if (h) {
93                 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
94                 return;
95         }
96
97         /* number of items in virtual node  */
98         vn->vn_nr_item =
99             B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
100             ((vn->vn_mode == M_DELETE) ? 1 : 0);
101
102         /* first virtual item */
103         vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
104         memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
105         vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
106
107         /* first item in the node */
108         ih = B_N_PITEM_HEAD(Sh, 0);
109
110         /* define the mergeability for 0-th item (if it is not being deleted) */
111         if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
112             && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
113                 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
114
115         /* go through all items those remain in the virtual node (except for the new (inserted) one) */
116         for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
117                 int j;
118                 struct virtual_item *vi = vn->vn_vi + new_num;
119                 int is_affected =
120                     ((new_num != vn->vn_affected_item_num) ? 0 : 1);
121
122                 if (is_affected && vn->vn_mode == M_INSERT)
123                         continue;
124
125                 /* get item number in source node */
126                 j = old_item_num(new_num, vn->vn_affected_item_num,
127                                  vn->vn_mode);
128
129                 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
130                 vi->vi_ih = ih + j;
131                 vi->vi_item = B_I_PITEM(Sh, ih + j);
132                 vi->vi_uarea = vn->vn_free_ptr;
133
134                 // FIXME: there is no check, that item operation did not
135                 // consume too much memory
136                 vn->vn_free_ptr +=
137                     op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
138                 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
139                         reiserfs_panic(tb->tb_sb,
140                                        "vs-8030: create_virtual_node: "
141                                        "virtual node space consumed");
142
143                 if (!is_affected)
144                         /* this is not being changed */
145                         continue;
146
147                 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
148                         vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
149                         vi->vi_new_data = vn->vn_data;  // pointer to data which is going to be pasted
150                 }
151         }
152
153         /* virtual inserted item is not defined yet */
154         if (vn->vn_mode == M_INSERT) {
155                 struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
156
157                 RFALSE(vn->vn_ins_ih == 0,
158                        "vs-8040: item header of inserted item is not specified");
159                 vi->vi_item_len = tb->insert_size[0];
160                 vi->vi_ih = vn->vn_ins_ih;
161                 vi->vi_item = vn->vn_data;
162                 vi->vi_uarea = vn->vn_free_ptr;
163
164                 op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
165                              tb->insert_size[0]);
166         }
167
168         /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
169         if (tb->CFR[0]) {
170                 struct reiserfs_key *key;
171
172                 key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
173                 if (op_is_left_mergeable(key, Sh->b_size)
174                     && (vn->vn_mode != M_DELETE
175                         || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
176                         vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
177                             VI_TYPE_RIGHT_MERGEABLE;
178
179 #ifdef CONFIG_REISERFS_CHECK
180                 if (op_is_left_mergeable(key, Sh->b_size) &&
181                     !(vn->vn_mode != M_DELETE
182                       || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
183                         /* we delete last item and it could be merged with right neighbor's first item */
184                         if (!
185                             (B_NR_ITEMS(Sh) == 1
186                              && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
187                              && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
188                                 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
189                                 print_block(Sh, 0, -1, -1);
190                                 reiserfs_panic(tb->tb_sb,
191                                                "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
192                                                key, vn->vn_affected_item_num,
193                                                vn->vn_mode, M_DELETE);
194                         }
195                 }
196 #endif
197
198         }
199 }
200
201 /* using virtual node check, how many items can be shifted to left
202    neighbor */
203 static void check_left(struct tree_balance *tb, int h, int cur_free)
204 {
205         int i;
206         struct virtual_node *vn = tb->tb_vn;
207         struct virtual_item *vi;
208         int d_size, ih_size;
209
210         RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
211
212         /* internal level */
213         if (h > 0) {
214                 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
215                 return;
216         }
217
218         /* leaf level */
219
220         if (!cur_free || !vn->vn_nr_item) {
221                 /* no free space or nothing to move */
222                 tb->lnum[h] = 0;
223                 tb->lbytes = -1;
224                 return;
225         }
226
227         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
228                "vs-8055: parent does not exist or invalid");
229
230         vi = vn->vn_vi;
231         if ((unsigned int)cur_free >=
232             (vn->vn_size -
233              ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
234                 /* all contents of S[0] fits into L[0] */
235
236                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
237                        "vs-8055: invalid mode or balance condition failed");
238
239                 tb->lnum[0] = vn->vn_nr_item;
240                 tb->lbytes = -1;
241                 return;
242         }
243
244         d_size = 0, ih_size = IH_SIZE;
245
246         /* first item may be merge with last item in left neighbor */
247         if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
248                 d_size = -((int)IH_SIZE), ih_size = 0;
249
250         tb->lnum[0] = 0;
251         for (i = 0; i < vn->vn_nr_item;
252              i++, ih_size = IH_SIZE, d_size = 0, vi++) {
253                 d_size += vi->vi_item_len;
254                 if (cur_free >= d_size) {
255                         /* the item can be shifted entirely */
256                         cur_free -= d_size;
257                         tb->lnum[0]++;
258                         continue;
259                 }
260
261                 /* the item cannot be shifted entirely, try to split it */
262                 /* check whether L[0] can hold ih and at least one byte of the item body */
263                 if (cur_free <= ih_size) {
264                         /* cannot shift even a part of the current item */
265                         tb->lbytes = -1;
266                         return;
267                 }
268                 cur_free -= ih_size;
269
270                 tb->lbytes = op_check_left(vi, cur_free, 0, 0);
271                 if (tb->lbytes != -1)
272                         /* count partially shifted item */
273                         tb->lnum[0]++;
274
275                 break;
276         }
277
278         return;
279 }
280
281 /* using virtual node check, how many items can be shifted to right
282    neighbor */
283 static void check_right(struct tree_balance *tb, int h, int cur_free)
284 {
285         int i;
286         struct virtual_node *vn = tb->tb_vn;
287         struct virtual_item *vi;
288         int d_size, ih_size;
289
290         RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
291
292         /* internal level */
293         if (h > 0) {
294                 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
295                 return;
296         }
297
298         /* leaf level */
299
300         if (!cur_free || !vn->vn_nr_item) {
301                 /* no free space  */
302                 tb->rnum[h] = 0;
303                 tb->rbytes = -1;
304                 return;
305         }
306
307         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
308                "vs-8075: parent does not exist or invalid");
309
310         vi = vn->vn_vi + vn->vn_nr_item - 1;
311         if ((unsigned int)cur_free >=
312             (vn->vn_size -
313              ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
314                 /* all contents of S[0] fits into R[0] */
315
316                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
317                        "vs-8080: invalid mode or balance condition failed");
318
319                 tb->rnum[h] = vn->vn_nr_item;
320                 tb->rbytes = -1;
321                 return;
322         }
323
324         d_size = 0, ih_size = IH_SIZE;
325
326         /* last item may be merge with first item in right neighbor */
327         if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
328                 d_size = -(int)IH_SIZE, ih_size = 0;
329
330         tb->rnum[0] = 0;
331         for (i = vn->vn_nr_item - 1; i >= 0;
332              i--, d_size = 0, ih_size = IH_SIZE, vi--) {
333                 d_size += vi->vi_item_len;
334                 if (cur_free >= d_size) {
335                         /* the item can be shifted entirely */
336                         cur_free -= d_size;
337                         tb->rnum[0]++;
338                         continue;
339                 }
340
341                 /* check whether R[0] can hold ih and at least one byte of the item body */
342                 if (cur_free <= ih_size) {      /* cannot shift even a part of the current item */
343                         tb->rbytes = -1;
344                         return;
345                 }
346
347                 /* R[0] can hold the header of the item and at least one byte of its body */
348                 cur_free -= ih_size;    /* cur_free is still > 0 */
349
350                 tb->rbytes = op_check_right(vi, cur_free);
351                 if (tb->rbytes != -1)
352                         /* count partially shifted item */
353                         tb->rnum[0]++;
354
355                 break;
356         }
357
358         return;
359 }
360
361 /*
362  * from - number of items, which are shifted to left neighbor entirely
363  * to - number of item, which are shifted to right neighbor entirely
364  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
365  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
366 static int get_num_ver(int mode, struct tree_balance *tb, int h,
367                        int from, int from_bytes,
368                        int to, int to_bytes, short *snum012, int flow)
369 {
370         int i;
371         int cur_free;
372         //    int bytes;
373         int units;
374         struct virtual_node *vn = tb->tb_vn;
375         //    struct virtual_item * vi;
376
377         int total_node_size, max_node_size, current_item_size;
378         int needed_nodes;
379         int start_item,         /* position of item we start filling node from */
380          end_item,              /* position of item we finish filling node by */
381          start_bytes,           /* number of first bytes (entries for directory) of start_item-th item 
382                                    we do not include into node that is being filled */
383          end_bytes;             /* number of last bytes (entries for directory) of end_item-th item 
384                                    we do node include into node that is being filled */
385         int split_item_positions[2];    /* these are positions in virtual item of
386                                            items, that are split between S[0] and
387                                            S1new and S1new and S2new */
388
389         split_item_positions[0] = -1;
390         split_item_positions[1] = -1;
391
392         /* We only create additional nodes if we are in insert or paste mode
393            or we are in replace mode at the internal level. If h is 0 and
394            the mode is M_REPLACE then in fix_nodes we change the mode to
395            paste or insert before we get here in the code.  */
396         RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
397                "vs-8100: insert_size < 0 in overflow");
398
399         max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
400
401         /* snum012 [0-2] - number of items, that lay
402            to S[0], first new node and second new node */
403         snum012[3] = -1;        /* s1bytes */
404         snum012[4] = -1;        /* s2bytes */
405
406         /* internal level */
407         if (h > 0) {
408                 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
409                 if (i == max_node_size)
410                         return 1;
411                 return (i / max_node_size + 1);
412         }
413
414         /* leaf level */
415         needed_nodes = 1;
416         total_node_size = 0;
417         cur_free = max_node_size;
418
419         // start from 'from'-th item
420         start_item = from;
421         // skip its first 'start_bytes' units
422         start_bytes = ((from_bytes != -1) ? from_bytes : 0);
423
424         // last included item is the 'end_item'-th one
425         end_item = vn->vn_nr_item - to - 1;
426         // do not count last 'end_bytes' units of 'end_item'-th item
427         end_bytes = (to_bytes != -1) ? to_bytes : 0;
428
429         /* go through all item beginning from the start_item-th item and ending by
430            the end_item-th item. Do not count first 'start_bytes' units of
431            'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
432
433         for (i = start_item; i <= end_item; i++) {
434                 struct virtual_item *vi = vn->vn_vi + i;
435                 int skip_from_end = ((i == end_item) ? end_bytes : 0);
436
437                 RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
438
439                 /* get size of current item */
440                 current_item_size = vi->vi_item_len;
441
442                 /* do not take in calculation head part (from_bytes) of from-th item */
443                 current_item_size -=
444                     op_part_size(vi, 0 /*from start */ , start_bytes);
445
446                 /* do not take in calculation tail part of last item */
447                 current_item_size -=
448                     op_part_size(vi, 1 /*from end */ , skip_from_end);
449
450                 /* if item fits into current node entierly */
451                 if (total_node_size + current_item_size <= max_node_size) {
452                         snum012[needed_nodes - 1]++;
453                         total_node_size += current_item_size;
454                         start_bytes = 0;
455                         continue;
456                 }
457
458                 if (current_item_size > max_node_size) {
459                         /* virtual item length is longer, than max size of item in
460                            a node. It is impossible for direct item */
461                         RFALSE(is_direct_le_ih(vi->vi_ih),
462                                "vs-8110: "
463                                "direct item length is %d. It can not be longer than %d",
464                                current_item_size, max_node_size);
465                         /* we will try to split it */
466                         flow = 1;
467                 }
468
469                 if (!flow) {
470                         /* as we do not split items, take new node and continue */
471                         needed_nodes++;
472                         i--;
473                         total_node_size = 0;
474                         continue;
475                 }
476                 // calculate number of item units which fit into node being
477                 // filled
478                 {
479                         int free_space;
480
481                         free_space = max_node_size - total_node_size - IH_SIZE;
482                         units =
483                             op_check_left(vi, free_space, start_bytes,
484                                           skip_from_end);
485                         if (units == -1) {
486                                 /* nothing fits into current node, take new node and continue */
487                                 needed_nodes++, i--, total_node_size = 0;
488                                 continue;
489                         }
490                 }
491
492                 /* something fits into the current node */
493                 //if (snum012[3] != -1 || needed_nodes != 1)
494                 //  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
495                 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
496                 start_bytes += units;
497                 snum012[needed_nodes - 1 + 3] = units;
498
499                 if (needed_nodes > 2)
500                         reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: "
501                                          "split_item_position is out of boundary");
502                 snum012[needed_nodes - 1]++;
503                 split_item_positions[needed_nodes - 1] = i;
504                 needed_nodes++;
505                 /* continue from the same item with start_bytes != -1 */
506                 start_item = i;
507                 i--;
508                 total_node_size = 0;
509         }
510
511         // sum012[4] (if it is not -1) contains number of units of which
512         // are to be in S1new, snum012[3] - to be in S0. They are supposed
513         // to be S1bytes and S2bytes correspondingly, so recalculate
514         if (snum012[4] > 0) {
515                 int split_item_num;
516                 int bytes_to_r, bytes_to_l;
517                 int bytes_to_S1new;
518
519                 split_item_num = split_item_positions[1];
520                 bytes_to_l =
521                     ((from == split_item_num
522                       && from_bytes != -1) ? from_bytes : 0);
523                 bytes_to_r =
524                     ((end_item == split_item_num
525                       && end_bytes != -1) ? end_bytes : 0);
526                 bytes_to_S1new =
527                     ((split_item_positions[0] ==
528                       split_item_positions[1]) ? snum012[3] : 0);
529
530                 // s2bytes
531                 snum012[4] =
532                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
533                     bytes_to_r - bytes_to_l - bytes_to_S1new;
534
535                 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
536                     vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
537                         reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not "
538                                          "directory or indirect item");
539         }
540
541         /* now we know S2bytes, calculate S1bytes */
542         if (snum012[3] > 0) {
543                 int split_item_num;
544                 int bytes_to_r, bytes_to_l;
545                 int bytes_to_S2new;
546
547                 split_item_num = split_item_positions[0];
548                 bytes_to_l =
549                     ((from == split_item_num
550                       && from_bytes != -1) ? from_bytes : 0);
551                 bytes_to_r =
552                     ((end_item == split_item_num
553                       && end_bytes != -1) ? end_bytes : 0);
554                 bytes_to_S2new =
555                     ((split_item_positions[0] == split_item_positions[1]
556                       && snum012[4] != -1) ? snum012[4] : 0);
557
558                 // s1bytes
559                 snum012[3] =
560                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
561                     bytes_to_r - bytes_to_l - bytes_to_S2new;
562         }
563
564         return needed_nodes;
565 }
566
567 #ifdef CONFIG_REISERFS_CHECK
568 extern struct tree_balance *cur_tb;
569 #endif
570
571 /* Set parameters for balancing.
572  * Performs write of results of analysis of balancing into structure tb,
573  * where it will later be used by the functions that actually do the balancing. 
574  * Parameters:
575  *      tb      tree_balance structure;
576  *      h       current level of the node;
577  *      lnum    number of items from S[h] that must be shifted to L[h];
578  *      rnum    number of items from S[h] that must be shifted to R[h];
579  *      blk_num number of blocks that S[h] will be splitted into;
580  *      s012    number of items that fall into splitted nodes.
581  *      lbytes  number of bytes which flow to the left neighbor from the item that is not
582  *              not shifted entirely
583  *      rbytes  number of bytes which flow to the right neighbor from the item that is not
584  *              not shifted entirely
585  *      s1bytes number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
586  */
587
588 static void set_parameters(struct tree_balance *tb, int h, int lnum,
589                            int rnum, int blk_num, short *s012, int lb, int rb)
590 {
591
592         tb->lnum[h] = lnum;
593         tb->rnum[h] = rnum;
594         tb->blknum[h] = blk_num;
595
596         if (h == 0) {           /* only for leaf level */
597                 if (s012 != NULL) {
598                         tb->s0num = *s012++,
599                             tb->s1num = *s012++, tb->s2num = *s012++;
600                         tb->s1bytes = *s012++;
601                         tb->s2bytes = *s012;
602                 }
603                 tb->lbytes = lb;
604                 tb->rbytes = rb;
605         }
606         PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
607         PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
608
609         PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
610         PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
611 }
612
613 /* check, does node disappear if we shift tb->lnum[0] items to left
614    neighbor and tb->rnum[0] to the right one. */
615 static int is_leaf_removable(struct tree_balance *tb)
616 {
617         struct virtual_node *vn = tb->tb_vn;
618         int to_left, to_right;
619         int size;
620         int remain_items;
621
622         /* number of items, that will be shifted to left (right) neighbor
623            entirely */
624         to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
625         to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
626         remain_items = vn->vn_nr_item;
627
628         /* how many items remain in S[0] after shiftings to neighbors */
629         remain_items -= (to_left + to_right);
630
631         if (remain_items < 1) {
632                 /* all content of node can be shifted to neighbors */
633                 set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
634                                NULL, -1, -1);
635                 return 1;
636         }
637
638         if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
639                 /* S[0] is not removable */
640                 return 0;
641
642         /* check, whether we can divide 1 remaining item between neighbors */
643
644         /* get size of remaining item (in item units) */
645         size = op_unit_num(&(vn->vn_vi[to_left]));
646
647         if (tb->lbytes + tb->rbytes >= size) {
648                 set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
649                                tb->lbytes, -1);
650                 return 1;
651         }
652
653         return 0;
654 }
655
656 /* check whether L, S, R can be joined in one node */
657 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
658 {
659         struct virtual_node *vn = tb->tb_vn;
660         int ih_size;
661         struct buffer_head *S0;
662
663         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
664
665         ih_size = 0;
666         if (vn->vn_nr_item) {
667                 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
668                         ih_size += IH_SIZE;
669
670                 if (vn->vn_vi[vn->vn_nr_item - 1].
671                     vi_type & VI_TYPE_RIGHT_MERGEABLE)
672                         ih_size += IH_SIZE;
673         } else {
674                 /* there was only one item and it will be deleted */
675                 struct item_head *ih;
676
677                 RFALSE(B_NR_ITEMS(S0) != 1,
678                        "vs-8125: item number must be 1: it is %d",
679                        B_NR_ITEMS(S0));
680
681                 ih = B_N_PITEM_HEAD(S0, 0);
682                 if (tb->CFR[0]
683                     && !comp_short_le_keys(&(ih->ih_key),
684                                            B_N_PDELIM_KEY(tb->CFR[0],
685                                                           tb->rkey[0])))
686                         if (is_direntry_le_ih(ih)) {
687                                 /* Directory must be in correct state here: that is
688                                    somewhere at the left side should exist first directory
689                                    item. But the item being deleted can not be that first
690                                    one because its right neighbor is item of the same
691                                    directory. (But first item always gets deleted in last
692                                    turn). So, neighbors of deleted item can be merged, so
693                                    we can save ih_size */
694                                 ih_size = IH_SIZE;
695
696                                 /* we might check that left neighbor exists and is of the
697                                    same directory */
698                                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
699                                        "vs-8130: first directory item can not be removed until directory is not empty");
700                         }
701
702         }
703
704         if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
705                 set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
706                 PROC_INFO_INC(tb->tb_sb, leaves_removable);
707                 return 1;
708         }
709         return 0;
710
711 }
712
713 /* when we do not split item, lnum and rnum are numbers of entire items */
714 #define SET_PAR_SHIFT_LEFT \
715 if (h)\
716 {\
717    int to_l;\
718    \
719    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
720               (MAX_NR_KEY(Sh) + 1 - lpar);\
721               \
722               set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
723 }\
724 else \
725 {\
726    if (lset==LEFT_SHIFT_FLOW)\
727      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
728                      tb->lbytes, -1);\
729    else\
730      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
731                      -1, -1);\
732 }
733
734 #define SET_PAR_SHIFT_RIGHT \
735 if (h)\
736 {\
737    int to_r;\
738    \
739    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
740    \
741    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
742 }\
743 else \
744 {\
745    if (rset==RIGHT_SHIFT_FLOW)\
746      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
747                   -1, tb->rbytes);\
748    else\
749      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
750                   -1, -1);\
751 }
752
753 static void free_buffers_in_tb(struct tree_balance *p_s_tb)
754 {
755         int n_counter;
756
757         decrement_counters_in_path(p_s_tb->tb_path);
758
759         for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) {
760                 decrement_bcount(p_s_tb->L[n_counter]);
761                 p_s_tb->L[n_counter] = NULL;
762                 decrement_bcount(p_s_tb->R[n_counter]);
763                 p_s_tb->R[n_counter] = NULL;
764                 decrement_bcount(p_s_tb->FL[n_counter]);
765                 p_s_tb->FL[n_counter] = NULL;
766                 decrement_bcount(p_s_tb->FR[n_counter]);
767                 p_s_tb->FR[n_counter] = NULL;
768                 decrement_bcount(p_s_tb->CFL[n_counter]);
769                 p_s_tb->CFL[n_counter] = NULL;
770                 decrement_bcount(p_s_tb->CFR[n_counter]);
771                 p_s_tb->CFR[n_counter] = NULL;
772         }
773 }
774
775 /* Get new buffers for storing new nodes that are created while balancing.
776  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
777  *              CARRY_ON - schedule didn't occur while the function worked;
778  *              NO_DISK_SPACE - no disk space.
779  */
780 /* The function is NOT SCHEDULE-SAFE! */
781 static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
782 {
783         struct buffer_head *p_s_new_bh,
784             *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h);
785         b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
786         int n_counter, n_number_of_freeblk, n_amount_needed,    /* number of needed empty blocks */
787          n_retval = CARRY_ON;
788         struct super_block *p_s_sb = p_s_tb->tb_sb;
789
790         /* number_of_freeblk is the number of empty blocks which have been
791            acquired for use by the balancing algorithm minus the number of
792            empty blocks used in the previous levels of the analysis,
793            number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
794            after empty blocks are acquired, and the balancing analysis is
795            then restarted, amount_needed is the number needed by this level
796            (n_h) of the balancing analysis.
797
798            Note that for systems with many processes writing, it would be
799            more layout optimal to calculate the total number needed by all
800            levels and then to run reiserfs_new_blocks to get all of them at once.  */
801
802         /* Initiate number_of_freeblk to the amount acquired prior to the restart of
803            the analysis or 0 if not restarted, then subtract the amount needed
804            by all of the levels of the tree below n_h. */
805         /* blknum includes S[n_h], so we subtract 1 in this calculation */
806         for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum;
807              n_counter < n_h; n_counter++)
808                 n_number_of_freeblk -=
809                     (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] -
810                                                    1) : 0;
811
812         /* Allocate missing empty blocks. */
813         /* if p_s_Sh == 0  then we are getting a new root */
814         n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1;
815         /*  Amount_needed = the amount that we need more than the amount that we have. */
816         if (n_amount_needed > n_number_of_freeblk)
817                 n_amount_needed -= n_number_of_freeblk;
818         else                    /* If we have enough already then there is nothing to do. */
819                 return CARRY_ON;
820
821         /* No need to check quota - is not allocated for blocks used for formatted nodes */
822         if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs,
823                                        n_amount_needed) == NO_DISK_SPACE)
824                 return NO_DISK_SPACE;
825
826         /* for each blocknumber we just got, get a buffer and stick it on FEB */
827         for (p_n_blocknr = a_n_blocknrs, n_counter = 0;
828              n_counter < n_amount_needed; p_n_blocknr++, n_counter++) {
829
830                 RFALSE(!*p_n_blocknr,
831                        "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
832
833                 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
834                 RFALSE(buffer_dirty(p_s_new_bh) ||
835                        buffer_journaled(p_s_new_bh) ||
836                        buffer_journal_dirty(p_s_new_bh),
837                        "PAP-8140: journlaled or dirty buffer %b for the new block",
838                        p_s_new_bh);
839
840                 /* Put empty buffers into the array. */
841                 RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum],
842                        "PAP-8141: busy slot for new buffer");
843
844                 set_buffer_journal_new(p_s_new_bh);
845                 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
846         }
847
848         if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb))
849                 n_retval = REPEAT_SEARCH;
850
851         return n_retval;
852 }
853
854 /* Get free space of the left neighbor, which is stored in the parent
855  * node of the left neighbor.  */
856 static int get_lfree(struct tree_balance *tb, int h)
857 {
858         struct buffer_head *l, *f;
859         int order;
860
861         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
862                 return 0;
863
864         if (f == l)
865                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
866         else {
867                 order = B_NR_ITEMS(l);
868                 f = l;
869         }
870
871         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
872 }
873
874 /* Get free space of the right neighbor,
875  * which is stored in the parent node of the right neighbor.
876  */
877 static int get_rfree(struct tree_balance *tb, int h)
878 {
879         struct buffer_head *r, *f;
880         int order;
881
882         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
883                 return 0;
884
885         if (f == r)
886                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
887         else {
888                 order = 0;
889                 f = r;
890         }
891
892         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
893
894 }
895
896 /* Check whether left neighbor is in memory. */
897 static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
898 {
899         struct buffer_head *p_s_father, *left;
900         struct super_block *p_s_sb = p_s_tb->tb_sb;
901         b_blocknr_t n_left_neighbor_blocknr;
902         int n_left_neighbor_position;
903
904         if (!p_s_tb->FL[n_h])   /* Father of the left neighbor does not exist. */
905                 return 0;
906
907         /* Calculate father of the node to be balanced. */
908         p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
909
910         RFALSE(!p_s_father ||
911                !B_IS_IN_TREE(p_s_father) ||
912                !B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
913                !buffer_uptodate(p_s_father) ||
914                !buffer_uptodate(p_s_tb->FL[n_h]),
915                "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
916                p_s_father, p_s_tb->FL[n_h]);
917
918         /* Get position of the pointer to the left neighbor into the left father. */
919         n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ?
920             p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]);
921         /* Get left neighbor block number. */
922         n_left_neighbor_blocknr =
923             B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
924         /* Look for the left neighbor in the cache. */
925         if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) {
926
927                 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
928                        "vs-8170: left neighbor (%b %z) is not in the tree",
929                        left, left);
930                 put_bh(left);
931                 return 1;
932         }
933
934         return 0;
935 }
936
937 #define LEFT_PARENTS  'l'
938 #define RIGHT_PARENTS 'r'
939
940 static void decrement_key(struct cpu_key *p_s_key)
941 {
942         // call item specific function for this key
943         item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key);
944 }
945
946 /* Calculate far left/right parent of the left/right neighbor of the current node, that
947  * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
948  * Calculate left/right common parent of the current node and L[h]/R[h].
949  * Calculate left/right delimiting key position.
950  * Returns:     PATH_INCORRECT   - path in the tree is not correct;
951                 SCHEDULE_OCCURRED - schedule occurred while the function worked;
952  *              CARRY_ON         - schedule didn't occur while the function worked;
953  */
954 static int get_far_parent(struct tree_balance *p_s_tb,
955                           int n_h,
956                           struct buffer_head **pp_s_father,
957                           struct buffer_head **pp_s_com_father, char c_lr_par)
958 {
959         struct buffer_head *p_s_parent;
960         INITIALIZE_PATH(s_path_to_neighbor_father);
961         struct path *p_s_path = p_s_tb->tb_path;
962         struct cpu_key s_lr_father_key;
963         int n_counter,
964             n_position = INT_MAX,
965             n_first_last_position = 0,
966             n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
967
968         /* Starting from F[n_h] go upwards in the tree, and look for the common
969            ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
970
971         n_counter = n_path_offset;
972
973         RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET,
974                "PAP-8180: invalid path length");
975
976         for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) {
977                 /* Check whether parent of the current buffer in the path is really parent in the tree. */
978                 if (!B_IS_IN_TREE
979                     (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)))
980                         return REPEAT_SEARCH;
981                 /* Check whether position in the parent is correct. */
982                 if ((n_position =
983                      PATH_OFFSET_POSITION(p_s_path,
984                                           n_counter - 1)) >
985                     B_NR_ITEMS(p_s_parent))
986                         return REPEAT_SEARCH;
987                 /* Check whether parent at the path really points to the child. */
988                 if (B_N_CHILD_NUM(p_s_parent, n_position) !=
989                     PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr)
990                         return REPEAT_SEARCH;
991                 /* Return delimiting key if position in the parent is not equal to first/last one. */
992                 if (c_lr_par == RIGHT_PARENTS)
993                         n_first_last_position = B_NR_ITEMS(p_s_parent);
994                 if (n_position != n_first_last_position) {
995                         *pp_s_com_father = p_s_parent;
996                         get_bh(*pp_s_com_father);
997                         /*(*pp_s_com_father = p_s_parent)->b_count++; */
998                         break;
999                 }
1000         }
1001
1002         /* if we are in the root of the tree, then there is no common father */
1003         if (n_counter == FIRST_PATH_ELEMENT_OFFSET) {
1004                 /* Check whether first buffer in the path is the root of the tree. */
1005                 if (PATH_OFFSET_PBUFFER
1006                     (p_s_tb->tb_path,
1007                      FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1008                     SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
1009                         *pp_s_father = *pp_s_com_father = NULL;
1010                         return CARRY_ON;
1011                 }
1012                 return REPEAT_SEARCH;
1013         }
1014
1015         RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
1016                "PAP-8185: (%b %z) level too small",
1017                *pp_s_com_father, *pp_s_com_father);
1018
1019         /* Check whether the common parent is locked. */
1020
1021         if (buffer_locked(*pp_s_com_father)) {
1022                 __wait_on_buffer(*pp_s_com_father);
1023                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1024                         decrement_bcount(*pp_s_com_father);
1025                         return REPEAT_SEARCH;
1026                 }
1027         }
1028
1029         /* So, we got common parent of the current node and its left/right neighbor.
1030            Now we are geting the parent of the left/right neighbor. */
1031
1032         /* Form key to get parent of the left/right neighbor. */
1033         le_key2cpu_key(&s_lr_father_key,
1034                        B_N_PDELIM_KEY(*pp_s_com_father,
1035                                       (c_lr_par ==
1036                                        LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] =
1037                                                         n_position -
1038                                                         1) : (p_s_tb->rkey[n_h -
1039                                                                            1] =
1040                                                               n_position)));
1041
1042         if (c_lr_par == LEFT_PARENTS)
1043                 decrement_key(&s_lr_father_key);
1044
1045         if (search_by_key
1046             (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1047              n_h + 1) == IO_ERROR)
1048                 // path is released
1049                 return IO_ERROR;
1050
1051         if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1052                 decrement_counters_in_path(&s_path_to_neighbor_father);
1053                 decrement_bcount(*pp_s_com_father);
1054                 return REPEAT_SEARCH;
1055         }
1056
1057         *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1058
1059         RFALSE(B_LEVEL(*pp_s_father) != n_h + 1,
1060                "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1061         RFALSE(s_path_to_neighbor_father.path_length <
1062                FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1063
1064         s_path_to_neighbor_father.path_length--;
1065         decrement_counters_in_path(&s_path_to_neighbor_father);
1066         return CARRY_ON;
1067 }
1068
1069 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1070  * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1071  * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1072  * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1073  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1074  *              CARRY_ON - schedule didn't occur while the function worked;
1075  */
1076 static int get_parents(struct tree_balance *p_s_tb, int n_h)
1077 {
1078         struct path *p_s_path = p_s_tb->tb_path;
1079         int n_position,
1080             n_ret_value,
1081             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1082         struct buffer_head *p_s_curf, *p_s_curcf;
1083
1084         /* Current node is the root of the tree or will be root of the tree */
1085         if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1086                 /* The root can not have parents.
1087                    Release nodes which previously were obtained as parents of the current node neighbors. */
1088                 decrement_bcount(p_s_tb->FL[n_h]);
1089                 decrement_bcount(p_s_tb->CFL[n_h]);
1090                 decrement_bcount(p_s_tb->FR[n_h]);
1091                 decrement_bcount(p_s_tb->CFR[n_h]);
1092                 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] =
1093                     p_s_tb->CFR[n_h] = NULL;
1094                 return CARRY_ON;
1095         }
1096
1097         /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1098         if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) {
1099                 /* Current node is not the first child of its parent. */
1100                 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
1101                 p_s_curf = p_s_curcf =
1102                     PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1103                 get_bh(p_s_curf);
1104                 get_bh(p_s_curf);
1105                 p_s_tb->lkey[n_h] = n_position - 1;
1106         } else {
1107                 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1108                    Calculate current common parent of L[n_path_offset] and the current node. Note that
1109                    CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1110                    Calculate lkey[n_path_offset]. */
1111                 if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1112                                                   &p_s_curcf,
1113                                                   LEFT_PARENTS)) != CARRY_ON)
1114                         return n_ret_value;
1115         }
1116
1117         decrement_bcount(p_s_tb->FL[n_h]);
1118         p_s_tb->FL[n_h] = p_s_curf;     /* New initialization of FL[n_h]. */
1119         decrement_bcount(p_s_tb->CFL[n_h]);
1120         p_s_tb->CFL[n_h] = p_s_curcf;   /* New initialization of CFL[n_h]. */
1121
1122         RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
1123                (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
1124                "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1125
1126 /* Get parent FR[n_h] of R[n_h]. */
1127
1128 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1129         if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) {
1130 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1131    Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1132    not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1133                 if ((n_ret_value =
1134                      get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf,
1135                                     RIGHT_PARENTS)) != CARRY_ON)
1136                         return n_ret_value;
1137         } else {
1138 /* Current node is not the last child of its parent F[n_h]. */
1139                 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
1140                 p_s_curf = p_s_curcf =
1141                     PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1142                 get_bh(p_s_curf);
1143                 get_bh(p_s_curf);
1144                 p_s_tb->rkey[n_h] = n_position;
1145         }
1146
1147         decrement_bcount(p_s_tb->FR[n_h]);
1148         p_s_tb->FR[n_h] = p_s_curf;     /* New initialization of FR[n_path_offset]. */
1149
1150         decrement_bcount(p_s_tb->CFR[n_h]);
1151         p_s_tb->CFR[n_h] = p_s_curcf;   /* New initialization of CFR[n_path_offset]. */
1152
1153         RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
1154                (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
1155                "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
1156
1157         return CARRY_ON;
1158 }
1159
1160 /* it is possible to remove node as result of shiftings to
1161    neighbors even when we insert or paste item. */
1162 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1163                                       struct tree_balance *tb, int h)
1164 {
1165         struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1166         int levbytes = tb->insert_size[h];
1167         struct item_head *ih;
1168         struct reiserfs_key *r_key = NULL;
1169
1170         ih = B_N_PITEM_HEAD(Sh, 0);
1171         if (tb->CFR[h])
1172                 r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
1173
1174         if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1175             /* shifting may merge items which might save space */
1176             -
1177             ((!h
1178               && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
1179             -
1180             ((!h && r_key
1181               && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1182             + ((h) ? KEY_SIZE : 0)) {
1183                 /* node can not be removed */
1184                 if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1185                         if (!h)
1186                                 tb->s0num =
1187                                     B_NR_ITEMS(Sh) +
1188                                     ((mode == M_INSERT) ? 1 : 0);
1189                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1190                         return NO_BALANCING_NEEDED;
1191                 }
1192         }
1193         PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1194         return !NO_BALANCING_NEEDED;
1195 }
1196
1197 /* Check whether current node S[h] is balanced when increasing its size by
1198  * Inserting or Pasting.
1199  * Calculate parameters for balancing for current level h.
1200  * Parameters:
1201  *      tb      tree_balance structure;
1202  *      h       current level of the node;
1203  *      inum    item number in S[h];
1204  *      mode    i - insert, p - paste;
1205  * Returns:     1 - schedule occurred; 
1206  *              0 - balancing for higher levels needed;
1207  *             -1 - no balancing for higher levels needed;
1208  *             -2 - no disk space.
1209  */
1210 /* ip means Inserting or Pasting */
1211 static int ip_check_balance(struct tree_balance *tb, int h)
1212 {
1213         struct virtual_node *vn = tb->tb_vn;
1214         int levbytes,           /* Number of bytes that must be inserted into (value
1215                                    is negative if bytes are deleted) buffer which
1216                                    contains node being balanced.  The mnemonic is
1217                                    that the attempted change in node space used level
1218                                    is levbytes bytes. */
1219          n_ret_value;
1220
1221         int lfree, sfree, rfree /* free space in L, S and R */ ;
1222
1223         /* nver is short for number of vertixes, and lnver is the number if
1224            we shift to the left, rnver is the number if we shift to the
1225            right, and lrnver is the number if we shift in both directions.
1226            The goal is to minimize first the number of vertixes, and second,
1227            the number of vertixes whose contents are changed by shifting,
1228            and third the number of uncached vertixes whose contents are
1229            changed by shifting and must be read from disk.  */
1230         int nver, lnver, rnver, lrnver;
1231
1232         /* used at leaf level only, S0 = S[0] is the node being balanced,
1233            sInum [ I = 0,1,2 ] is the number of items that will
1234            remain in node SI after balancing.  S1 and S2 are new
1235            nodes that might be created. */
1236
1237         /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
1238            where 4th parameter is s1bytes and 5th - s2bytes
1239          */
1240         short snum012[40] = { 0, };     /* s0num, s1num, s2num for 8 cases 
1241                                            0,1 - do not shift and do not shift but bottle
1242                                            2 - shift only whole item to left
1243                                            3 - shift to left and bottle as much as possible
1244                                            4,5 - shift to right (whole items and as much as possible
1245                                            6,7 - shift to both directions (whole items and as much as possible)
1246                                          */
1247
1248         /* Sh is the node whose balance is currently being checked */
1249         struct buffer_head *Sh;
1250
1251         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1252         levbytes = tb->insert_size[h];
1253
1254         /* Calculate balance parameters for creating new root. */
1255         if (!Sh) {
1256                 if (!h)
1257                         reiserfs_panic(tb->tb_sb,
1258                                        "vs-8210: ip_check_balance: S[0] can not be 0");
1259                 switch (n_ret_value = get_empty_nodes(tb, h)) {
1260                 case CARRY_ON:
1261                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1262                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1263
1264                 case NO_DISK_SPACE:
1265                 case REPEAT_SEARCH:
1266                         return n_ret_value;
1267                 default:
1268                         reiserfs_panic(tb->tb_sb,
1269                                        "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1270                 }
1271         }
1272
1273         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)     /* get parents of S[h] neighbors. */
1274                 return n_ret_value;
1275
1276         sfree = B_FREE_SPACE(Sh);
1277
1278         /* get free space of neighbors */
1279         rfree = get_rfree(tb, h);
1280         lfree = get_lfree(tb, h);
1281
1282         if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1283             NO_BALANCING_NEEDED)
1284                 /* and new item fits into node S[h] without any shifting */
1285                 return NO_BALANCING_NEEDED;
1286
1287         create_virtual_node(tb, h);
1288
1289         /*  
1290            determine maximal number of items we can shift to the left neighbor (in tb structure)
1291            and the maximal number of bytes that can flow to the left neighbor
1292            from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1293          */
1294         check_left(tb, h, lfree);
1295
1296         /*
1297            determine maximal number of items we can shift to the right neighbor (in tb structure)
1298            and the maximal number of bytes that can flow to the right neighbor
1299            from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1300          */
1301         check_right(tb, h, rfree);
1302
1303         /* all contents of internal node S[h] can be moved into its
1304            neighbors, S[h] will be removed after balancing */
1305         if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1306                 int to_r;
1307
1308                 /* Since we are working on internal nodes, and our internal
1309                    nodes have fixed size entries, then we can balance by the
1310                    number of items rather than the space they consume.  In this
1311                    routine we set the left node equal to the right node,
1312                    allowing a difference of less than or equal to 1 child
1313                    pointer. */
1314                 to_r =
1315                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1316                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1317                                                 tb->rnum[h]);
1318                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1319                                -1, -1);
1320                 return CARRY_ON;
1321         }
1322
1323         /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1324         RFALSE(h &&
1325                (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1326                 tb->rnum[h] >= vn->vn_nr_item + 1),
1327                "vs-8220: tree is not balanced on internal level");
1328         RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1329                       (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1330                "vs-8225: tree is not balanced on leaf level");
1331
1332         /* all contents of S[0] can be moved into its neighbors
1333            S[0] will be removed after balancing. */
1334         if (!h && is_leaf_removable(tb))
1335                 return CARRY_ON;
1336
1337         /* why do we perform this check here rather than earlier??
1338            Answer: we can win 1 node in some cases above. Moreover we
1339            checked it above, when we checked, that S[0] is not removable
1340            in principle */
1341         if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1342                 if (!h)
1343                         tb->s0num = vn->vn_nr_item;
1344                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1345                 return NO_BALANCING_NEEDED;
1346         }
1347
1348         {
1349                 int lpar, rpar, nset, lset, rset, lrset;
1350                 /* 
1351                  * regular overflowing of the node
1352                  */
1353
1354                 /* get_num_ver works in 2 modes (FLOW & NO_FLOW) 
1355                    lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1356                    nset, lset, rset, lrset - shows, whether flowing items give better packing 
1357                  */
1358 #define FLOW 1
1359 #define NO_FLOW 0               /* do not any splitting */
1360
1361                 /* we choose one the following */
1362 #define NOTHING_SHIFT_NO_FLOW   0
1363 #define NOTHING_SHIFT_FLOW      5
1364 #define LEFT_SHIFT_NO_FLOW      10
1365 #define LEFT_SHIFT_FLOW         15
1366 #define RIGHT_SHIFT_NO_FLOW     20
1367 #define RIGHT_SHIFT_FLOW        25
1368 #define LR_SHIFT_NO_FLOW        30
1369 #define LR_SHIFT_FLOW           35
1370
1371                 lpar = tb->lnum[h];
1372                 rpar = tb->rnum[h];
1373
1374                 /* calculate number of blocks S[h] must be split into when
1375                    nothing is shifted to the neighbors,
1376                    as well as number of items in each part of the split node (s012 numbers),
1377                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1378                 nset = NOTHING_SHIFT_NO_FLOW;
1379                 nver = get_num_ver(vn->vn_mode, tb, h,
1380                                    0, -1, h ? vn->vn_nr_item : 0, -1,
1381                                    snum012, NO_FLOW);
1382
1383                 if (!h) {
1384                         int nver1;
1385
1386                         /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1387                         nver1 = get_num_ver(vn->vn_mode, tb, h,
1388                                             0, -1, 0, -1,
1389                                             snum012 + NOTHING_SHIFT_FLOW, FLOW);
1390                         if (nver > nver1)
1391                                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1392                 }
1393
1394                 /* calculate number of blocks S[h] must be split into when
1395                    l_shift_num first items and l_shift_bytes of the right most
1396                    liquid item to be shifted are shifted to the left neighbor,
1397                    as well as number of items in each part of the splitted node (s012 numbers),
1398                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1399                  */
1400                 lset = LEFT_SHIFT_NO_FLOW;
1401                 lnver = get_num_ver(vn->vn_mode, tb, h,
1402                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1403                                     -1, h ? vn->vn_nr_item : 0, -1,
1404                                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1405                 if (!h) {
1406                         int lnver1;
1407
1408                         lnver1 = get_num_ver(vn->vn_mode, tb, h,
1409                                              lpar -
1410                                              ((tb->lbytes != -1) ? 1 : 0),
1411                                              tb->lbytes, 0, -1,
1412                                              snum012 + LEFT_SHIFT_FLOW, FLOW);
1413                         if (lnver > lnver1)
1414                                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1415                 }
1416
1417                 /* calculate number of blocks S[h] must be split into when
1418                    r_shift_num first items and r_shift_bytes of the left most
1419                    liquid item to be shifted are shifted to the right neighbor,
1420                    as well as number of items in each part of the splitted node (s012 numbers),
1421                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1422                  */
1423                 rset = RIGHT_SHIFT_NO_FLOW;
1424                 rnver = get_num_ver(vn->vn_mode, tb, h,
1425                                     0, -1,
1426                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1427                                                                    ((tb->
1428                                                                      rbytes !=
1429                                                                      -1) ? 1 :
1430                                                                     0)), -1,
1431                                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1432                 if (!h) {
1433                         int rnver1;
1434
1435                         rnver1 = get_num_ver(vn->vn_mode, tb, h,
1436                                              0, -1,
1437                                              (rpar -
1438                                               ((tb->rbytes != -1) ? 1 : 0)),
1439                                              tb->rbytes,
1440                                              snum012 + RIGHT_SHIFT_FLOW, FLOW);
1441
1442                         if (rnver > rnver1)
1443                                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1444                 }
1445
1446                 /* calculate number of blocks S[h] must be split into when
1447                    items are shifted in both directions,
1448                    as well as number of items in each part of the splitted node (s012 numbers),
1449                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1450                  */
1451                 lrset = LR_SHIFT_NO_FLOW;
1452                 lrnver = get_num_ver(vn->vn_mode, tb, h,
1453                                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1454                                      -1,
1455                                      h ? (vn->vn_nr_item - rpar) : (rpar -
1456                                                                     ((tb->
1457                                                                       rbytes !=
1458                                                                       -1) ? 1 :
1459                                                                      0)), -1,
1460                                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1461                 if (!h) {
1462                         int lrnver1;
1463
1464                         lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1465                                               lpar -
1466                                               ((tb->lbytes != -1) ? 1 : 0),
1467                                               tb->lbytes,
1468                                               (rpar -
1469                                                ((tb->rbytes != -1) ? 1 : 0)),
1470                                               tb->rbytes,
1471                                               snum012 + LR_SHIFT_FLOW, FLOW);
1472                         if (lrnver > lrnver1)
1473                                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1474                 }
1475
1476                 /* Our general shifting strategy is:
1477                    1) to minimized number of new nodes;
1478                    2) to minimized number of neighbors involved in shifting;
1479                    3) to minimized number of disk reads; */
1480
1481                 /* we can win TWO or ONE nodes by shifting in both directions */
1482                 if (lrnver < lnver && lrnver < rnver) {
1483                         RFALSE(h &&
1484                                (tb->lnum[h] != 1 ||
1485                                 tb->rnum[h] != 1 ||
1486                                 lrnver != 1 || rnver != 2 || lnver != 2
1487                                 || h != 1), "vs-8230: bad h");
1488                         if (lrset == LR_SHIFT_FLOW)
1489                                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1490                                                lrnver, snum012 + lrset,
1491                                                tb->lbytes, tb->rbytes);
1492                         else
1493                                 set_parameters(tb, h,
1494                                                tb->lnum[h] -
1495                                                ((tb->lbytes == -1) ? 0 : 1),
1496                                                tb->rnum[h] -
1497                                                ((tb->rbytes == -1) ? 0 : 1),
1498                                                lrnver, snum012 + lrset, -1, -1);
1499
1500                         return CARRY_ON;
1501                 }
1502
1503                 /* if shifting doesn't lead to better packing then don't shift */
1504                 if (nver == lrnver) {
1505                         set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1506                                        -1);
1507                         return CARRY_ON;
1508                 }
1509
1510                 /* now we know that for better packing shifting in only one
1511                    direction either to the left or to the right is required */
1512
1513                 /*  if shifting to the left is better than shifting to the right */
1514                 if (lnver < rnver) {
1515                         SET_PAR_SHIFT_LEFT;
1516                         return CARRY_ON;
1517                 }
1518
1519                 /* if shifting to the right is better than shifting to the left */
1520                 if (lnver > rnver) {
1521                         SET_PAR_SHIFT_RIGHT;
1522                         return CARRY_ON;
1523                 }
1524
1525                 /* now shifting in either direction gives the same number
1526                    of nodes and we can make use of the cached neighbors */
1527                 if (is_left_neighbor_in_cache(tb, h)) {
1528                         SET_PAR_SHIFT_LEFT;
1529                         return CARRY_ON;
1530                 }
1531
1532                 /* shift to the right independently on whether the right neighbor in cache or not */
1533                 SET_PAR_SHIFT_RIGHT;
1534                 return CARRY_ON;
1535         }
1536 }
1537
1538 /* Check whether current node S[h] is balanced when Decreasing its size by
1539  * Deleting or Cutting for INTERNAL node of S+tree.
1540  * Calculate parameters for balancing for current level h.
1541  * Parameters:
1542  *      tb      tree_balance structure;
1543  *      h       current level of the node;
1544  *      inum    item number in S[h];
1545  *      mode    i - insert, p - paste;
1546  * Returns:     1 - schedule occurred; 
1547  *              0 - balancing for higher levels needed;
1548  *             -1 - no balancing for higher levels needed;
1549  *             -2 - no disk space.
1550  *
1551  * Note: Items of internal nodes have fixed size, so the balance condition for
1552  * the internal part of S+tree is as for the B-trees.
1553  */
1554 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1555 {
1556         struct virtual_node *vn = tb->tb_vn;
1557
1558         /* Sh is the node whose balance is currently being checked,
1559            and Fh is its father.  */
1560         struct buffer_head *Sh, *Fh;
1561         int maxsize, n_ret_value;
1562         int lfree, rfree /* free space in L and R */ ;
1563
1564         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1565         Fh = PATH_H_PPARENT(tb->tb_path, h);
1566
1567         maxsize = MAX_CHILD_SIZE(Sh);
1568
1569 /*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1570 /*   new_nr_item = number of items node would have if operation is */
1571 /*      performed without balancing (new_nr_item); */
1572         create_virtual_node(tb, h);
1573
1574         if (!Fh) {              /* S[h] is the root. */
1575                 if (vn->vn_nr_item > 0) {
1576                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1577                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1578                 }
1579                 /* new_nr_item == 0.
1580                  * Current root will be deleted resulting in
1581                  * decrementing the tree height. */
1582                 set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1583                 return CARRY_ON;
1584         }
1585
1586         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
1587                 return n_ret_value;
1588
1589         /* get free space of neighbors */
1590         rfree = get_rfree(tb, h);
1591         lfree = get_lfree(tb, h);
1592
1593         /* determine maximal number of items we can fit into neighbors */
1594         check_left(tb, h, lfree);
1595         check_right(tb, h, rfree);
1596
1597         if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
1598                                                  * In this case we balance only if it leads to better packing. */
1599                 if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
1600                                                          * which is impossible with greater values of new_nr_item. */
1601                         if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1602                                 /* All contents of S[h] can be moved to L[h]. */
1603                                 int n;
1604                                 int order_L;
1605
1606                                 order_L =
1607                                     ((n =
1608                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1609                                                           h)) ==
1610                                      0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1611                                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1612                                     (DC_SIZE + KEY_SIZE);
1613                                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1614                                                -1);
1615                                 return CARRY_ON;
1616                         }
1617
1618                         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1619                                 /* All contents of S[h] can be moved to R[h]. */
1620                                 int n;
1621                                 int order_R;
1622
1623                                 order_R =
1624                                     ((n =
1625                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1626                                                           h)) ==
1627                                      B_NR_ITEMS(Fh)) ? 0 : n + 1;
1628                                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1629                                     (DC_SIZE + KEY_SIZE);
1630                                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1631                                                -1);
1632                                 return CARRY_ON;
1633                         }
1634                 }
1635
1636                 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1637                         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1638                         int to_r;
1639
1640                         to_r =
1641                             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1642                              tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1643                             (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1644                         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1645                                        0, NULL, -1, -1);
1646                         return CARRY_ON;
1647                 }
1648
1649                 /* Balancing does not lead to better packing. */
1650                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1651                 return NO_BALANCING_NEEDED;
1652         }
1653
1654         /* Current node contain insufficient number of items. Balancing is required. */
1655         /* Check whether we can merge S[h] with left neighbor. */
1656         if (tb->lnum[h] >= vn->vn_nr_item + 1)
1657                 if (is_left_neighbor_in_cache(tb, h)
1658                     || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1659                         int n;
1660                         int order_L;
1661
1662                         order_L =
1663                             ((n =
1664                               PATH_H_B_ITEM_ORDER(tb->tb_path,
1665                                                   h)) ==
1666                              0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1667                         n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1668                                                                       KEY_SIZE);
1669                         set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1670                         return CARRY_ON;
1671                 }
1672
1673         /* Check whether we can merge S[h] with right neighbor. */
1674         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1675                 int n;
1676                 int order_R;
1677
1678                 order_R =
1679                     ((n =
1680                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1681                                           h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1682                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1683                                                               KEY_SIZE);
1684                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1685                 return CARRY_ON;
1686         }
1687
1688         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1689         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1690                 int to_r;
1691
1692                 to_r =
1693                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1694                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1695                                                 tb->rnum[h]);
1696                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1697                                -1, -1);
1698                 return CARRY_ON;
1699         }
1700
1701         /* For internal nodes try to borrow item from a neighbor */
1702         RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1703
1704         /* Borrow one or two items from caching neighbor */
1705         if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1706                 int from_l;
1707
1708                 from_l =
1709                     (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1710                      1) / 2 - (vn->vn_nr_item + 1);
1711                 set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1712                 return CARRY_ON;
1713         }
1714
1715         set_parameters(tb, h, 0,
1716                        -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1717                           1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1718         return CARRY_ON;
1719 }
1720
1721 /* Check whether current node S[h] is balanced when Decreasing its size by
1722  * Deleting or Truncating for LEAF node of S+tree.
1723  * Calculate parameters for balancing for current level h.
1724  * Parameters:
1725  *      tb      tree_balance structure;
1726  *      h       current level of the node;
1727  *      inum    item number in S[h];
1728  *      mode    i - insert, p - paste;
1729  * Returns:     1 - schedule occurred; 
1730  *              0 - balancing for higher levels needed;
1731  *             -1 - no balancing for higher levels needed;
1732  *             -2 - no disk space.
1733  */
1734 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1735 {
1736         struct virtual_node *vn = tb->tb_vn;
1737
1738         /* Number of bytes that must be deleted from
1739            (value is negative if bytes are deleted) buffer which
1740            contains node being balanced.  The mnemonic is that the
1741            attempted change in node space used level is levbytes bytes. */
1742         int levbytes;
1743         /* the maximal item size */
1744         int maxsize, n_ret_value;
1745         /* S0 is the node whose balance is currently being checked,
1746            and F0 is its father.  */
1747         struct buffer_head *S0, *F0;
1748         int lfree, rfree /* free space in L and R */ ;
1749
1750         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1751         F0 = PATH_H_PPARENT(tb->tb_path, 0);
1752
1753         levbytes = tb->insert_size[h];
1754
1755         maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1756
1757         if (!F0) {              /* S[0] is the root now. */
1758
1759                 RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1760                        "vs-8240: attempt to create empty buffer tree");
1761
1762                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1763                 return NO_BALANCING_NEEDED;
1764         }
1765
1766         if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
1767                 return n_ret_value;
1768
1769         /* get free space of neighbors */
1770         rfree = get_rfree(tb, h);
1771         lfree = get_lfree(tb, h);
1772
1773         create_virtual_node(tb, h);
1774
1775         /* if 3 leaves can be merge to one, set parameters and return */
1776         if (are_leaves_removable(tb, lfree, rfree))
1777                 return CARRY_ON;
1778
1779         /* determine maximal number of items we can shift to the left/right  neighbor
1780            and the maximal number of bytes that can flow to the left/right neighbor
1781            from the left/right most liquid item that cannot be shifted from S[0] entirely
1782          */
1783         check_left(tb, h, lfree);
1784         check_right(tb, h, rfree);
1785
1786         /* check whether we can merge S with left neighbor. */
1787         if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1788                 if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1789                     !tb->FR[h]) {
1790
1791                         RFALSE(!tb->FL[h],
1792                                "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1793
1794                         /* set parameter to merge S[0] with its left neighbor */
1795                         set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1796                         return CARRY_ON;
1797                 }
1798
1799         /* check whether we can merge S[0] with right neighbor. */
1800         if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1801                 set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
1802                 return CARRY_ON;
1803         }
1804
1805         /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1806         if (is_leaf_removable(tb))
1807                 return CARRY_ON;
1808
1809         /* Balancing is not required. */
1810         tb->s0num = vn->vn_nr_item;
1811         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1812         return NO_BALANCING_NEEDED;
1813 }
1814
1815 /* Check whether current node S[h] is balanced when Decreasing its size by
1816  * Deleting or Cutting.
1817  * Calculate parameters for balancing for current level h.
1818  * Parameters:
1819  *      tb      tree_balance structure;
1820  *      h       current level of the node;
1821  *      inum    item number in S[h];
1822  *      mode    d - delete, c - cut.
1823  * Returns:     1 - schedule occurred; 
1824  *              0 - balancing for higher levels needed;
1825  *             -1 - no balancing for higher levels needed;
1826  *             -2 - no disk space.
1827  */
1828 static int dc_check_balance(struct tree_balance *tb, int h)
1829 {
1830         RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
1831                "vs-8250: S is not initialized");
1832
1833         if (h)
1834                 return dc_check_balance_internal(tb, h);
1835         else
1836                 return dc_check_balance_leaf(tb, h);
1837 }
1838
1839 /* Check whether current node S[h] is balanced.
1840  * Calculate parameters for balancing for current level h.
1841  * Parameters:
1842  *
1843  *      tb      tree_balance structure:
1844  *
1845  *              tb is a large structure that must be read about in the header file
1846  *              at the same time as this procedure if the reader is to successfully
1847  *              understand this procedure
1848  *
1849  *      h       current level of the node;
1850  *      inum    item number in S[h];
1851  *      mode    i - insert, p - paste, d - delete, c - cut.
1852  * Returns:     1 - schedule occurred; 
1853  *              0 - balancing for higher levels needed;
1854  *             -1 - no balancing for higher levels needed;
1855  *             -2 - no disk space.
1856  */
1857 static int check_balance(int mode,
1858                          struct tree_balance *tb,
1859                          int h,
1860                          int inum,
1861                          int pos_in_item,
1862                          struct item_head *ins_ih, const void *data)
1863 {
1864         struct virtual_node *vn;
1865
1866         vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1867         vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1868         vn->vn_mode = mode;
1869         vn->vn_affected_item_num = inum;
1870         vn->vn_pos_in_item = pos_in_item;
1871         vn->vn_ins_ih = ins_ih;
1872         vn->vn_data = data;
1873
1874         RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
1875                "vs-8255: ins_ih can not be 0 in insert mode");
1876
1877         if (tb->insert_size[h] > 0)
1878                 /* Calculate balance parameters when size of node is increasing. */
1879                 return ip_check_balance(tb, h);
1880
1881         /* Calculate balance parameters when  size of node is decreasing. */
1882         return dc_check_balance(tb, h);
1883 }
1884
1885 /* Check whether parent at the path is the really parent of the current node.*/
1886 static int get_direct_parent(struct tree_balance *p_s_tb, int n_h)
1887 {
1888         struct buffer_head *p_s_bh;
1889         struct path *p_s_path = p_s_tb->tb_path;
1890         int n_position,
1891             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1892
1893         /* We are in the root or in the new root. */
1894         if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1895
1896                 RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1897                        "PAP-8260: invalid offset in the path");
1898
1899                 if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->
1900                     b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
1901                         /* Root is not changed. */
1902                         PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1903                         PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1904                         return CARRY_ON;
1905                 }
1906                 return REPEAT_SEARCH;   /* Root is changed and we must recalculate the path. */
1907         }
1908
1909         if (!B_IS_IN_TREE
1910             (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)))
1911                 return REPEAT_SEARCH;   /* Parent in the path is not in the tree. */
1912
1913         if ((n_position =
1914              PATH_OFFSET_POSITION(p_s_path,
1915                                   n_path_offset - 1)) > B_NR_ITEMS(p_s_bh))
1916                 return REPEAT_SEARCH;
1917
1918         if (B_N_CHILD_NUM(p_s_bh, n_position) !=
1919             PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr)
1920                 /* Parent in the path is not parent of the current node in the tree. */
1921                 return REPEAT_SEARCH;
1922
1923         if (buffer_locked(p_s_bh)) {
1924                 __wait_on_buffer(p_s_bh);
1925                 if (FILESYSTEM_CHANGED_TB(p_s_tb))
1926                         return REPEAT_SEARCH;
1927         }
1928
1929         return CARRY_ON;        /* Parent in the path is unlocked and really parent of the current node.  */
1930 }
1931
1932 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1933  * of S[n_h] we
1934  * need in order to balance S[n_h], and get them if necessary.
1935  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1936  *              CARRY_ON - schedule didn't occur while the function worked;
1937  */
1938 static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
1939 {
1940         int n_child_position,
1941             n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1942         unsigned long n_son_number;
1943         struct super_block *p_s_sb = p_s_tb->tb_sb;
1944         struct buffer_head *p_s_bh;
1945
1946         PROC_INFO_INC(p_s_sb, get_neighbors[n_h]);
1947
1948         if (p_s_tb->lnum[n_h]) {
1949                 /* We need left neighbor to balance S[n_h]. */
1950                 PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]);
1951                 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1952
1953                 RFALSE(p_s_bh == p_s_tb->FL[n_h] &&
1954                        !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1955                        "PAP-8270: invalid position in the parent");
1956
1957                 n_child_position =
1958                     (p_s_bh ==
1959                      p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->
1960                                                                        FL[n_h]);
1961                 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1962                 p_s_bh = sb_bread(p_s_sb, n_son_number);
1963                 if (!p_s_bh)
1964                         return IO_ERROR;
1965                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
1966                         decrement_bcount(p_s_bh);
1967                         PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
1968                         return REPEAT_SEARCH;
1969                 }
1970
1971                 RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1972                        n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1973                        B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1974                        p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1975                 RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1976                 RFALSE(!n_h &&
1977                        B_FREE_SPACE(p_s_bh) !=
1978                        MAX_CHILD_SIZE(p_s_bh) -
1979                        dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)),
1980                        "PAP-8290: invalid child size of left neighbor");
1981
1982                 decrement_bcount(p_s_tb->L[n_h]);
1983                 p_s_tb->L[n_h] = p_s_bh;
1984         }
1985
1986         if (p_s_tb->rnum[n_h]) {        /* We need right neighbor to balance S[n_path_offset]. */
1987                 PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]);
1988                 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1989
1990                 RFALSE(p_s_bh == p_s_tb->FR[n_h] &&
1991                        PATH_OFFSET_POSITION(p_s_tb->tb_path,
1992                                             n_path_offset) >=
1993                        B_NR_ITEMS(p_s_bh),
1994                        "PAP-8295: invalid position in the parent");
1995
1996                 n_child_position =
1997                     (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0;
1998                 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
1999                 p_s_bh = sb_bread(p_s_sb, n_son_number);
2000                 if (!p_s_bh)
2001                         return IO_ERROR;
2002                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2003                         decrement_bcount(p_s_bh);
2004                         PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
2005                         return REPEAT_SEARCH;
2006                 }
2007                 decrement_bcount(p_s_tb->R[n_h]);
2008                 p_s_tb->R[n_h] = p_s_bh;
2009
2010                 RFALSE(!n_h
2011                        && B_FREE_SPACE(p_s_bh) !=
2012                        MAX_CHILD_SIZE(p_s_bh) -
2013                        dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)),
2014                        "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2015                        B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh),
2016                        dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)));
2017
2018         }
2019         return CARRY_ON;
2020 }
2021
2022 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2023 {
2024         int max_num_of_items;
2025         int max_num_of_entries;
2026         unsigned long blocksize = sb->s_blocksize;
2027
2028 #define MIN_NAME_LEN 1
2029
2030         max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2031         max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2032             (DEH_SIZE + MIN_NAME_LEN);
2033
2034         return sizeof(struct virtual_node) +
2035             max(max_num_of_items * sizeof(struct virtual_item),
2036                 sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2037                 (max_num_of_entries - 1) * sizeof(__u16));
2038 }
2039
2040 /* maybe we should fail balancing we are going to perform when kmalloc
2041    fails several times. But now it will loop until kmalloc gets
2042    required memory */
2043 static int get_mem_for_virtual_node(struct tree_balance *tb)
2044 {
2045         int check_fs = 0;
2046         int size;
2047         char *buf;
2048
2049         size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2050
2051         if (size > tb->vn_buf_size) {
2052                 /* we have to allocate more memory for virtual node */
2053                 if (tb->vn_buf) {
2054                         /* free memory allocated before */
2055                         kfree(tb->vn_buf);
2056                         /* this is not needed if kfree is atomic */
2057                         check_fs = 1;
2058                 }
2059
2060                 /* virtual node requires now more memory */
2061                 tb->vn_buf_size = size;
2062
2063                 /* get memory for virtual item */
2064                 buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2065                 if (!buf) {
2066                         /* getting memory with GFP_KERNEL priority may involve
2067                            balancing now (due to indirect_to_direct conversion on
2068                            dcache shrinking). So, release path and collected
2069                            resources here */
2070                         free_buffers_in_tb(tb);
2071                         buf = kmalloc(size, GFP_NOFS);
2072                         if (!buf) {
2073                                 tb->vn_buf_size = 0;
2074                         }
2075                         tb->vn_buf = buf;
2076                         schedule();
2077                         return REPEAT_SEARCH;
2078                 }
2079
2080                 tb->vn_buf = buf;
2081         }
2082
2083         if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2084                 return REPEAT_SEARCH;
2085
2086         return CARRY_ON;
2087 }
2088
2089 #ifdef CONFIG_REISERFS_CHECK
2090 static void tb_buffer_sanity_check(struct super_block *p_s_sb,
2091                                    struct buffer_head *p_s_bh,
2092                                    const char *descr, int level)
2093 {
2094         if (p_s_bh) {
2095                 if (atomic_read(&(p_s_bh->b_count)) <= 0) {
2096
2097                         reiserfs_panic(p_s_sb,
2098                                        "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
2099                                        descr, level, p_s_bh);
2100                 }
2101
2102                 if (!buffer_uptodate(p_s_bh)) {
2103                         reiserfs_panic(p_s_sb,
2104                                        "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
2105                                        descr, level, p_s_bh);
2106                 }
2107
2108                 if (!B_IS_IN_TREE(p_s_bh)) {
2109                         reiserfs_panic(p_s_sb,
2110                                        "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
2111                                        descr, level, p_s_bh);
2112                 }
2113
2114                 if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
2115                         reiserfs_panic(p_s_sb,
2116                                        "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
2117                                        descr, level, p_s_bh);
2118                 }
2119
2120                 if (p_s_bh->b_size != p_s_sb->s_blocksize) {
2121                         reiserfs_panic(p_s_sb,
2122                                        "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
2123                                        descr, level, p_s_bh);
2124                 }
2125
2126                 if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2127                         reiserfs_panic(p_s_sb,
2128                                        "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
2129                                        descr, level, p_s_bh);
2130                 }
2131         }
2132 }
2133 #else
2134 static void tb_buffer_sanity_check(struct super_block *p_s_sb,
2135                                    struct buffer_head *p_s_bh,
2136                                    const char *descr, int level)
2137 {;
2138 }
2139 #endif
2140
2141 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2142 {
2143         return reiserfs_prepare_for_journal(s, bh, 0);
2144 }
2145
2146 static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
2147 {
2148         struct buffer_head *locked;
2149 #ifdef CONFIG_REISERFS_CHECK
2150         int repeat_counter = 0;
2151 #endif
2152         int i;
2153
2154         do {
2155
2156                 locked = NULL;
2157
2158                 for (i = p_s_tb->tb_path->path_length;
2159                      !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2160                         if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2161                                 /* if I understand correctly, we can only be sure the last buffer
2162                                  ** in the path is in the tree --clm
2163                                  */
2164 #ifdef CONFIG_REISERFS_CHECK
2165                                 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2166                                     PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2167                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2168                                                                PATH_OFFSET_PBUFFER
2169                                                                (p_s_tb->tb_path,
2170                                                                 i), "S",
2171                                                                p_s_tb->tb_path->
2172                                                                path_length - i);
2173                                 }
2174 #endif
2175                                 if (!clear_all_dirty_bits(p_s_tb->tb_sb,
2176                                                           PATH_OFFSET_PBUFFER
2177                                                           (p_s_tb->tb_path,
2178                                                            i))) {
2179                                         locked =
2180                                             PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
2181                                                                 i);
2182                                 }
2183                         }
2184                 }
2185
2186                 for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
2187                      i++) {
2188
2189                         if (p_s_tb->lnum[i]) {
2190
2191                                 if (p_s_tb->L[i]) {
2192                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2193                                                                p_s_tb->L[i],
2194                                                                "L", i);
2195                                         if (!clear_all_dirty_bits
2196                                             (p_s_tb->tb_sb, p_s_tb->L[i]))
2197                                                 locked = p_s_tb->L[i];
2198                                 }
2199
2200                                 if (!locked && p_s_tb->FL[i]) {
2201                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2202                                                                p_s_tb->FL[i],
2203                                                                "FL", i);
2204                                         if (!clear_all_dirty_bits
2205                                             (p_s_tb->tb_sb, p_s_tb->FL[i]))
2206                                                 locked = p_s_tb->FL[i];
2207                                 }
2208
2209                                 if (!locked && p_s_tb->CFL[i]) {
2210                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2211                                                                p_s_tb->CFL[i],
2212                                                                "CFL", i);
2213                                         if (!clear_all_dirty_bits
2214                                             (p_s_tb->tb_sb, p_s_tb->CFL[i]))
2215                                                 locked = p_s_tb->CFL[i];
2216                                 }
2217
2218                         }
2219
2220                         if (!locked && (p_s_tb->rnum[i])) {
2221
2222                                 if (p_s_tb->R[i]) {
2223                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2224                                                                p_s_tb->R[i],
2225                                                                "R", i);
2226                                         if (!clear_all_dirty_bits
2227                                             (p_s_tb->tb_sb, p_s_tb->R[i]))
2228                                                 locked = p_s_tb->R[i];
2229                                 }
2230
2231                                 if (!locked && p_s_tb->FR[i]) {
2232                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2233                                                                p_s_tb->FR[i],
2234                                                                "FR", i);
2235                                         if (!clear_all_dirty_bits
2236                                             (p_s_tb->tb_sb, p_s_tb->FR[i]))
2237                                                 locked = p_s_tb->FR[i];
2238                                 }
2239
2240                                 if (!locked && p_s_tb->CFR[i]) {
2241                                         tb_buffer_sanity_check(p_s_tb->tb_sb,
2242                                                                p_s_tb->CFR[i],
2243                                                                "CFR", i);
2244                                         if (!clear_all_dirty_bits
2245                                             (p_s_tb->tb_sb, p_s_tb->CFR[i]))
2246                                                 locked = p_s_tb->CFR[i];
2247                                 }
2248                         }
2249                 }
2250                 /* as far as I can tell, this is not required.  The FEB list seems
2251                  ** to be full of newly allocated nodes, which will never be locked,
2252                  ** dirty, or anything else.
2253                  ** To be safe, I'm putting in the checks and waits in.  For the moment,
2254                  ** they are needed to keep the code in journal.c from complaining
2255                  ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
2256                  ** --clm
2257                  */
2258                 for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2259                         if (p_s_tb->FEB[i]) {
2260                                 if (!clear_all_dirty_bits
2261                                     (p_s_tb->tb_sb, p_s_tb->FEB[i]))
2262                                         locked = p_s_tb->FEB[i];
2263                         }
2264                 }
2265
2266                 if (locked) {
2267 #ifdef CONFIG_REISERFS_CHECK
2268                         repeat_counter++;
2269                         if ((repeat_counter % 10000) == 0) {
2270                                 reiserfs_warning(p_s_tb->tb_sb,
2271                                                  "wait_tb_buffers_until_released(): too many "
2272                                                  "iterations waiting for buffer to unlock "
2273                                                  "(%b)", locked);
2274
2275                                 /* Don't loop forever.  Try to recover from possible error. */
2276
2277                                 return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
2278                                     REPEAT_SEARCH : CARRY_ON;
2279                         }
2280 #endif
2281                         __wait_on_buffer(locked);
2282                         if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2283                                 return REPEAT_SEARCH;
2284                         }
2285                 }
2286
2287         } while (locked);
2288
2289         return CARRY_ON;
2290 }
2291
2292 /* Prepare for balancing, that is
2293  *      get all necessary parents, and neighbors;
2294  *      analyze what and where should be moved;
2295  *      get sufficient number of new nodes;
2296  * Balancing will start only after all resources will be collected at a time.
2297  * 
2298  * When ported to SMP kernels, only at the last moment after all needed nodes
2299  * are collected in cache, will the resources be locked using the usual
2300  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2301  * this code neither write locks what it does not need to write lock nor locks out of order
2302  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
2303  * 
2304  * fix is meant in the sense of render unchanging
2305  * 
2306  * Latency might be improved by first gathering a list of what buffers are needed
2307  * and then getting as many of them in parallel as possible? -Hans
2308  *
2309  * Parameters:
2310  *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2311  *      tb      tree_balance structure;
2312  *      inum    item number in S[h];
2313  *      pos_in_item - comment this if you can
2314  *      ins_ih & ins_sd are used when inserting
2315  * Returns:     1 - schedule occurred while the function worked;
2316  *              0 - schedule didn't occur while the function worked;
2317  *             -1 - if no_disk_space 
2318  */
2319
2320 int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih, // item head of item being inserted
2321               const void *data  // inserted item or data to be pasted
2322     )
2323 {
2324         int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2325         int n_pos_in_item;
2326
2327         /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2328          ** during wait_tb_buffers_run
2329          */
2330         int wait_tb_buffers_run = 0;
2331         struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
2332
2333         ++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;
2334
2335         n_pos_in_item = p_s_tb->tb_path->pos_in_item;
2336
2337         p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);
2338
2339         /* we prepare and log the super here so it will already be in the
2340          ** transaction when do_balance needs to change it.
2341          ** This way do_balance won't have to schedule when trying to prepare
2342          ** the super for logging
2343          */
2344         reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2345                                      SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
2346         journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2347                            SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
2348         if (FILESYSTEM_CHANGED_TB(p_s_tb))
2349                 return REPEAT_SEARCH;
2350
2351         /* if it possible in indirect_to_direct conversion */
2352         if (buffer_locked(p_s_tbS0)) {
2353                 __wait_on_buffer(p_s_tbS0);
2354                 if (FILESYSTEM_CHANGED_TB(p_s_tb))
2355                         return REPEAT_SEARCH;
2356         }
2357 #ifdef CONFIG_REISERFS_CHECK
2358         if (cur_tb) {
2359                 print_cur_tb("fix_nodes");
2360                 reiserfs_panic(p_s_tb->tb_sb,
2361                                "PAP-8305: fix_nodes:  there is pending do_balance");
2362         }
2363
2364         if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
2365                 reiserfs_panic(p_s_tb->tb_sb,
2366                                "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2367                                "at the beginning of fix_nodes or not in tree (mode %c)",
2368                                p_s_tbS0, p_s_tbS0, n_op_mode);
2369         }
2370
2371         /* Check parameters. */
2372         switch (n_op_mode) {
2373         case M_INSERT:
2374                 if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
2375                         reiserfs_panic(p_s_tb->tb_sb,
2376                                        "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2377                                        n_item_num, B_NR_ITEMS(p_s_tbS0));
2378                 break;
2379         case M_PASTE:
2380         case M_DELETE:
2381         case M_CUT:
2382                 if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
2383                         print_block(p_s_tbS0, 0, -1, -1);
2384                         reiserfs_panic(p_s_tb->tb_sb,
2385                                        "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
2386                                        n_item_num, n_op_mode,
2387                                        p_s_tb->insert_size[0]);
2388                 }
2389                 break;
2390         default:
2391                 reiserfs_panic(p_s_tb->tb_sb,
2392                                "PAP-8340: fix_nodes: Incorrect mode of operation");
2393         }
2394 #endif
2395
2396         if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
2397                 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2398                 return REPEAT_SEARCH;
2399
2400         /* Starting from the leaf level; for all levels n_h of the tree. */
2401         for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
2402                 if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
2403                         goto repeat;
2404                 }
2405
2406                 if ((n_ret_value =
2407                      check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
2408                                    n_pos_in_item, p_s_ins_ih,
2409                                    data)) != CARRY_ON) {
2410                         if (n_ret_value == NO_BALANCING_NEEDED) {
2411                                 /* No balancing for higher levels needed. */
2412                                 if ((n_ret_value =
2413                                      get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
2414                                         goto repeat;
2415                                 }
2416                                 if (n_h != MAX_HEIGHT - 1)
2417                                         p_s_tb->insert_size[n_h + 1] = 0;
2418                                 /* ok, analysis and resource gathering are complete */
2419                                 break;
2420                         }
2421                         goto repeat;
2422                 }
2423
2424                 if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
2425                         goto repeat;
2426                 }
2427
2428                 if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
2429                         goto repeat;    /* No disk space, or schedule occurred and
2430                                            analysis may be invalid and needs to be redone. */
2431                 }
2432
2433                 if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
2434                         /* We have a positive insert size but no nodes exist on this
2435                            level, this means that we are creating a new root. */
2436
2437                         RFALSE(p_s_tb->blknum[n_h] != 1,
2438                                "PAP-8350: creating new empty root");
2439
2440                         if (n_h < MAX_HEIGHT - 1)
2441                                 p_s_tb->insert_size[n_h + 1] = 0;
2442                 } else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
2443                         if (p_s_tb->blknum[n_h] > 1) {
2444                                 /* The tree needs to be grown, so this node S[n_h]
2445                                    which is the root node is split into two nodes,
2446                                    and a new node (S[n_h+1]) will be created to
2447                                    become the root node.  */
2448
2449                                 RFALSE(n_h == MAX_HEIGHT - 1,
2450                                        "PAP-8355: attempt to create too high of a tree");
2451
2452                                 p_s_tb->insert_size[n_h + 1] =
2453                                     (DC_SIZE +
2454                                      KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
2455                                     DC_SIZE;
2456                         } else if (n_h < MAX_HEIGHT - 1)
2457                                 p_s_tb->insert_size[n_h + 1] = 0;
2458                 } else
2459                         p_s_tb->insert_size[n_h + 1] =
2460                             (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
2461         }
2462
2463         if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
2464                 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2465                         wait_tb_buffers_run = 1;
2466                         n_ret_value = REPEAT_SEARCH;
2467                         goto repeat;
2468                 } else {
2469                         return CARRY_ON;
2470                 }
2471         } else {
2472                 wait_tb_buffers_run = 1;
2473                 goto repeat;
2474         }
2475
2476       repeat:
2477         // fix_nodes was unable to perform its calculation due to
2478         // filesystem got changed under us, lack of free disk space or i/o
2479         // failure. If the first is the case - the search will be
2480         // repeated. For now - free all resources acquired so far except
2481         // for the new allocated nodes
2482         {
2483                 int i;
2484
2485                 /* Release path buffers. */
2486                 if (wait_tb_buffers_run) {
2487                         pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
2488                 } else {
2489                         pathrelse(p_s_tb->tb_path);
2490                 }
2491                 /* brelse all resources collected for balancing */
2492                 for (i = 0; i < MAX_HEIGHT; i++) {
2493                         if (wait_tb_buffers_run) {
2494                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2495                                                                  p_s_tb->L[i]);
2496                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2497                                                                  p_s_tb->R[i]);
2498                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2499                                                                  p_s_tb->FL[i]);
2500                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2501                                                                  p_s_tb->FR[i]);
2502                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2503                                                                  p_s_tb->
2504                                                                  CFL[i]);
2505                                 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2506                                                                  p_s_tb->
2507                                                                  CFR[i]);
2508                         }
2509
2510                         brelse(p_s_tb->L[i]);
2511                         p_s_tb->L[i] = NULL;
2512                         brelse(p_s_tb->R[i]);
2513                         p_s_tb->R[i] = NULL;
2514                         brelse(p_s_tb->FL[i]);
2515                         p_s_tb->FL[i] = NULL;
2516                         brelse(p_s_tb->FR[i]);
2517                         p_s_tb->FR[i] = NULL;
2518                         brelse(p_s_tb->CFL[i]);
2519                         p_s_tb->CFL[i] = NULL;
2520                         brelse(p_s_tb->CFR[i]);
2521                         p_s_tb->CFR[i] = NULL;
2522                 }
2523
2524                 if (wait_tb_buffers_run) {
2525                         for (i = 0; i < MAX_FEB_SIZE; i++) {
2526                                 if (p_s_tb->FEB[i]) {
2527                                         reiserfs_restore_prepared_buffer
2528                                             (p_s_tb->tb_sb, p_s_tb->FEB[i]);
2529                                 }
2530                         }
2531                 }
2532                 return n_ret_value;
2533         }
2534
2535 }
2536
2537 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2538    wanted to make lines shorter */
2539 void unfix_nodes(struct tree_balance *tb)
2540 {
2541         int i;
2542
2543         /* Release path buffers. */
2544         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2545
2546         /* brelse all resources collected for balancing */
2547         for (i = 0; i < MAX_HEIGHT; i++) {
2548                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2549                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2550                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2551                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2552                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2553                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2554
2555                 brelse(tb->L[i]);
2556                 brelse(tb->R[i]);
2557                 brelse(tb->FL[i]);
2558                 brelse(tb->FR[i]);
2559                 brelse(tb->CFL[i]);
2560                 brelse(tb->CFR[i]);
2561         }
2562
2563         /* deal with list of allocated (used and unused) nodes */
2564         for (i = 0; i < MAX_FEB_SIZE; i++) {
2565                 if (tb->FEB[i]) {
2566                         b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2567                         /* de-allocated block which was not used by balancing and
2568                            bforget about buffer for it */
2569                         brelse(tb->FEB[i]);
2570                         reiserfs_free_block(tb->transaction_handle, NULL,
2571                                             blocknr, 0);
2572                 }
2573                 if (tb->used[i]) {
2574                         /* release used as new nodes including a new root */
2575                         brelse(tb->used[i]);
2576                 }
2577         }
2578
2579         kfree(tb->vn_buf);
2580
2581 }