[XFS] XFS should not be looking at filp reference counts
[linux-2.6] / fs / xfs / xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h"
31 #include "xfs_da_btree.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h"
38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h"
40 #include "xfs_itable.h"
41 #include "xfs_btree.h"
42 #include "xfs_ialloc.h"
43 #include "xfs_alloc.h"
44 #include "xfs_bmap.h"
45 #include "xfs_attr.h"
46 #include "xfs_rw.h"
47 #include "xfs_error.h"
48 #include "xfs_quota.h"
49 #include "xfs_utils.h"
50 #include "xfs_rtalloc.h"
51 #include "xfs_refcache.h"
52 #include "xfs_trans_space.h"
53 #include "xfs_log_priv.h"
54
55 STATIC int
56 xfs_open(
57         bhv_desc_t      *bdp,
58         cred_t          *credp)
59 {
60         int             mode;
61         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
62         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
63
64         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
65                 return XFS_ERROR(EIO);
66
67         /*
68          * If it's a directory with any blocks, read-ahead block 0
69          * as we're almost certain to have the next operation be a read there.
70          */
71         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
72                 mode = xfs_ilock_map_shared(ip);
73                 if (ip->i_d.di_nextents > 0)
74                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
75                 xfs_iunlock(ip, mode);
76         }
77         return 0;
78 }
79
80 /*
81  * xfs_getattr
82  */
83 STATIC int
84 xfs_getattr(
85         bhv_desc_t      *bdp,
86         bhv_vattr_t     *vap,
87         int             flags,
88         cred_t          *credp)
89 {
90         xfs_inode_t     *ip;
91         xfs_mount_t     *mp;
92         bhv_vnode_t     *vp;
93
94         vp  = BHV_TO_VNODE(bdp);
95         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
96
97         ip = XFS_BHVTOI(bdp);
98         mp = ip->i_mount;
99
100         if (XFS_FORCED_SHUTDOWN(mp))
101                 return XFS_ERROR(EIO);
102
103         if (!(flags & ATTR_LAZY))
104                 xfs_ilock(ip, XFS_ILOCK_SHARED);
105
106         vap->va_size = XFS_ISIZE(ip);
107         if (vap->va_mask == XFS_AT_SIZE)
108                 goto all_done;
109
110         vap->va_nblocks =
111                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
112         vap->va_nodeid = ip->i_ino;
113 #if XFS_BIG_INUMS
114         vap->va_nodeid += mp->m_inoadd;
115 #endif
116         vap->va_nlink = ip->i_d.di_nlink;
117
118         /*
119          * Quick exit for non-stat callers
120          */
121         if ((vap->va_mask &
122             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
123               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
124                 goto all_done;
125
126         /*
127          * Copy from in-core inode.
128          */
129         vap->va_mode = ip->i_d.di_mode;
130         vap->va_uid = ip->i_d.di_uid;
131         vap->va_gid = ip->i_d.di_gid;
132         vap->va_projid = ip->i_d.di_projid;
133
134         /*
135          * Check vnode type block/char vs. everything else.
136          */
137         switch (ip->i_d.di_mode & S_IFMT) {
138         case S_IFBLK:
139         case S_IFCHR:
140                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
141                 vap->va_blocksize = BLKDEV_IOSIZE;
142                 break;
143         default:
144                 vap->va_rdev = 0;
145
146                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
147                         vap->va_blocksize = xfs_preferred_iosize(mp);
148                 } else {
149
150                         /*
151                          * If the file blocks are being allocated from a
152                          * realtime partition, then return the inode's
153                          * realtime extent size or the realtime volume's
154                          * extent size.
155                          */
156                         vap->va_blocksize =
157                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
158                 }
159                 break;
160         }
161
162         vn_atime_to_timespec(vp, &vap->va_atime);
163         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
164         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
165         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
166         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
167
168         /*
169          * Exit for stat callers.  See if any of the rest of the fields
170          * to be filled in are needed.
171          */
172         if ((vap->va_mask &
173              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
174               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
175                 goto all_done;
176
177         /*
178          * Convert di_flags to xflags.
179          */
180         vap->va_xflags = xfs_ip2xflags(ip);
181
182         /*
183          * Exit for inode revalidate.  See if any of the rest of
184          * the fields to be filled in are needed.
185          */
186         if ((vap->va_mask &
187              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
188               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
189                 goto all_done;
190
191         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
192         vap->va_nextents =
193                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
194                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
195                         ip->i_d.di_nextents;
196         if (ip->i_afp)
197                 vap->va_anextents =
198                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
199                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
200                                  ip->i_d.di_anextents;
201         else
202                 vap->va_anextents = 0;
203         vap->va_gen = ip->i_d.di_gen;
204
205  all_done:
206         if (!(flags & ATTR_LAZY))
207                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
208         return 0;
209 }
210
211
212 /*
213  * xfs_setattr
214  */
215 int
216 xfs_setattr(
217         bhv_desc_t              *bdp,
218         bhv_vattr_t             *vap,
219         int                     flags,
220         cred_t                  *credp)
221 {
222         xfs_inode_t             *ip;
223         xfs_trans_t             *tp;
224         xfs_mount_t             *mp;
225         int                     mask;
226         int                     code;
227         uint                    lock_flags;
228         uint                    commit_flags=0;
229         uid_t                   uid=0, iuid=0;
230         gid_t                   gid=0, igid=0;
231         int                     timeflags = 0;
232         bhv_vnode_t             *vp;
233         xfs_prid_t              projid=0, iprojid=0;
234         int                     mandlock_before, mandlock_after;
235         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
236         int                     file_owner;
237         int                     need_iolock = 1;
238
239         vp = BHV_TO_VNODE(bdp);
240         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
241
242         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
243                 return XFS_ERROR(EROFS);
244
245         /*
246          * Cannot set certain attributes.
247          */
248         mask = vap->va_mask;
249         if (mask & XFS_AT_NOSET) {
250                 return XFS_ERROR(EINVAL);
251         }
252
253         ip = XFS_BHVTOI(bdp);
254         mp = ip->i_mount;
255
256         if (XFS_FORCED_SHUTDOWN(mp))
257                 return XFS_ERROR(EIO);
258
259         /*
260          * Timestamps do not need to be logged and hence do not
261          * need to be done within a transaction.
262          */
263         if (mask & XFS_AT_UPDTIMES) {
264                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
265                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
266                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
267                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
268                 xfs_ichgtime(ip, timeflags);
269                 return 0;
270         }
271
272         olddquot1 = olddquot2 = NULL;
273         udqp = gdqp = NULL;
274
275         /*
276          * If disk quotas is on, we make sure that the dquots do exist on disk,
277          * before we start any other transactions. Trying to do this later
278          * is messy. We don't care to take a readlock to look at the ids
279          * in inode here, because we can't hold it across the trans_reserve.
280          * If the IDs do change before we take the ilock, we're covered
281          * because the i_*dquot fields will get updated anyway.
282          */
283         if (XFS_IS_QUOTA_ON(mp) &&
284             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
285                 uint    qflags = 0;
286
287                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
288                         uid = vap->va_uid;
289                         qflags |= XFS_QMOPT_UQUOTA;
290                 } else {
291                         uid = ip->i_d.di_uid;
292                 }
293                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
294                         gid = vap->va_gid;
295                         qflags |= XFS_QMOPT_GQUOTA;
296                 }  else {
297                         gid = ip->i_d.di_gid;
298                 }
299                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
300                         projid = vap->va_projid;
301                         qflags |= XFS_QMOPT_PQUOTA;
302                 }  else {
303                         projid = ip->i_d.di_projid;
304                 }
305                 /*
306                  * We take a reference when we initialize udqp and gdqp,
307                  * so it is important that we never blindly double trip on
308                  * the same variable. See xfs_create() for an example.
309                  */
310                 ASSERT(udqp == NULL);
311                 ASSERT(gdqp == NULL);
312                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
313                                          &udqp, &gdqp);
314                 if (code)
315                         return code;
316         }
317
318         /*
319          * For the other attributes, we acquire the inode lock and
320          * first do an error checking pass.
321          */
322         tp = NULL;
323         lock_flags = XFS_ILOCK_EXCL;
324         if (flags & ATTR_NOLOCK)
325                 need_iolock = 0;
326         if (!(mask & XFS_AT_SIZE)) {
327                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
328                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
329                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
330                         commit_flags = 0;
331                         if ((code = xfs_trans_reserve(tp, 0,
332                                                      XFS_ICHANGE_LOG_RES(mp), 0,
333                                                      0, 0))) {
334                                 lock_flags = 0;
335                                 goto error_return;
336                         }
337                 }
338         } else {
339                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
340                     !(flags & ATTR_DMI)) {
341                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
342                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
343                                 vap->va_size, 0, dmflags, NULL);
344                         if (code) {
345                                 lock_flags = 0;
346                                 goto error_return;
347                         }
348                 }
349                 if (need_iolock)
350                         lock_flags |= XFS_IOLOCK_EXCL;
351         }
352
353         xfs_ilock(ip, lock_flags);
354
355         /* boolean: are we the file owner? */
356         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
357
358         /*
359          * Change various properties of a file.
360          * Only the owner or users with CAP_FOWNER
361          * capability may do these things.
362          */
363         if (mask &
364             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
365              XFS_AT_GID|XFS_AT_PROJID)) {
366                 /*
367                  * CAP_FOWNER overrides the following restrictions:
368                  *
369                  * The user ID of the calling process must be equal
370                  * to the file owner ID, except in cases where the
371                  * CAP_FSETID capability is applicable.
372                  */
373                 if (!file_owner && !capable(CAP_FOWNER)) {
374                         code = XFS_ERROR(EPERM);
375                         goto error_return;
376                 }
377
378                 /*
379                  * CAP_FSETID overrides the following restrictions:
380                  *
381                  * The effective user ID of the calling process shall match
382                  * the file owner when setting the set-user-ID and
383                  * set-group-ID bits on that file.
384                  *
385                  * The effective group ID or one of the supplementary group
386                  * IDs of the calling process shall match the group owner of
387                  * the file when setting the set-group-ID bit on that file
388                  */
389                 if (mask & XFS_AT_MODE) {
390                         mode_t m = 0;
391
392                         if ((vap->va_mode & S_ISUID) && !file_owner)
393                                 m |= S_ISUID;
394                         if ((vap->va_mode & S_ISGID) &&
395                             !in_group_p((gid_t)ip->i_d.di_gid))
396                                 m |= S_ISGID;
397 #if 0
398                         /* Linux allows this, Irix doesn't. */
399                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
400                                 m |= S_ISVTX;
401 #endif
402                         if (m && !capable(CAP_FSETID))
403                                 vap->va_mode &= ~m;
404                 }
405         }
406
407         /*
408          * Change file ownership.  Must be the owner or privileged.
409          * If the system was configured with the "restricted_chown"
410          * option, the owner is not permitted to give away the file,
411          * and can change the group id only to a group of which he
412          * or she is a member.
413          */
414         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
415                 /*
416                  * These IDs could have changed since we last looked at them.
417                  * But, we're assured that if the ownership did change
418                  * while we didn't have the inode locked, inode's dquot(s)
419                  * would have changed also.
420                  */
421                 iuid = ip->i_d.di_uid;
422                 iprojid = ip->i_d.di_projid;
423                 igid = ip->i_d.di_gid;
424                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
425                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
426                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
427                          iprojid;
428
429                 /*
430                  * CAP_CHOWN overrides the following restrictions:
431                  *
432                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
433                  * shall override the restriction that a process cannot
434                  * change the user ID of a file it owns and the restriction
435                  * that the group ID supplied to the chown() function
436                  * shall be equal to either the group ID or one of the
437                  * supplementary group IDs of the calling process.
438                  */
439                 if (restricted_chown &&
440                     (iuid != uid || (igid != gid &&
441                                      !in_group_p((gid_t)gid))) &&
442                     !capable(CAP_CHOWN)) {
443                         code = XFS_ERROR(EPERM);
444                         goto error_return;
445                 }
446                 /*
447                  * Do a quota reservation only if uid/projid/gid is actually
448                  * going to change.
449                  */
450                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
451                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
452                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
453                         ASSERT(tp);
454                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
455                                                 capable(CAP_FOWNER) ?
456                                                 XFS_QMOPT_FORCE_RES : 0);
457                         if (code)       /* out of quota */
458                                 goto error_return;
459                 }
460         }
461
462         /*
463          * Truncate file.  Must have write permission and not be a directory.
464          */
465         if (mask & XFS_AT_SIZE) {
466                 /* Short circuit the truncate case for zero length files */
467                 if ((vap->va_size == 0) &&
468                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
469                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
470                         lock_flags &= ~XFS_ILOCK_EXCL;
471                         if (mask & XFS_AT_CTIME)
472                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
473                         code = 0;
474                         goto error_return;
475                 }
476
477                 if (VN_ISDIR(vp)) {
478                         code = XFS_ERROR(EISDIR);
479                         goto error_return;
480                 } else if (!VN_ISREG(vp)) {
481                         code = XFS_ERROR(EINVAL);
482                         goto error_return;
483                 }
484                 /*
485                  * Make sure that the dquots are attached to the inode.
486                  */
487                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
488                         goto error_return;
489         }
490
491         /*
492          * Change file access or modified times.
493          */
494         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
495                 if (!file_owner) {
496                         if ((flags & ATTR_UTIME) &&
497                             !capable(CAP_FOWNER)) {
498                                 code = XFS_ERROR(EPERM);
499                                 goto error_return;
500                         }
501                 }
502         }
503
504         /*
505          * Change extent size or realtime flag.
506          */
507         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
508                 /*
509                  * Can't change extent size if any extents are allocated.
510                  */
511                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
512                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
513                      vap->va_extsize) ) {
514                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
515                         goto error_return;
516                 }
517
518                 /*
519                  * Can't change realtime flag if any extents are allocated.
520                  */
521                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
522                     (mask & XFS_AT_XFLAGS) &&
523                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
524                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
525                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
526                         goto error_return;
527                 }
528                 /*
529                  * Extent size must be a multiple of the appropriate block
530                  * size, if set at all.
531                  */
532                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
533                         xfs_extlen_t    size;
534
535                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
536                             ((mask & XFS_AT_XFLAGS) &&
537                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
538                                 size = mp->m_sb.sb_rextsize <<
539                                        mp->m_sb.sb_blocklog;
540                         } else {
541                                 size = mp->m_sb.sb_blocksize;
542                         }
543                         if (vap->va_extsize % size) {
544                                 code = XFS_ERROR(EINVAL);
545                                 goto error_return;
546                         }
547                 }
548                 /*
549                  * If realtime flag is set then must have realtime data.
550                  */
551                 if ((mask & XFS_AT_XFLAGS) &&
552                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
553                         if ((mp->m_sb.sb_rblocks == 0) ||
554                             (mp->m_sb.sb_rextsize == 0) ||
555                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
556                                 code = XFS_ERROR(EINVAL);
557                                 goto error_return;
558                         }
559                 }
560
561                 /*
562                  * Can't modify an immutable/append-only file unless
563                  * we have appropriate permission.
564                  */
565                 if ((mask & XFS_AT_XFLAGS) &&
566                     (ip->i_d.di_flags &
567                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
568                      (vap->va_xflags &
569                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
570                     !capable(CAP_LINUX_IMMUTABLE)) {
571                         code = XFS_ERROR(EPERM);
572                         goto error_return;
573                 }
574         }
575
576         /*
577          * Now we can make the changes.  Before we join the inode
578          * to the transaction, if XFS_AT_SIZE is set then take care of
579          * the part of the truncation that must be done without the
580          * inode lock.  This needs to be done before joining the inode
581          * to the transaction, because the inode cannot be unlocked
582          * once it is a part of the transaction.
583          */
584         if (mask & XFS_AT_SIZE) {
585                 code = 0;
586                 if ((vap->va_size > ip->i_size) &&
587                     (flags & ATTR_NOSIZETOK) == 0) {
588                         code = xfs_igrow_start(ip, vap->va_size, credp);
589                 }
590                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
591                 vn_iowait(vp); /* wait for the completion of any pending DIOs */
592                 if (!code)
593                         code = xfs_itruncate_data(ip, vap->va_size);
594                 if (code) {
595                         ASSERT(tp == NULL);
596                         lock_flags &= ~XFS_ILOCK_EXCL;
597                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
598                         goto error_return;
599                 }
600                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
601                 if ((code = xfs_trans_reserve(tp, 0,
602                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
603                                              XFS_TRANS_PERM_LOG_RES,
604                                              XFS_ITRUNCATE_LOG_COUNT))) {
605                         xfs_trans_cancel(tp, 0);
606                         if (need_iolock)
607                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
608                         return code;
609                 }
610                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
611                 xfs_ilock(ip, XFS_ILOCK_EXCL);
612         }
613
614         if (tp) {
615                 xfs_trans_ijoin(tp, ip, lock_flags);
616                 xfs_trans_ihold(tp, ip);
617         }
618
619         /* determine whether mandatory locking mode changes */
620         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
621
622         /*
623          * Truncate file.  Must have write permission and not be a directory.
624          */
625         if (mask & XFS_AT_SIZE) {
626                 if (vap->va_size > ip->i_size) {
627                         xfs_igrow_finish(tp, ip, vap->va_size,
628                             !(flags & ATTR_DMI));
629                 } else if ((vap->va_size <= ip->i_size) ||
630                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
631                         /*
632                          * signal a sync transaction unless
633                          * we're truncating an already unlinked
634                          * file on a wsync filesystem
635                          */
636                         code = xfs_itruncate_finish(&tp, ip,
637                                             (xfs_fsize_t)vap->va_size,
638                                             XFS_DATA_FORK,
639                                             ((ip->i_d.di_nlink != 0 ||
640                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
641                                              ? 1 : 0));
642                         if (code)
643                                 goto abort_return;
644                         /*
645                          * Truncated "down", so we're removing references
646                          * to old data here - if we now delay flushing for
647                          * a long time, we expose ourselves unduly to the
648                          * notorious NULL files problem.  So, we mark this
649                          * vnode and flush it when the file is closed, and
650                          * do not wait the usual (long) time for writeout.
651                          */
652                         VTRUNCATE(vp);
653                 }
654                 /*
655                  * Have to do this even if the file's size doesn't change.
656                  */
657                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
658         }
659
660         /*
661          * Change file access modes.
662          */
663         if (mask & XFS_AT_MODE) {
664                 ip->i_d.di_mode &= S_IFMT;
665                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
666
667                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
668                 timeflags |= XFS_ICHGTIME_CHG;
669         }
670
671         /*
672          * Change file ownership.  Must be the owner or privileged.
673          * If the system was configured with the "restricted_chown"
674          * option, the owner is not permitted to give away the file,
675          * and can change the group id only to a group of which he
676          * or she is a member.
677          */
678         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
679                 /*
680                  * CAP_FSETID overrides the following restrictions:
681                  *
682                  * The set-user-ID and set-group-ID bits of a file will be
683                  * cleared upon successful return from chown()
684                  */
685                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
686                     !capable(CAP_FSETID)) {
687                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
688                 }
689
690                 /*
691                  * Change the ownerships and register quota modifications
692                  * in the transaction.
693                  */
694                 if (iuid != uid) {
695                         if (XFS_IS_UQUOTA_ON(mp)) {
696                                 ASSERT(mask & XFS_AT_UID);
697                                 ASSERT(udqp);
698                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
699                                                         &ip->i_udquot, udqp);
700                         }
701                         ip->i_d.di_uid = uid;
702                 }
703                 if (igid != gid) {
704                         if (XFS_IS_GQUOTA_ON(mp)) {
705                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
706                                 ASSERT(mask & XFS_AT_GID);
707                                 ASSERT(gdqp);
708                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
709                                                         &ip->i_gdquot, gdqp);
710                         }
711                         ip->i_d.di_gid = gid;
712                 }
713                 if (iprojid != projid) {
714                         if (XFS_IS_PQUOTA_ON(mp)) {
715                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
716                                 ASSERT(mask & XFS_AT_PROJID);
717                                 ASSERT(gdqp);
718                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
719                                                         &ip->i_gdquot, gdqp);
720                         }
721                         ip->i_d.di_projid = projid;
722                         /*
723                          * We may have to rev the inode as well as
724                          * the superblock version number since projids didn't
725                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
726                          */
727                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
728                                 xfs_bump_ino_vers2(tp, ip);
729                 }
730
731                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
732                 timeflags |= XFS_ICHGTIME_CHG;
733         }
734
735
736         /*
737          * Change file access or modified times.
738          */
739         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
740                 if (mask & XFS_AT_ATIME) {
741                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
742                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
743                         ip->i_update_core = 1;
744                         timeflags &= ~XFS_ICHGTIME_ACC;
745                 }
746                 if (mask & XFS_AT_MTIME) {
747                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
748                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
749                         timeflags &= ~XFS_ICHGTIME_MOD;
750                         timeflags |= XFS_ICHGTIME_CHG;
751                 }
752                 if (tp && (flags & ATTR_UTIME))
753                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
754         }
755
756         /*
757          * Change XFS-added attributes.
758          */
759         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
760                 if (mask & XFS_AT_EXTSIZE) {
761                         /*
762                          * Converting bytes to fs blocks.
763                          */
764                         ip->i_d.di_extsize = vap->va_extsize >>
765                                 mp->m_sb.sb_blocklog;
766                 }
767                 if (mask & XFS_AT_XFLAGS) {
768                         uint    di_flags;
769
770                         /* can't set PREALLOC this way, just preserve it */
771                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
772                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
773                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
774                         if (vap->va_xflags & XFS_XFLAG_APPEND)
775                                 di_flags |= XFS_DIFLAG_APPEND;
776                         if (vap->va_xflags & XFS_XFLAG_SYNC)
777                                 di_flags |= XFS_DIFLAG_SYNC;
778                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
779                                 di_flags |= XFS_DIFLAG_NOATIME;
780                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
781                                 di_flags |= XFS_DIFLAG_NODUMP;
782                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
783                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
784                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
785                                 di_flags |= XFS_DIFLAG_NODEFRAG;
786                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
787                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
788                                         di_flags |= XFS_DIFLAG_RTINHERIT;
789                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
790                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
791                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
792                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
793                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
794                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
795                                         di_flags |= XFS_DIFLAG_REALTIME;
796                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
797                                 } else {
798                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
799                                 }
800                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
801                                         di_flags |= XFS_DIFLAG_EXTSIZE;
802                         }
803                         ip->i_d.di_flags = di_flags;
804                 }
805                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
806                 timeflags |= XFS_ICHGTIME_CHG;
807         }
808
809         /*
810          * Change file inode change time only if XFS_AT_CTIME set
811          * AND we have been called by a DMI function.
812          */
813
814         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
815                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
816                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
817                 ip->i_update_core = 1;
818                 timeflags &= ~XFS_ICHGTIME_CHG;
819         }
820
821         /*
822          * Send out timestamp changes that need to be set to the
823          * current time.  Not done when called by a DMI function.
824          */
825         if (timeflags && !(flags & ATTR_DMI))
826                 xfs_ichgtime(ip, timeflags);
827
828         XFS_STATS_INC(xs_ig_attrchg);
829
830         /*
831          * If this is a synchronous mount, make sure that the
832          * transaction goes to disk before returning to the user.
833          * This is slightly sub-optimal in that truncates require
834          * two sync transactions instead of one for wsync filesystems.
835          * One for the truncate and one for the timestamps since we
836          * don't want to change the timestamps unless we're sure the
837          * truncate worked.  Truncates are less than 1% of the laddis
838          * mix so this probably isn't worth the trouble to optimize.
839          */
840         code = 0;
841         if (tp) {
842                 if (mp->m_flags & XFS_MOUNT_WSYNC)
843                         xfs_trans_set_sync(tp);
844
845                 code = xfs_trans_commit(tp, commit_flags);
846         }
847
848         /*
849          * If the (regular) file's mandatory locking mode changed, then
850          * notify the vnode.  We do this under the inode lock to prevent
851          * racing calls to vop_vnode_change.
852          */
853         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
854         if (mandlock_before != mandlock_after) {
855                 bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
856                                  mandlock_after);
857         }
858
859         xfs_iunlock(ip, lock_flags);
860
861         /*
862          * Release any dquot(s) the inode had kept before chown.
863          */
864         XFS_QM_DQRELE(mp, olddquot1);
865         XFS_QM_DQRELE(mp, olddquot2);
866         XFS_QM_DQRELE(mp, udqp);
867         XFS_QM_DQRELE(mp, gdqp);
868
869         if (code) {
870                 return code;
871         }
872
873         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
874             !(flags & ATTR_DMI)) {
875                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
876                                         NULL, DM_RIGHT_NULL, NULL, NULL,
877                                         0, 0, AT_DELAY_FLAG(flags));
878         }
879         return 0;
880
881  abort_return:
882         commit_flags |= XFS_TRANS_ABORT;
883         /* FALLTHROUGH */
884  error_return:
885         XFS_QM_DQRELE(mp, udqp);
886         XFS_QM_DQRELE(mp, gdqp);
887         if (tp) {
888                 xfs_trans_cancel(tp, commit_flags);
889         }
890         if (lock_flags != 0) {
891                 xfs_iunlock(ip, lock_flags);
892         }
893         return code;
894 }
895
896
897 /*
898  * xfs_access
899  * Null conversion from vnode mode bits to inode mode bits, as in efs.
900  */
901 STATIC int
902 xfs_access(
903         bhv_desc_t      *bdp,
904         int             mode,
905         cred_t          *credp)
906 {
907         xfs_inode_t     *ip;
908         int             error;
909
910         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
911                                                (inst_t *)__return_address);
912
913         ip = XFS_BHVTOI(bdp);
914         xfs_ilock(ip, XFS_ILOCK_SHARED);
915         error = xfs_iaccess(ip, mode, credp);
916         xfs_iunlock(ip, XFS_ILOCK_SHARED);
917         return error;
918 }
919
920
/*
 * Number of extent mappings to ask bmapi for when reading a symlink
 * that is not stored inline.  A link target is at most 1024 bytes and
 * the smallest filesystem block size is 512 bytes, so the target can
 * span at most two blocks and therefore at most two extents.
 */
#define SYMLINK_MAPS 2
927
928 /*
929  * xfs_readlink
930  *
931  */
932 STATIC int
933 xfs_readlink(
934         bhv_desc_t      *bdp,
935         uio_t           *uiop,
936         int             ioflags,
937         cred_t          *credp)
938 {
939         xfs_inode_t     *ip;
940         int             count;
941         xfs_off_t       offset;
942         int             pathlen;
943         bhv_vnode_t     *vp;
944         int             error = 0;
945         xfs_mount_t     *mp;
946         int             nmaps;
947         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
948         xfs_daddr_t     d;
949         int             byte_cnt;
950         int             n;
951         xfs_buf_t       *bp;
952
953         vp = BHV_TO_VNODE(bdp);
954         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
955
956         ip = XFS_BHVTOI(bdp);
957         mp = ip->i_mount;
958
959         if (XFS_FORCED_SHUTDOWN(mp))
960                 return XFS_ERROR(EIO);
961
962         xfs_ilock(ip, XFS_ILOCK_SHARED);
963
964         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
965
966         offset = uiop->uio_offset;
967         count = uiop->uio_resid;
968
969         if (offset < 0) {
970                 error = XFS_ERROR(EINVAL);
971                 goto error_return;
972         }
973         if (count <= 0) {
974                 error = 0;
975                 goto error_return;
976         }
977
978         /*
979          * See if the symlink is stored inline.
980          */
981         pathlen = (int)ip->i_d.di_size;
982
983         if (ip->i_df.if_flags & XFS_IFINLINE) {
984                 error = xfs_uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
985         }
986         else {
987                 /*
988                  * Symlink not inline.  Call bmap to get it in.
989                  */
990                 nmaps = SYMLINK_MAPS;
991
992                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
993                                   0, NULL, 0, mval, &nmaps, NULL, NULL);
994
995                 if (error) {
996                         goto error_return;
997                 }
998
999                 for (n = 0; n < nmaps; n++) {
1000                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1001                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1002                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1003                                       BTOBB(byte_cnt), 0);
1004                         error = XFS_BUF_GETERROR(bp);
1005                         if (error) {
1006                                 xfs_ioerror_alert("xfs_readlink",
1007                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1008                                 xfs_buf_relse(bp);
1009                                 goto error_return;
1010                         }
1011                         if (pathlen < byte_cnt)
1012                                 byte_cnt = pathlen;
1013                         pathlen -= byte_cnt;
1014
1015                         error = xfs_uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1016                         xfs_buf_relse (bp);
1017                 }
1018
1019         }
1020
1021 error_return:
1022         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1023         return error;
1024 }
1025
1026
1027 /*
1028  * xfs_fsync
1029  *
1030  * This is called to sync the inode and its data out to disk.
1031  * We need to hold the I/O lock while flushing the data, and
1032  * the inode lock while flushing the inode.  The inode lock CANNOT
1033  * be held while flushing the data, so acquire after we're done
1034  * with that.
1035  */
1036 STATIC int
1037 xfs_fsync(
1038         bhv_desc_t      *bdp,
1039         int             flag,
1040         cred_t          *credp,
1041         xfs_off_t       start,
1042         xfs_off_t       stop)
1043 {
1044         xfs_inode_t     *ip;
1045         xfs_trans_t     *tp;
1046         int             error;
1047         int             log_flushed = 0, changed = 1;
1048
1049         vn_trace_entry(BHV_TO_VNODE(bdp),
1050                         __FUNCTION__, (inst_t *)__return_address);
1051
1052         ip = XFS_BHVTOI(bdp);
1053
1054         ASSERT(start >= 0 && stop >= -1);
1055
1056         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1057                 return XFS_ERROR(EIO);
1058
1059         /*
1060          * We always need to make sure that the required inode state
1061          * is safe on disk.  The vnode might be clean but because
1062          * of committed transactions that haven't hit the disk yet.
1063          * Likewise, there could be unflushed non-transactional
1064          * changes to the inode core that have to go to disk.
1065          *
1066          * The following code depends on one assumption:  that
1067          * any transaction that changes an inode logs the core
1068          * because it has to change some field in the inode core
1069          * (typically nextents or nblocks).  That assumption
1070          * implies that any transactions against an inode will
1071          * catch any non-transactional updates.  If inode-altering
1072          * transactions exist that violate this assumption, the
1073          * code breaks.  Right now, it figures that if the involved
1074          * update_* field is clear and the inode is unpinned, the
1075          * inode is clean.  Either it's been flushed or it's been
1076          * committed and the commit has hit the disk unpinning the inode.
1077          * (Note that xfs_inode_item_format() called at commit clears
1078          * the update_* fields.)
1079          */
1080         xfs_ilock(ip, XFS_ILOCK_SHARED);
1081
1082         /* If we are flushing data then we care about update_size
1083          * being set, otherwise we care about update_core
1084          */
1085         if ((flag & FSYNC_DATA) ?
1086                         (ip->i_update_size == 0) :
1087                         (ip->i_update_core == 0)) {
1088                 /*
1089                  * Timestamps/size haven't changed since last inode
1090                  * flush or inode transaction commit.  That means
1091                  * either nothing got written or a transaction
1092                  * committed which caught the updates.  If the
1093                  * latter happened and the transaction hasn't
1094                  * hit the disk yet, the inode will be still
1095                  * be pinned.  If it is, force the log.
1096                  */
1097
1098                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1099
1100                 if (xfs_ipincount(ip)) {
1101                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1102                                       XFS_LOG_FORCE |
1103                                       ((flag & FSYNC_WAIT)
1104                                        ? XFS_LOG_SYNC : 0),
1105                                       &log_flushed);
1106                 } else {
1107                         /*
1108                          * If the inode is not pinned and nothing
1109                          * has changed we don't need to flush the
1110                          * cache.
1111                          */
1112                         changed = 0;
1113                 }
1114                 error = 0;
1115         } else  {
1116                 /*
1117                  * Kick off a transaction to log the inode
1118                  * core to get the updates.  Make it
1119                  * sync if FSYNC_WAIT is passed in (which
1120                  * is done by everybody but specfs).  The
1121                  * sync transaction will also force the log.
1122                  */
1123                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1124                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1125                 if ((error = xfs_trans_reserve(tp, 0,
1126                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1127                                 0, 0, 0)))  {
1128                         xfs_trans_cancel(tp, 0);
1129                         return error;
1130                 }
1131                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1132
1133                 /*
1134                  * Note - it's possible that we might have pushed
1135                  * ourselves out of the way during trans_reserve
1136                  * which would flush the inode.  But there's no
1137                  * guarantee that the inode buffer has actually
1138                  * gone out yet (it's delwri).  Plus the buffer
1139                  * could be pinned anyway if it's part of an
1140                  * inode in another recent transaction.  So we
1141                  * play it safe and fire off the transaction anyway.
1142                  */
1143                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1144                 xfs_trans_ihold(tp, ip);
1145                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1146                 if (flag & FSYNC_WAIT)
1147                         xfs_trans_set_sync(tp);
1148                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1149
1150                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1151         }
1152
1153         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1154                 /*
1155                  * If the log write didn't issue an ordered tag we need
1156                  * to flush the disk cache for the data device now.
1157                  */
1158                 if (!log_flushed)
1159                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1160
1161                 /*
1162                  * If this inode is on the RT dev we need to flush that
1163                  * cache as well.
1164                  */
1165                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1166                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1167         }
1168
1169         return error;
1170 }
1171
1172 /*
1173  * This is called by xfs_inactive to free any blocks beyond eof
1174  * when the link count isn't zero and by xfs_dm_punch_hole() when
1175  * punching a hole to EOF.
1176  */
1177 int
1178 xfs_free_eofblocks(
1179         xfs_mount_t     *mp,
1180         xfs_inode_t     *ip,
1181         int             flags)
1182 {
1183         xfs_trans_t     *tp;
1184         int             error;
1185         xfs_fileoff_t   end_fsb;
1186         xfs_fileoff_t   last_fsb;
1187         xfs_filblks_t   map_len;
1188         int             nimaps;
1189         xfs_bmbt_irec_t imap;
1190         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1191
1192         /*
1193          * Figure out if there are any blocks beyond the end
1194          * of the file.  If not, then there is nothing to do.
1195          */
1196         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1197         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1198         map_len = last_fsb - end_fsb;
1199         if (map_len <= 0)
1200                 return 0;
1201
1202         nimaps = 1;
1203         xfs_ilock(ip, XFS_ILOCK_SHARED);
1204         error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1205                           NULL, 0, &imap, &nimaps, NULL, NULL);
1206         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1207
1208         if (!error && (nimaps != 0) &&
1209             (imap.br_startblock != HOLESTARTBLOCK ||
1210              ip->i_delayed_blks)) {
1211                 /*
1212                  * Attach the dquots to the inode up front.
1213                  */
1214                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1215                         return error;
1216
1217                 /*
1218                  * There are blocks after the end of file.
1219                  * Free them up now by truncating the file to
1220                  * its current size.
1221                  */
1222                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1223
1224                 /*
1225                  * Do the xfs_itruncate_start() call before
1226                  * reserving any log space because
1227                  * itruncate_start will call into the buffer
1228                  * cache and we can't
1229                  * do that within a transaction.
1230                  */
1231                 if (use_iolock)
1232                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1233                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1234                                     ip->i_size);
1235                 if (error) {
1236                         xfs_trans_cancel(tp, 0);
1237                         if (use_iolock)
1238                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1239                         return error;
1240                 }
1241
1242                 error = xfs_trans_reserve(tp, 0,
1243                                           XFS_ITRUNCATE_LOG_RES(mp),
1244                                           0, XFS_TRANS_PERM_LOG_RES,
1245                                           XFS_ITRUNCATE_LOG_COUNT);
1246                 if (error) {
1247                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1248                         xfs_trans_cancel(tp, 0);
1249                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1250                         return error;
1251                 }
1252
1253                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1254                 xfs_trans_ijoin(tp, ip,
1255                                 XFS_IOLOCK_EXCL |
1256                                 XFS_ILOCK_EXCL);
1257                 xfs_trans_ihold(tp, ip);
1258
1259                 error = xfs_itruncate_finish(&tp, ip,
1260                                              ip->i_size,
1261                                              XFS_DATA_FORK,
1262                                              0);
1263                 /*
1264                  * If we get an error at this point we
1265                  * simply don't bother truncating the file.
1266                  */
1267                 if (error) {
1268                         xfs_trans_cancel(tp,
1269                                          (XFS_TRANS_RELEASE_LOG_RES |
1270                                           XFS_TRANS_ABORT));
1271                 } else {
1272                         error = xfs_trans_commit(tp,
1273                                                 XFS_TRANS_RELEASE_LOG_RES);
1274                 }
1275                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1276                                             : XFS_ILOCK_EXCL));
1277         }
1278         return error;
1279 }
1280
1281 /*
1282  * Free a symlink that has blocks associated with it.
1283  */
1284 STATIC int
1285 xfs_inactive_symlink_rmt(
1286         xfs_inode_t     *ip,
1287         xfs_trans_t     **tpp)
1288 {
1289         xfs_buf_t       *bp;
1290         int             committed;
1291         int             done;
1292         int             error;
1293         xfs_fsblock_t   first_block;
1294         xfs_bmap_free_t free_list;
1295         int             i;
1296         xfs_mount_t     *mp;
1297         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1298         int             nmaps;
1299         xfs_trans_t     *ntp;
1300         int             size;
1301         xfs_trans_t     *tp;
1302
1303         tp = *tpp;
1304         mp = ip->i_mount;
1305         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1306         /*
1307          * We're freeing a symlink that has some
1308          * blocks allocated to it.  Free the
1309          * blocks here.  We know that we've got
1310          * either 1 or 2 extents and that we can
1311          * free them all in one bunmapi call.
1312          */
1313         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1314         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1315                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1316                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1317                 xfs_trans_cancel(tp, 0);
1318                 *tpp = NULL;
1319                 return error;
1320         }
1321         /*
1322          * Lock the inode, fix the size, and join it to the transaction.
1323          * Hold it so in the normal path, we still have it locked for
1324          * the second transaction.  In the error paths we need it
1325          * held so the cancel won't rele it, see below.
1326          */
1327         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1328         size = (int)ip->i_d.di_size;
1329         ip->i_d.di_size = 0;
1330         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1331         xfs_trans_ihold(tp, ip);
1332         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1333         /*
1334          * Find the block(s) so we can inval and unmap them.
1335          */
1336         done = 0;
1337         XFS_BMAP_INIT(&free_list, &first_block);
1338         nmaps = ARRAY_SIZE(mval);
1339         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1340                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1341                         &free_list, NULL)))
1342                 goto error0;
1343         /*
1344          * Invalidate the block(s).
1345          */
1346         for (i = 0; i < nmaps; i++) {
1347                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1348                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1349                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1350                 xfs_trans_binval(tp, bp);
1351         }
1352         /*
1353          * Unmap the dead block(s) to the free_list.
1354          */
1355         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1356                         &first_block, &free_list, NULL, &done)))
1357                 goto error1;
1358         ASSERT(done);
1359         /*
1360          * Commit the first transaction.  This logs the EFI and the inode.
1361          */
1362         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1363                 goto error1;
1364         /*
1365          * The transaction must have been committed, since there were
1366          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1367          * The new tp has the extent freeing and EFDs.
1368          */
1369         ASSERT(committed);
1370         /*
1371          * The first xact was committed, so add the inode to the new one.
1372          * Mark it dirty so it will be logged and moved forward in the log as
1373          * part of every commit.
1374          */
1375         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1376         xfs_trans_ihold(tp, ip);
1377         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1378         /*
1379          * Get a new, empty transaction to return to our caller.
1380          */
1381         ntp = xfs_trans_dup(tp);
1382         /*
1383          * Commit the transaction containing extent freeing and EFDs.
1384          * If we get an error on the commit here or on the reserve below,
1385          * we need to unlock the inode since the new transaction doesn't
1386          * have the inode attached.
1387          */
1388         error = xfs_trans_commit(tp, 0);
1389         tp = ntp;
1390         if (error) {
1391                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1392                 goto error0;
1393         }
1394         /*
1395          * Remove the memory for extent descriptions (just bookkeeping).
1396          */
1397         if (ip->i_df.if_bytes)
1398                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1399         ASSERT(ip->i_df.if_bytes == 0);
1400         /*
1401          * Put an itruncate log reservation in the new transaction
1402          * for our caller.
1403          */
1404         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1405                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1406                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1407                 goto error0;
1408         }
1409         /*
1410          * Return with the inode locked but not joined to the transaction.
1411          */
1412         *tpp = tp;
1413         return 0;
1414
1415  error1:
1416         xfs_bmap_cancel(&free_list);
1417  error0:
1418         /*
1419          * Have to come here with the inode locked and either
1420          * (held and in the transaction) or (not in the transaction).
1421          * If the inode isn't held then cancel would iput it, but
1422          * that's wrong since this is inactive and the vnode ref
1423          * count is 0 already.
1424          * Cancel won't do anything to the inode if held, but it still
1425          * needs to be locked until the cancel is done, if it was
1426          * joined to the transaction.
1427          */
1428         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1429         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1430         *tpp = NULL;
1431         return error;
1432
1433 }
1434
1435 STATIC int
1436 xfs_inactive_symlink_local(
1437         xfs_inode_t     *ip,
1438         xfs_trans_t     **tpp)
1439 {
1440         int             error;
1441
1442         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1443         /*
1444          * We're freeing a symlink which fit into
1445          * the inode.  Just free the memory used
1446          * to hold the old symlink.
1447          */
1448         error = xfs_trans_reserve(*tpp, 0,
1449                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1450                                   0, XFS_TRANS_PERM_LOG_RES,
1451                                   XFS_ITRUNCATE_LOG_COUNT);
1452
1453         if (error) {
1454                 xfs_trans_cancel(*tpp, 0);
1455                 *tpp = NULL;
1456                 return error;
1457         }
1458         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1459
1460         /*
1461          * Zero length symlinks _can_ exist.
1462          */
1463         if (ip->i_df.if_bytes > 0) {
1464                 xfs_idata_realloc(ip,
1465                                   -(ip->i_df.if_bytes),
1466                                   XFS_DATA_FORK);
1467                 ASSERT(ip->i_df.if_bytes == 0);
1468         }
1469         return 0;
1470 }
1471
1472 STATIC int
1473 xfs_inactive_attrs(
1474         xfs_inode_t     *ip,
1475         xfs_trans_t     **tpp)
1476 {
1477         xfs_trans_t     *tp;
1478         int             error;
1479         xfs_mount_t     *mp;
1480
1481         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1482         tp = *tpp;
1483         mp = ip->i_mount;
1484         ASSERT(ip->i_d.di_forkoff != 0);
1485         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1486         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1487
1488         error = xfs_attr_inactive(ip);
1489         if (error) {
1490                 *tpp = NULL;
1491                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1492                 return error; /* goto out */
1493         }
1494
1495         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1496         error = xfs_trans_reserve(tp, 0,
1497                                   XFS_IFREE_LOG_RES(mp),
1498                                   0, XFS_TRANS_PERM_LOG_RES,
1499                                   XFS_INACTIVE_LOG_COUNT);
1500         if (error) {
1501                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1502                 xfs_trans_cancel(tp, 0);
1503                 *tpp = NULL;
1504                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1505                 return error;
1506         }
1507
1508         xfs_ilock(ip, XFS_ILOCK_EXCL);
1509         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1510         xfs_trans_ihold(tp, ip);
1511         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1512
1513         ASSERT(ip->i_d.di_anextents == 0);
1514
1515         *tpp = tp;
1516         return 0;
1517 }
1518
1519 STATIC int
1520 xfs_release(
1521         bhv_desc_t      *bdp)
1522 {
1523         xfs_inode_t     *ip;
1524         bhv_vnode_t     *vp;
1525         xfs_mount_t     *mp;
1526         int             error;
1527
1528         vp = BHV_TO_VNODE(bdp);
1529         ip = XFS_BHVTOI(bdp);
1530         mp = ip->i_mount;
1531
1532         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1533                 return 0;
1534
1535         /* If this is a read-only mount, don't do this (would generate I/O) */
1536         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1537                 return 0;
1538
1539         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1540                 /*
1541                  * If we previously truncated this file and removed old data
1542                  * in the process, we want to initiate "early" writeout on
1543                  * the last close.  This is an attempt to combat the notorious
1544                  * NULL files problem which is particularly noticable from a
1545                  * truncate down, buffered (re-)write (delalloc), followed by
1546                  * a crash.  What we are effectively doing here is
1547                  * significantly reducing the time window where we'd otherwise
1548                  * be exposed to that problem.
1549                  */
1550                 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1551                         bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1552         }
1553
1554
1555 #ifdef HAVE_REFCACHE
1556         /* If we are in the NFS reference cache then don't do this now */
1557         if (ip->i_refcache)
1558                 return 0;
1559 #endif
1560
1561         if (ip->i_d.di_nlink != 0) {
1562                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1563                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1564                        ip->i_delayed_blks > 0)) &&
1565                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1566                     (!(ip->i_d.di_flags &
1567                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1568                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1569                         if (error)
1570                                 return error;
1571                         /* Update linux inode block count after free above */
1572                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1573                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1574                 }
1575         }
1576
1577         return 0;
1578 }
1579
1580 /*
1581  * xfs_inactive
1582  *
1583  * This is called when the vnode reference count for the vnode
1584  * goes to zero.  If the file has been unlinked, then it must
1585  * now be truncated.  Also, we clear all of the read-ahead state
1586  * kept for the inode here since the file is now closed.
1587  */
1588 STATIC int
1589 xfs_inactive(
1590         bhv_desc_t      *bdp,
1591         cred_t          *credp)
1592 {
1593         xfs_inode_t     *ip;
1594         bhv_vnode_t     *vp;
1595         xfs_bmap_free_t free_list;
1596         xfs_fsblock_t   first_block;
1597         int             committed;
1598         xfs_trans_t     *tp;
1599         xfs_mount_t     *mp;
1600         int             error;
1601         int             truncate;
1602
1603         vp = BHV_TO_VNODE(bdp);
1604         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1605
1606         ip = XFS_BHVTOI(bdp);
1607
1608         /*
1609          * If the inode is already free, then there can be nothing
1610          * to clean up here.
1611          */
1612         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1613                 ASSERT(ip->i_df.if_real_bytes == 0);
1614                 ASSERT(ip->i_df.if_broot_bytes == 0);
1615                 return VN_INACTIVE_CACHE;
1616         }
1617
1618         /*
1619          * Only do a truncate if it's a regular file with
1620          * some actual space in it.  It's OK to look at the
1621          * inode's fields without the lock because we're the
1622          * only one with a reference to the inode.
1623          */
1624         truncate = ((ip->i_d.di_nlink == 0) &&
1625             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1626              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1627             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1628
1629         mp = ip->i_mount;
1630
1631         if (ip->i_d.di_nlink == 0 &&
1632             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1633                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1634         }
1635
1636         error = 0;
1637
1638         /* If this is a read-only mount, don't do this (would generate I/O) */
1639         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1640                 goto out;
1641
1642         if (ip->i_d.di_nlink != 0) {
1643                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1644                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1645                        ip->i_delayed_blks > 0)) &&
1646                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1647                      (!(ip->i_d.di_flags &
1648                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1649                       (ip->i_delayed_blks != 0)))) {
1650                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1651                         if (error)
1652                                 return VN_INACTIVE_CACHE;
1653                         /* Update linux inode block count after free above */
1654                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1655                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1656                 }
1657                 goto out;
1658         }
1659
1660         ASSERT(ip->i_d.di_nlink == 0);
1661
1662         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1663                 return VN_INACTIVE_CACHE;
1664
1665         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1666         if (truncate) {
1667                 /*
1668                  * Do the xfs_itruncate_start() call before
1669                  * reserving any log space because itruncate_start
1670                  * will call into the buffer cache and we can't
1671                  * do that within a transaction.
1672                  */
1673                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1674
1675                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1676                 if (error) {
1677                         xfs_trans_cancel(tp, 0);
1678                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1679                         return VN_INACTIVE_CACHE;
1680                 }
1681
1682                 error = xfs_trans_reserve(tp, 0,
1683                                           XFS_ITRUNCATE_LOG_RES(mp),
1684                                           0, XFS_TRANS_PERM_LOG_RES,
1685                                           XFS_ITRUNCATE_LOG_COUNT);
1686                 if (error) {
1687                         /* Don't call itruncate_cleanup */
1688                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1689                         xfs_trans_cancel(tp, 0);
1690                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1691                         return VN_INACTIVE_CACHE;
1692                 }
1693
1694                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1695                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1696                 xfs_trans_ihold(tp, ip);
1697
1698                 /*
1699                  * normally, we have to run xfs_itruncate_finish sync.
1700                  * But if filesystem is wsync and we're in the inactive
1701                  * path, then we know that nlink == 0, and that the
1702                  * xaction that made nlink == 0 is permanently committed
1703                  * since xfs_remove runs as a synchronous transaction.
1704                  */
1705                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1706                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1707
1708                 if (error) {
1709                         xfs_trans_cancel(tp,
1710                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1711                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1712                         return VN_INACTIVE_CACHE;
1713                 }
1714         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1715
1716                 /*
1717                  * If we get an error while cleaning up a
1718                  * symlink we bail out.
1719                  */
1720                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1721                         xfs_inactive_symlink_rmt(ip, &tp) :
1722                         xfs_inactive_symlink_local(ip, &tp);
1723
1724                 if (error) {
1725                         ASSERT(tp == NULL);
1726                         return VN_INACTIVE_CACHE;
1727                 }
1728
1729                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1730                 xfs_trans_ihold(tp, ip);
1731         } else {
1732                 error = xfs_trans_reserve(tp, 0,
1733                                           XFS_IFREE_LOG_RES(mp),
1734                                           0, XFS_TRANS_PERM_LOG_RES,
1735                                           XFS_INACTIVE_LOG_COUNT);
1736                 if (error) {
1737                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1738                         xfs_trans_cancel(tp, 0);
1739                         return VN_INACTIVE_CACHE;
1740                 }
1741
1742                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1743                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1744                 xfs_trans_ihold(tp, ip);
1745         }
1746
1747         /*
1748          * If there are attributes associated with the file
1749          * then blow them away now.  The code calls a routine
1750          * that recursively deconstructs the attribute fork.
1751          * We need to just commit the current transaction
1752          * because we can't use it for xfs_attr_inactive().
1753          */
1754         if (ip->i_d.di_anextents > 0) {
1755                 error = xfs_inactive_attrs(ip, &tp);
1756                 /*
1757                  * If we got an error, the transaction is already
1758                  * cancelled, and the inode is unlocked. Just get out.
1759                  */
1760                  if (error)
1761                          return VN_INACTIVE_CACHE;
1762         } else if (ip->i_afp) {
1763                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1764         }
1765
1766         /*
1767          * Free the inode.
1768          */
1769         XFS_BMAP_INIT(&free_list, &first_block);
1770         error = xfs_ifree(tp, ip, &free_list);
1771         if (error) {
1772                 /*
1773                  * If we fail to free the inode, shut down.  The cancel
1774                  * might do that, we need to make sure.  Otherwise the
1775                  * inode might be lost for a long time or forever.
1776                  */
1777                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1778                         cmn_err(CE_NOTE,
1779                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1780                                 error, mp->m_fsname);
1781                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1782                 }
1783                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1784         } else {
1785                 /*
1786                  * Credit the quota account(s). The inode is gone.
1787                  */
1788                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1789
1790                 /*
1791                  * Just ignore errors at this point.  There is
1792                  * nothing we can do except to try to keep going.
1793                  */
1794                 (void) xfs_bmap_finish(&tp,  &free_list, &committed);
1795                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1796         }
1797         /*
1798          * Release the dquots held by inode, if any.
1799          */
1800         XFS_QM_DQDETACH(mp, ip);
1801
1802         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1803
1804  out:
1805         return VN_INACTIVE_CACHE;
1806 }
1807
1808
1809 /*
1810  * xfs_lookup
1811  */
1812 STATIC int
1813 xfs_lookup(
1814         bhv_desc_t              *dir_bdp,
1815         bhv_vname_t             *dentry,
1816         bhv_vnode_t             **vpp,
1817         int                     flags,
1818         bhv_vnode_t             *rdir,
1819         cred_t                  *credp)
1820 {
1821         xfs_inode_t             *dp, *ip;
1822         xfs_ino_t               e_inum;
1823         int                     error;
1824         uint                    lock_mode;
1825         bhv_vnode_t             *dir_vp;
1826
1827         dir_vp = BHV_TO_VNODE(dir_bdp);
1828         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1829
1830         dp = XFS_BHVTOI(dir_bdp);
1831
1832         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1833                 return XFS_ERROR(EIO);
1834
1835         lock_mode = xfs_ilock_map_shared(dp);
1836         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1837         if (!error) {
1838                 *vpp = XFS_ITOV(ip);
1839                 ITRACE(ip);
1840         }
1841         xfs_iunlock_map_shared(dp, lock_mode);
1842         return error;
1843 }
1844
1845
1846 /*
1847  * xfs_create (create a new file).
1848  */
1849 STATIC int
1850 xfs_create(
1851         bhv_desc_t              *dir_bdp,
1852         bhv_vname_t             *dentry,
1853         bhv_vattr_t             *vap,
1854         bhv_vnode_t             **vpp,
1855         cred_t                  *credp)
1856 {
1857         char                    *name = VNAME(dentry);
1858         bhv_vnode_t             *dir_vp;
1859         xfs_inode_t             *dp, *ip;
1860         bhv_vnode_t             *vp = NULL;
1861         xfs_trans_t             *tp;
1862         xfs_mount_t             *mp;
1863         xfs_dev_t               rdev;
1864         int                     error;
1865         xfs_bmap_free_t         free_list;
1866         xfs_fsblock_t           first_block;
1867         boolean_t               dp_joined_to_trans;
1868         int                     dm_event_sent = 0;
1869         uint                    cancel_flags;
1870         int                     committed;
1871         xfs_prid_t              prid;
1872         struct xfs_dquot        *udqp, *gdqp;
1873         uint                    resblks;
1874         int                     dm_di_mode;
1875         int                     namelen;
1876
1877         ASSERT(!*vpp);
1878         dir_vp = BHV_TO_VNODE(dir_bdp);
1879         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1880
1881         dp = XFS_BHVTOI(dir_bdp);
1882         mp = dp->i_mount;
1883
1884         dm_di_mode = vap->va_mode;
1885         namelen = VNAMELEN(dentry);
1886
1887         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1888                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1889                                 dir_vp, DM_RIGHT_NULL, NULL,
1890                                 DM_RIGHT_NULL, name, NULL,
1891                                 dm_di_mode, 0, 0);
1892
1893                 if (error)
1894                         return error;
1895                 dm_event_sent = 1;
1896         }
1897
1898         if (XFS_FORCED_SHUTDOWN(mp))
1899                 return XFS_ERROR(EIO);
1900
1901         /* Return through std_return after this point. */
1902
1903         udqp = gdqp = NULL;
1904         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1905                 prid = dp->i_d.di_projid;
1906         else if (vap->va_mask & XFS_AT_PROJID)
1907                 prid = (xfs_prid_t)vap->va_projid;
1908         else
1909                 prid = (xfs_prid_t)dfltprid;
1910
1911         /*
1912          * Make sure that we have allocated dquot(s) on disk.
1913          */
1914         error = XFS_QM_DQVOPALLOC(mp, dp,
1915                         current_fsuid(credp), current_fsgid(credp), prid,
1916                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1917         if (error)
1918                 goto std_return;
1919
1920         ip = NULL;
1921         dp_joined_to_trans = B_FALSE;
1922
1923         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1924         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1925         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1926         /*
1927          * Initially assume that the file does not exist and
1928          * reserve the resources for that case.  If that is not
1929          * the case we'll drop the one we have and get a more
1930          * appropriate transaction later.
1931          */
1932         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1933                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1934         if (error == ENOSPC) {
1935                 resblks = 0;
1936                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1937                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1938         }
1939         if (error) {
1940                 cancel_flags = 0;
1941                 dp = NULL;
1942                 goto error_return;
1943         }
1944
1945         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1946
1947         XFS_BMAP_INIT(&free_list, &first_block);
1948
1949         ASSERT(ip == NULL);
1950
1951         /*
1952          * Reserve disk quota and the inode.
1953          */
1954         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1955         if (error)
1956                 goto error_return;
1957
1958         if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1959                 goto error_return;
1960         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1961         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
1962                         rdev, credp, prid, resblks > 0,
1963                         &ip, &committed);
1964         if (error) {
1965                 if (error == ENOSPC)
1966                         goto error_return;
1967                 goto abort_return;
1968         }
1969         ITRACE(ip);
1970
1971         /*
1972          * At this point, we've gotten a newly allocated inode.
1973          * It is locked (and joined to the transaction).
1974          */
1975
1976         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1977
1978         /*
1979          * Now we join the directory inode to the transaction.
1980          * We do not do it earlier because xfs_dir_ialloc
1981          * might commit the previous transaction (and release
1982          * all the locks).
1983          */
1984
1985         VN_HOLD(dir_vp);
1986         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1987         dp_joined_to_trans = B_TRUE;
1988
1989         error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
1990                                         &first_block, &free_list, resblks ?
1991                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1992         if (error) {
1993                 ASSERT(error != ENOSPC);
1994                 goto abort_return;
1995         }
1996         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1997         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1998
1999         /*
2000          * If this is a synchronous mount, make sure that the
2001          * create transaction goes to disk before returning to
2002          * the user.
2003          */
2004         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2005                 xfs_trans_set_sync(tp);
2006         }
2007
2008         dp->i_gen++;
2009
2010         /*
2011          * Attach the dquot(s) to the inodes and modify them incore.
2012          * These ids of the inode couldn't have changed since the new
2013          * inode has been locked ever since it was created.
2014          */
2015         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2016
2017         /*
2018          * xfs_trans_commit normally decrements the vnode ref count
2019          * when it unlocks the inode. Since we want to return the
2020          * vnode to the caller, we bump the vnode ref count now.
2021          */
2022         IHOLD(ip);
2023         vp = XFS_ITOV(ip);
2024
2025         error = xfs_bmap_finish(&tp, &free_list, &committed);
2026         if (error) {
2027                 xfs_bmap_cancel(&free_list);
2028                 goto abort_rele;
2029         }
2030
2031         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2032         if (error) {
2033                 IRELE(ip);
2034                 tp = NULL;
2035                 goto error_return;
2036         }
2037
2038         XFS_QM_DQRELE(mp, udqp);
2039         XFS_QM_DQRELE(mp, gdqp);
2040
2041         /*
2042          * Propagate the fact that the vnode changed after the
2043          * xfs_inode locks have been released.
2044          */
2045         bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2046
2047         *vpp = vp;
2048
2049         /* Fallthrough to std_return with error = 0  */
2050
2051 std_return:
2052         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2053                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2054                                                         DM_EVENT_POSTCREATE)) {
2055                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2056                         dir_vp, DM_RIGHT_NULL,
2057                         *vpp ? vp:NULL,
2058                         DM_RIGHT_NULL, name, NULL,
2059                         dm_di_mode, error, 0);
2060         }
2061         return error;
2062
2063  abort_return:
2064         cancel_flags |= XFS_TRANS_ABORT;
2065         /* FALLTHROUGH */
2066
2067  error_return:
2068         if (tp != NULL)
2069                 xfs_trans_cancel(tp, cancel_flags);
2070
2071         if (!dp_joined_to_trans && (dp != NULL))
2072                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2073         XFS_QM_DQRELE(mp, udqp);
2074         XFS_QM_DQRELE(mp, gdqp);
2075
2076         goto std_return;
2077
2078  abort_rele:
2079         /*
2080          * Wait until after the current transaction is aborted to
2081          * release the inode.  This prevents recursive transactions
2082          * and deadlocks from xfs_inactive.
2083          */
2084         cancel_flags |= XFS_TRANS_ABORT;
2085         xfs_trans_cancel(tp, cancel_flags);
2086         IRELE(ip);
2087
2088         XFS_QM_DQRELE(mp, udqp);
2089         XFS_QM_DQRELE(mp, gdqp);
2090
2091         goto std_return;
2092 }
2093
#ifdef DEBUG
/*
 * Debug-only counters showing if (and how often) the deadlock
 * prevention paths in xfs_lock_dir_and_entry() are being hit.
 */
int xfs_rm_locks;		/* calls to xfs_lock_dir_and_entry() */
int xfs_rm_lock_delays;		/* times it backed off with delay(1) */
int xfs_rm_attempts;		/* failed trylocks on the entry inode */
#endif
2104
2105 /*
2106  * The following routine will lock the inodes associated with the
2107  * directory and the named entry in the directory. The locks are
2108  * acquired in increasing inode number.
2109  *
2110  * If the entry is "..", then only the directory is locked. The
2111  * vnode ref count will still include that from the .. entry in
2112  * this case.
2113  *
2114  * There is a deadlock we need to worry about. If the locked directory is
2115  * in the AIL, it might be blocking up the log. The next inode we lock
2116  * could be already locked by another thread waiting for log space (e.g
2117  * a permanent log reservation with a long running transaction (see
2118  * xfs_itruncate_finish)). To solve this, we must check if the directory
2119  * is in the ail and use lock_nowait. If we can't lock, we need to
2120  * drop the inode lock on the directory and try again. xfs_iunlock will
2121  * potentially push the tail if we were holding up the log.
2122  */
2123 STATIC int
2124 xfs_lock_dir_and_entry(
2125         xfs_inode_t     *dp,
2126         xfs_inode_t     *ip)    /* inode of entry 'name' */
2127 {
2128         int             attempts;
2129         xfs_ino_t       e_inum;
2130         xfs_inode_t     *ips[2];
2131         xfs_log_item_t  *lp;
2132
2133 #ifdef DEBUG
2134         xfs_rm_locks++;
2135 #endif
2136         attempts = 0;
2137
2138 again:
2139         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2140
2141         e_inum = ip->i_ino;
2142
2143         ITRACE(ip);
2144
2145         /*
2146          * We want to lock in increasing inum. Since we've already
2147          * acquired the lock on the directory, we may need to release
2148          * if if the inum of the entry turns out to be less.
2149          */
2150         if (e_inum > dp->i_ino) {
2151                 /*
2152                  * We are already in the right order, so just
2153                  * lock on the inode of the entry.
2154                  * We need to use nowait if dp is in the AIL.
2155                  */
2156
2157                 lp = (xfs_log_item_t *)dp->i_itemp;
2158                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2159                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2160                                 attempts++;
2161 #ifdef DEBUG
2162                                 xfs_rm_attempts++;
2163 #endif
2164
2165                                 /*
2166                                  * Unlock dp and try again.
2167                                  * xfs_iunlock will try to push the tail
2168                                  * if the inode is in the AIL.
2169                                  */
2170
2171                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2172
2173                                 if ((attempts % 5) == 0) {
2174                                         delay(1); /* Don't just spin the CPU */
2175 #ifdef DEBUG
2176                                         xfs_rm_lock_delays++;
2177 #endif
2178                                 }
2179                                 goto again;
2180                         }
2181                 } else {
2182                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2183                 }
2184         } else if (e_inum < dp->i_ino) {
2185                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2186
2187                 ips[0] = ip;
2188                 ips[1] = dp;
2189                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2190         }
2191         /* else  e_inum == dp->i_ino */
2192         /*     This can happen if we're asked to lock /x/..
2193          *     the entry is "..", which is also the parent directory.
2194          */
2195
2196         return 0;
2197 }
2198
#ifdef DEBUG
/* Debug-only statistics maintained by xfs_lock_inodes(). */
int xfs_locked_n;		/* calls that needed no retry */
int xfs_small_retries;		/* calls with fewer than 5 retries */
int xfs_middle_retries;		/* calls with 5 to 99 retries */
int xfs_lots_retries;		/* calls with 100 or more retries */
int xfs_lock_delays;		/* times a retry backed off with delay(1) */
#endif
2206
2207 /*
2208  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2209  * a different value
2210  */
2211 static inline int
2212 xfs_lock_inumorder(int lock_mode, int subclass)
2213 {
2214         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2215                 lock_mode |= (subclass + XFS_IOLOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2216         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2217                 lock_mode |= (subclass + XFS_ILOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2218
2219         return lock_mode;
2220 }
2221
2222 /*
2223  * The following routine will lock n inodes in exclusive mode.
2224  * We assume the caller calls us with the inodes in i_ino order.
2225  *
2226  * We need to detect deadlock where an inode that we lock
2227  * is in the AIL and we start waiting for another inode that is locked
2228  * by a thread in a long running transaction (such as truncate). This can
2229  * result in deadlock since the long running trans might need to wait
2230  * for the inode we just locked in order to push the tail and free space
2231  * in the log.
2232  */
2233 void
2234 xfs_lock_inodes(
2235         xfs_inode_t     **ips,
2236         int             inodes,
2237         int             first_locked,
2238         uint            lock_mode)
2239 {
2240         int             attempts = 0, i, j, try_lock;
2241         xfs_log_item_t  *lp;
2242
2243         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2244
2245         if (first_locked) {
2246                 try_lock = 1;
2247                 i = 1;
2248         } else {
2249                 try_lock = 0;
2250                 i = 0;
2251         }
2252
2253 again:
2254         for (; i < inodes; i++) {
2255                 ASSERT(ips[i]);
2256
2257                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2258                         continue;
2259
2260                 /*
2261                  * If try_lock is not set yet, make sure all locked inodes
2262                  * are not in the AIL.
2263                  * If any are, set try_lock to be used later.
2264                  */
2265
2266                 if (!try_lock) {
2267                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2268                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2269                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2270                                         try_lock++;
2271                                 }
2272                         }
2273                 }
2274
2275                 /*
2276                  * If any of the previous locks we have locked is in the AIL,
2277                  * we must TRY to get the second and subsequent locks. If
2278                  * we can't get any, we must release all we have
2279                  * and try again.
2280                  */
2281
2282                 if (try_lock) {
2283                         /* try_lock must be 0 if i is 0. */
2284                         /*
2285                          * try_lock means we have an inode locked
2286                          * that is in the AIL.
2287                          */
2288                         ASSERT(i != 0);
2289                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2290                                 attempts++;
2291
2292                                 /*
2293                                  * Unlock all previous guys and try again.
2294                                  * xfs_iunlock will try to push the tail
2295                                  * if the inode is in the AIL.
2296                                  */
2297
2298                                 for(j = i - 1; j >= 0; j--) {
2299
2300                                         /*
2301                                          * Check to see if we've already
2302                                          * unlocked this one.
2303                                          * Not the first one going back,
2304                                          * and the inode ptr is the same.
2305                                          */
2306                                         if ((j != (i - 1)) && ips[j] ==
2307                                                                 ips[j+1])
2308                                                 continue;
2309
2310                                         xfs_iunlock(ips[j], lock_mode);
2311                                 }
2312
2313                                 if ((attempts % 5) == 0) {
2314                                         delay(1); /* Don't just spin the CPU */
2315 #ifdef DEBUG
2316                                         xfs_lock_delays++;
2317 #endif
2318                                 }
2319                                 i = 0;
2320                                 try_lock = 0;
2321                                 goto again;
2322                         }
2323                 } else {
2324                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2325                 }
2326         }
2327
2328 #ifdef DEBUG
2329         if (attempts) {
2330                 if (attempts < 5) xfs_small_retries++;
2331                 else if (attempts < 100) xfs_middle_retries++;
2332                 else xfs_lots_retries++;
2333         } else {
2334                 xfs_locked_n++;
2335         }
2336 #endif
2337 }
2338
/*
 * REMOVE_DEBUG_TRACE(x)
 *
 * On DEBUG builds, store x in remove_which_error_return; the callers in
 * this file pass __LINE__, so the variable identifies the exit path that
 * was taken most recently.  On non-DEBUG builds the macro does nothing.
 *
 * Both variants expand to a single do { } while (0) statement so that
 * "REMOVE_DEBUG_TRACE(x);" is safe in any statement position, including
 * as the unbraced body of an if that is followed by an else.
 */
#ifdef	DEBUG
#define	REMOVE_DEBUG_TRACE(x)	\
	do { remove_which_error_return = (x); } while (0)
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define	REMOVE_DEBUG_TRACE(x)	do { } while (0)
#endif	/* ! DEBUG */
2345
2346
2347 /*
2348  * xfs_remove
2349  *
2350  */
2351 STATIC int
2352 xfs_remove(
2353         bhv_desc_t              *dir_bdp,
2354         bhv_vname_t             *dentry,
2355         cred_t                  *credp)
2356 {
2357         bhv_vnode_t             *dir_vp;
2358         char                    *name = VNAME(dentry);
2359         xfs_inode_t             *dp, *ip;
2360         xfs_trans_t             *tp = NULL;
2361         xfs_mount_t             *mp;
2362         int                     error = 0;
2363         xfs_bmap_free_t         free_list;
2364         xfs_fsblock_t           first_block;
2365         int                     cancel_flags;
2366         int                     committed;
2367         int                     dm_di_mode = 0;
2368         int                     link_zero;
2369         uint                    resblks;
2370         int                     namelen;
2371
2372         dir_vp = BHV_TO_VNODE(dir_bdp);
2373         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2374
2375         dp = XFS_BHVTOI(dir_bdp);
2376         mp = dp->i_mount;
2377
2378         if (XFS_FORCED_SHUTDOWN(mp))
2379                 return XFS_ERROR(EIO);
2380
2381         namelen = VNAMELEN(dentry);
2382
2383         if (!xfs_get_dir_entry(dentry, &ip)) {
2384                 dm_di_mode = ip->i_d.di_mode;
2385                 IRELE(ip);
2386         }
2387
2388         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2389                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2390                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2391                                         name, NULL, dm_di_mode, 0, 0);
2392                 if (error)
2393                         return error;
2394         }
2395
2396         /* From this point on, return through std_return */
2397         ip = NULL;
2398
2399         /*
2400          * We need to get a reference to ip before we get our log
2401          * reservation. The reason for this is that we cannot call
2402          * xfs_iget for an inode for which we do not have a reference
2403          * once we've acquired a log reservation. This is because the
2404          * inode we are trying to get might be in xfs_inactive going
2405          * for a log reservation. Since we'll have to wait for the
2406          * inactive code to complete before returning from xfs_iget,
2407          * we need to make sure that we don't have log space reserved
2408          * when we call xfs_iget.  Instead we get an unlocked reference
2409          * to the inode before getting our log reservation.
2410          */
2411         error = xfs_get_dir_entry(dentry, &ip);
2412         if (error) {
2413                 REMOVE_DEBUG_TRACE(__LINE__);
2414                 goto std_return;
2415         }
2416
2417         dm_di_mode = ip->i_d.di_mode;
2418
2419         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2420
2421         ITRACE(ip);
2422
2423         error = XFS_QM_DQATTACH(mp, dp, 0);
2424         if (!error && dp != ip)
2425                 error = XFS_QM_DQATTACH(mp, ip, 0);
2426         if (error) {
2427                 REMOVE_DEBUG_TRACE(__LINE__);
2428                 IRELE(ip);
2429                 goto std_return;
2430         }
2431
2432         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2433         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2434         /*
2435          * We try to get the real space reservation first,
2436          * allowing for directory btree deletion(s) implying
2437          * possible bmap insert(s).  If we can't get the space
2438          * reservation then we use 0 instead, and avoid the bmap
2439          * btree insert(s) in the directory code by, if the bmap
2440          * insert tries to happen, instead trimming the LAST
2441          * block from the directory.
2442          */
2443         resblks = XFS_REMOVE_SPACE_RES(mp);
2444         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2445                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2446         if (error == ENOSPC) {
2447                 resblks = 0;
2448                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2449                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2450         }
2451         if (error) {
2452                 ASSERT(error != ENOSPC);
2453                 REMOVE_DEBUG_TRACE(__LINE__);
2454                 xfs_trans_cancel(tp, 0);
2455                 IRELE(ip);
2456                 return error;
2457         }
2458
2459         error = xfs_lock_dir_and_entry(dp, ip);
2460         if (error) {
2461                 REMOVE_DEBUG_TRACE(__LINE__);
2462                 xfs_trans_cancel(tp, cancel_flags);
2463                 IRELE(ip);
2464                 goto std_return;
2465         }
2466
2467         /*
2468          * At this point, we've gotten both the directory and the entry
2469          * inodes locked.
2470          */
2471         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2472         if (dp != ip) {
2473                 /*
2474                  * Increment vnode ref count only in this case since
2475                  * there's an extra vnode reference in the case where
2476                  * dp == ip.
2477                  */
2478                 IHOLD(dp);
2479                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2480         }
2481
2482         /*
2483          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2484          */
2485         XFS_BMAP_INIT(&free_list, &first_block);
2486         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2487                                         &first_block, &free_list, 0);
2488         if (error) {
2489                 ASSERT(error != ENOENT);
2490                 REMOVE_DEBUG_TRACE(__LINE__);
2491                 goto error1;
2492         }
2493         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2494
2495         dp->i_gen++;
2496         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2497
2498         error = xfs_droplink(tp, ip);
2499         if (error) {
2500                 REMOVE_DEBUG_TRACE(__LINE__);
2501                 goto error1;
2502         }
2503
2504         /* Determine if this is the last link while
2505          * we are in the transaction.
2506          */
2507         link_zero = (ip)->i_d.di_nlink==0;
2508
2509         /*
2510          * Take an extra ref on the inode so that it doesn't
2511          * go to xfs_inactive() from within the commit.
2512          */
2513         IHOLD(ip);
2514
2515         /*
2516          * If this is a synchronous mount, make sure that the
2517          * remove transaction goes to disk before returning to
2518          * the user.
2519          */
2520         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2521                 xfs_trans_set_sync(tp);
2522         }
2523
2524         error = xfs_bmap_finish(&tp, &free_list, &committed);
2525         if (error) {
2526                 REMOVE_DEBUG_TRACE(__LINE__);
2527                 goto error_rele;
2528         }
2529
2530         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2531         if (error) {
2532                 IRELE(ip);
2533                 goto std_return;
2534         }
2535
2536         /*
2537          * Before we drop our extra reference to the inode, purge it
2538          * from the refcache if it is there.  By waiting until afterwards
2539          * to do the IRELE, we ensure that we won't go inactive in the
2540          * xfs_refcache_purge_ip routine (although that would be OK).
2541          */
2542         xfs_refcache_purge_ip(ip);
2543
2544         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2545
2546         /*
2547          * Let interposed file systems know about removed links.
2548          */
2549         bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2550
2551         IRELE(ip);
2552
2553 /*      Fall through to std_return with error = 0 */
2554  std_return:
2555         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2556                                                 DM_EVENT_POSTREMOVE)) {
2557                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2558                                 dir_vp, DM_RIGHT_NULL,
2559                                 NULL, DM_RIGHT_NULL,
2560                                 name, NULL, dm_di_mode, error, 0);
2561         }
2562         return error;
2563
2564  error1:
2565         xfs_bmap_cancel(&free_list);
2566         cancel_flags |= XFS_TRANS_ABORT;
2567         xfs_trans_cancel(tp, cancel_flags);
2568         goto std_return;
2569
2570  error_rele:
2571         /*
2572          * In this case make sure to not release the inode until after
2573          * the current transaction is aborted.  Releasing it beforehand
2574          * can cause us to go to xfs_inactive and start a recursive
2575          * transaction which can easily deadlock with the current one.
2576          */
2577         xfs_bmap_cancel(&free_list);
2578         cancel_flags |= XFS_TRANS_ABORT;
2579         xfs_trans_cancel(tp, cancel_flags);
2580
2581         /*
2582          * Before we drop our extra reference to the inode, purge it
2583          * from the refcache if it is there.  By waiting until afterwards
2584          * to do the IRELE, we ensure that we won't go inactive in the
2585          * xfs_refcache_purge_ip routine (although that would be OK).
2586          */
2587         xfs_refcache_purge_ip(ip);
2588
2589         IRELE(ip);
2590
2591         goto std_return;
2592 }
2593
2594
2595 /*
2596  * xfs_link
2597  *
2598  */
2599 STATIC int
2600 xfs_link(
2601         bhv_desc_t              *target_dir_bdp,
2602         bhv_vnode_t             *src_vp,
2603         bhv_vname_t             *dentry,
2604         cred_t                  *credp)
2605 {
2606         xfs_inode_t             *tdp, *sip;
2607         xfs_trans_t             *tp;
2608         xfs_mount_t             *mp;
2609         xfs_inode_t             *ips[2];
2610         int                     error;
2611         xfs_bmap_free_t         free_list;
2612         xfs_fsblock_t           first_block;
2613         int                     cancel_flags;
2614         int                     committed;
2615         bhv_vnode_t             *target_dir_vp;
2616         int                     resblks;
2617         char                    *target_name = VNAME(dentry);
2618         int                     target_namelen;
2619
2620         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2621         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2622         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2623
2624         target_namelen = VNAMELEN(dentry);
2625         ASSERT(!VN_ISDIR(src_vp));
2626
2627         sip = xfs_vtoi(src_vp);
2628         tdp = XFS_BHVTOI(target_dir_bdp);
2629         mp = tdp->i_mount;
2630         if (XFS_FORCED_SHUTDOWN(mp))
2631                 return XFS_ERROR(EIO);
2632
2633         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2634                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2635                                         target_dir_vp, DM_RIGHT_NULL,
2636                                         src_vp, DM_RIGHT_NULL,
2637                                         target_name, NULL, 0, 0, 0);
2638                 if (error)
2639                         return error;
2640         }
2641
2642         /* Return through std_return after this point. */
2643
2644         error = XFS_QM_DQATTACH(mp, sip, 0);
2645         if (!error && sip != tdp)
2646                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2647         if (error)
2648                 goto std_return;
2649
2650         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2651         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2652         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2653         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2654                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2655         if (error == ENOSPC) {
2656                 resblks = 0;
2657                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2658                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2659         }
2660         if (error) {
2661                 cancel_flags = 0;
2662                 goto error_return;
2663         }
2664
2665         if (sip->i_ino < tdp->i_ino) {
2666                 ips[0] = sip;
2667                 ips[1] = tdp;
2668         } else {
2669                 ips[0] = tdp;
2670                 ips[1] = sip;
2671         }
2672
2673         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2674
2675         /*
2676          * Increment vnode ref counts since xfs_trans_commit &
2677          * xfs_trans_cancel will both unlock the inodes and
2678          * decrement the associated ref counts.
2679          */
2680         VN_HOLD(src_vp);
2681         VN_HOLD(target_dir_vp);
2682         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2683         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2684
2685         /*
2686          * If the source has too many links, we can't make any more to it.
2687          */
2688         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2689                 error = XFS_ERROR(EMLINK);
2690                 goto error_return;
2691         }
2692
2693         /*
2694          * If we are using project inheritance, we only allow hard link
2695          * creation in our tree when the project IDs are the same; else
2696          * the tree quota mechanism could be circumvented.
2697          */
2698         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2699                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2700                 error = XFS_ERROR(EXDEV);
2701                 goto error_return;
2702         }
2703
2704         if (resblks == 0 &&
2705             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2706                 goto error_return;
2707
2708         XFS_BMAP_INIT(&free_list, &first_block);
2709
2710         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2711                                    sip->i_ino, &first_block, &free_list,
2712                                    resblks);
2713         if (error)
2714                 goto abort_return;
2715         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2716         tdp->i_gen++;
2717         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2718
2719         error = xfs_bumplink(tp, sip);
2720         if (error)
2721                 goto abort_return;
2722
2723         /*
2724          * If this is a synchronous mount, make sure that the
2725          * link transaction goes to disk before returning to
2726          * the user.
2727          */
2728         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2729                 xfs_trans_set_sync(tp);
2730         }
2731
2732         error = xfs_bmap_finish (&tp, &free_list, &committed);
2733         if (error) {
2734                 xfs_bmap_cancel(&free_list);
2735                 goto abort_return;
2736         }
2737
2738         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2739         if (error)
2740                 goto std_return;
2741
2742         /* Fall through to std_return with error = 0. */
2743 std_return:
2744         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2745                                                 DM_EVENT_POSTLINK)) {
2746                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2747                                 target_dir_vp, DM_RIGHT_NULL,
2748                                 src_vp, DM_RIGHT_NULL,
2749                                 target_name, NULL, 0, error, 0);
2750         }
2751         return error;
2752
2753  abort_return:
2754         cancel_flags |= XFS_TRANS_ABORT;
2755         /* FALLTHROUGH */
2756
2757  error_return:
2758         xfs_trans_cancel(tp, cancel_flags);
2759         goto std_return;
2760 }
2761
2762
2763 /*
2764  * xfs_mkdir
2765  *
2766  */
2767 STATIC int
2768 xfs_mkdir(
2769         bhv_desc_t              *dir_bdp,
2770         bhv_vname_t             *dentry,
2771         bhv_vattr_t             *vap,
2772         bhv_vnode_t             **vpp,
2773         cred_t                  *credp)
2774 {
2775         char                    *dir_name = VNAME(dentry);
2776         xfs_inode_t             *dp;
2777         xfs_inode_t             *cdp;   /* inode of created dir */
2778         bhv_vnode_t             *cvp;   /* vnode of created dir */
2779         xfs_trans_t             *tp;
2780         xfs_mount_t             *mp;
2781         int                     cancel_flags;
2782         int                     error;
2783         int                     committed;
2784         xfs_bmap_free_t         free_list;
2785         xfs_fsblock_t           first_block;
2786         bhv_vnode_t             *dir_vp;
2787         boolean_t               dp_joined_to_trans;
2788         boolean_t               created = B_FALSE;
2789         int                     dm_event_sent = 0;
2790         xfs_prid_t              prid;
2791         struct xfs_dquot        *udqp, *gdqp;
2792         uint                    resblks;
2793         int                     dm_di_mode;
2794         int                     dir_namelen;
2795
2796         dir_vp = BHV_TO_VNODE(dir_bdp);
2797         dp = XFS_BHVTOI(dir_bdp);
2798         mp = dp->i_mount;
2799
2800         if (XFS_FORCED_SHUTDOWN(mp))
2801                 return XFS_ERROR(EIO);
2802
2803         dir_namelen = VNAMELEN(dentry);
2804
2805         tp = NULL;
2806         dp_joined_to_trans = B_FALSE;
2807         dm_di_mode = vap->va_mode;
2808
2809         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2810                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2811                                         dir_vp, DM_RIGHT_NULL, NULL,
2812                                         DM_RIGHT_NULL, dir_name, NULL,
2813                                         dm_di_mode, 0, 0);
2814                 if (error)
2815                         return error;
2816                 dm_event_sent = 1;
2817         }
2818
2819         /* Return through std_return after this point. */
2820
2821         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2822
2823         mp = dp->i_mount;
2824         udqp = gdqp = NULL;
2825         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2826                 prid = dp->i_d.di_projid;
2827         else if (vap->va_mask & XFS_AT_PROJID)
2828                 prid = (xfs_prid_t)vap->va_projid;
2829         else
2830                 prid = (xfs_prid_t)dfltprid;
2831
2832         /*
2833          * Make sure that we have allocated dquot(s) on disk.
2834          */
2835         error = XFS_QM_DQVOPALLOC(mp, dp,
2836                         current_fsuid(credp), current_fsgid(credp), prid,
2837                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2838         if (error)
2839                 goto std_return;
2840
2841         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2842         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2843         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2844         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2845                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2846         if (error == ENOSPC) {
2847                 resblks = 0;
2848                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2849                                           XFS_TRANS_PERM_LOG_RES,
2850                                           XFS_MKDIR_LOG_COUNT);
2851         }
2852         if (error) {
2853                 cancel_flags = 0;
2854                 dp = NULL;
2855                 goto error_return;
2856         }
2857
2858         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2859
2860         /*
2861          * Check for directory link count overflow.
2862          */
2863         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2864                 error = XFS_ERROR(EMLINK);
2865                 goto error_return;
2866         }
2867
2868         /*
2869          * Reserve disk quota and the inode.
2870          */
2871         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2872         if (error)
2873                 goto error_return;
2874
2875         if (resblks == 0 &&
2876             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2877                 goto error_return;
2878         /*
2879          * create the directory inode.
2880          */
2881         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2882                         0, credp, prid, resblks > 0,
2883                 &cdp, NULL);
2884         if (error) {
2885                 if (error == ENOSPC)
2886                         goto error_return;
2887                 goto abort_return;
2888         }
2889         ITRACE(cdp);
2890
2891         /*
2892          * Now we add the directory inode to the transaction.
2893          * We waited until now since xfs_dir_ialloc might start
2894          * a new transaction.  Had we joined the transaction
2895          * earlier, the locks might have gotten released.
2896          */
2897         VN_HOLD(dir_vp);
2898         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2899         dp_joined_to_trans = B_TRUE;
2900
2901         XFS_BMAP_INIT(&free_list, &first_block);
2902
2903         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2904                                    &first_block, &free_list, resblks ?
2905                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2906         if (error) {
2907                 ASSERT(error != ENOSPC);
2908                 goto error1;
2909         }
2910         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2911
2912         /*
2913          * Bump the in memory version number of the parent directory
2914          * so that other processes accessing it will recognize that
2915          * the directory has changed.
2916          */
2917         dp->i_gen++;
2918
2919         error = xfs_dir_init(tp, cdp, dp);
2920         if (error)
2921                 goto error2;
2922
2923         cdp->i_gen = 1;
2924         error = xfs_bumplink(tp, dp);
2925         if (error)
2926                 goto error2;
2927
2928         cvp = XFS_ITOV(cdp);
2929
2930         created = B_TRUE;
2931
2932         *vpp = cvp;
2933         IHOLD(cdp);
2934
2935         /*
2936          * Attach the dquots to the new inode and modify the icount incore.
2937          */
2938         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2939
2940         /*
2941          * If this is a synchronous mount, make sure that the
2942          * mkdir transaction goes to disk before returning to
2943          * the user.
2944          */
2945         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2946                 xfs_trans_set_sync(tp);
2947         }
2948
2949         error = xfs_bmap_finish(&tp, &free_list, &committed);
2950         if (error) {
2951                 IRELE(cdp);
2952                 goto error2;
2953         }
2954
2955         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2956         XFS_QM_DQRELE(mp, udqp);
2957         XFS_QM_DQRELE(mp, gdqp);
2958         if (error) {
2959                 IRELE(cdp);
2960         }
2961
2962         /* Fall through to std_return with error = 0 or errno from
2963          * xfs_trans_commit. */
2964
2965 std_return:
2966         if ( (created || (error != 0 && dm_event_sent != 0)) &&
2967                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2968                                                 DM_EVENT_POSTCREATE)) {
2969                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2970                                         dir_vp, DM_RIGHT_NULL,
2971                                         created ? XFS_ITOV(cdp):NULL,
2972                                         DM_RIGHT_NULL,
2973                                         dir_name, NULL,
2974                                         dm_di_mode, error, 0);
2975         }
2976         return error;
2977
2978  error2:
2979  error1:
2980         xfs_bmap_cancel(&free_list);
2981  abort_return:
2982         cancel_flags |= XFS_TRANS_ABORT;
2983  error_return:
2984         xfs_trans_cancel(tp, cancel_flags);
2985         XFS_QM_DQRELE(mp, udqp);
2986         XFS_QM_DQRELE(mp, gdqp);
2987
2988         if (!dp_joined_to_trans && (dp != NULL)) {
2989                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2990         }
2991
2992         goto std_return;
2993 }
2994
2995
2996 /*
2997  * xfs_rmdir
2998  *
2999  */
3000 STATIC int
3001 xfs_rmdir(
3002         bhv_desc_t              *dir_bdp,
3003         bhv_vname_t             *dentry,
3004         cred_t                  *credp)
3005 {
3006         char                    *name = VNAME(dentry);
3007         xfs_inode_t             *dp;
3008         xfs_inode_t             *cdp;   /* child directory */
3009         xfs_trans_t             *tp;
3010         xfs_mount_t             *mp;
3011         int                     error;
3012         xfs_bmap_free_t         free_list;
3013         xfs_fsblock_t           first_block;
3014         int                     cancel_flags;
3015         int                     committed;
3016         bhv_vnode_t             *dir_vp;
3017         int                     dm_di_mode = S_IFDIR;
3018         int                     last_cdp_link;
3019         int                     namelen;
3020         uint                    resblks;
3021
3022         dir_vp = BHV_TO_VNODE(dir_bdp);
3023         dp = XFS_BHVTOI(dir_bdp);
3024         mp = dp->i_mount;
3025
3026         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3027
3028         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3029                 return XFS_ERROR(EIO);
3030         namelen = VNAMELEN(dentry);
3031
3032         if (!xfs_get_dir_entry(dentry, &cdp)) {
3033                 dm_di_mode = cdp->i_d.di_mode;
3034                 IRELE(cdp);
3035         }
3036
3037         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3038                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3039                                         dir_vp, DM_RIGHT_NULL,
3040                                         NULL, DM_RIGHT_NULL,
3041                                         name, NULL, dm_di_mode, 0, 0);
3042                 if (error)
3043                         return XFS_ERROR(error);
3044         }
3045
3046         /* Return through std_return after this point. */
3047
3048         cdp = NULL;
3049
3050         /*
3051          * We need to get a reference to cdp before we get our log
3052          * reservation.  The reason for this is that we cannot call
3053          * xfs_iget for an inode for which we do not have a reference
3054          * once we've acquired a log reservation.  This is because the
3055          * inode we are trying to get might be in xfs_inactive going
3056          * for a log reservation.  Since we'll have to wait for the
3057          * inactive code to complete before returning from xfs_iget,
3058          * we need to make sure that we don't have log space reserved
3059          * when we call xfs_iget.  Instead we get an unlocked reference
3060          * to the inode before getting our log reservation.
3061          */
3062         error = xfs_get_dir_entry(dentry, &cdp);
3063         if (error) {
3064                 REMOVE_DEBUG_TRACE(__LINE__);
3065                 goto std_return;
3066         }
3067         mp = dp->i_mount;
3068         dm_di_mode = cdp->i_d.di_mode;
3069
3070         /*
3071          * Get the dquots for the inodes.
3072          */
3073         error = XFS_QM_DQATTACH(mp, dp, 0);
3074         if (!error && dp != cdp)
3075                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3076         if (error) {
3077                 IRELE(cdp);
3078                 REMOVE_DEBUG_TRACE(__LINE__);
3079                 goto std_return;
3080         }
3081
3082         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3083         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3084         /*
3085          * We try to get the real space reservation first,
3086          * allowing for directory btree deletion(s) implying
3087          * possible bmap insert(s).  If we can't get the space
3088          * reservation then we use 0 instead, and avoid the bmap
3089          * btree insert(s) in the directory code by, if the bmap
3090          * insert tries to happen, instead trimming the LAST
3091          * block from the directory.
3092          */
3093         resblks = XFS_REMOVE_SPACE_RES(mp);
3094         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3095                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3096         if (error == ENOSPC) {
3097                 resblks = 0;
3098                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3099                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3100         }
3101         if (error) {
3102                 ASSERT(error != ENOSPC);
3103                 cancel_flags = 0;
3104                 IRELE(cdp);
3105                 goto error_return;
3106         }
3107         XFS_BMAP_INIT(&free_list, &first_block);
3108
3109         /*
3110          * Now lock the child directory inode and the parent directory
3111          * inode in the proper order.  This will take care of validating
3112          * that the directory entry for the child directory inode has
3113          * not changed while we were obtaining a log reservation.
3114          */
3115         error = xfs_lock_dir_and_entry(dp, cdp);
3116         if (error) {
3117                 xfs_trans_cancel(tp, cancel_flags);
3118                 IRELE(cdp);
3119                 goto std_return;
3120         }
3121
3122         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3123         if (dp != cdp) {
3124                 /*
3125                  * Only increment the parent directory vnode count if
3126                  * we didn't bump it in looking up cdp.  The only time
3127                  * we don't bump it is when we're looking up ".".
3128                  */
3129                 VN_HOLD(dir_vp);
3130         }
3131
3132         ITRACE(cdp);
3133         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3134
3135         ASSERT(cdp->i_d.di_nlink >= 2);
3136         if (cdp->i_d.di_nlink != 2) {
3137                 error = XFS_ERROR(ENOTEMPTY);
3138                 goto error_return;
3139         }
3140         if (!xfs_dir_isempty(cdp)) {
3141                 error = XFS_ERROR(ENOTEMPTY);
3142                 goto error_return;
3143         }
3144
3145         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3146                                         &first_block, &free_list, resblks);
3147         if (error)
3148                 goto error1;
3149
3150         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3151
3152         /*
3153          * Bump the in memory generation count on the parent
3154          * directory so that other can know that it has changed.
3155          */
3156         dp->i_gen++;
3157
3158         /*
3159          * Drop the link from cdp's "..".
3160          */
3161         error = xfs_droplink(tp, dp);
3162         if (error) {
3163                 goto error1;
3164         }
3165
3166         /*
3167          * Drop the link from dp to cdp.
3168          */
3169         error = xfs_droplink(tp, cdp);
3170         if (error) {
3171                 goto error1;
3172         }
3173
3174         /*
3175          * Drop the "." link from cdp to self.
3176          */
3177         error = xfs_droplink(tp, cdp);
3178         if (error) {
3179                 goto error1;
3180         }
3181
3182         /* Determine these before committing transaction */
3183         last_cdp_link = (cdp)->i_d.di_nlink==0;
3184
3185         /*
3186          * Take an extra ref on the child vnode so that it
3187          * does not go to xfs_inactive() from within the commit.
3188          */
3189         IHOLD(cdp);
3190
3191         /*
3192          * If this is a synchronous mount, make sure that the
3193          * rmdir transaction goes to disk before returning to
3194          * the user.
3195          */
3196         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3197                 xfs_trans_set_sync(tp);
3198         }
3199
3200         error = xfs_bmap_finish (&tp, &free_list, &committed);
3201         if (error) {
3202                 xfs_bmap_cancel(&free_list);
3203                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3204                                  XFS_TRANS_ABORT));
3205                 IRELE(cdp);
3206                 goto std_return;
3207         }
3208
3209         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3210         if (error) {
3211                 IRELE(cdp);
3212                 goto std_return;
3213         }
3214
3215
3216         /*
3217          * Let interposed file systems know about removed links.
3218          */
3219         bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3220
3221         IRELE(cdp);
3222
3223         /* Fall through to std_return with error = 0 or the errno
3224          * from xfs_trans_commit. */
3225  std_return:
3226         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3227                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3228                                         dir_vp, DM_RIGHT_NULL,
3229                                         NULL, DM_RIGHT_NULL,
3230                                         name, NULL, dm_di_mode,
3231                                         error, 0);
3232         }
3233         return error;
3234
3235  error1:
3236         xfs_bmap_cancel(&free_list);
3237         cancel_flags |= XFS_TRANS_ABORT;
3238         /* FALLTHROUGH */
3239
3240  error_return:
3241         xfs_trans_cancel(tp, cancel_flags);
3242         goto std_return;
3243 }
3244
3245
3246 /*
3247  * Read dp's entries starting at uiop->uio_offset and translate them into
3248  * bufsize bytes worth of struct dirents starting at bufbase.
3249  */
3250 STATIC int
3251 xfs_readdir(
3252         bhv_desc_t      *dir_bdp,
3253         uio_t           *uiop,
3254         cred_t          *credp,
3255         int             *eofp)
3256 {
3257         xfs_inode_t     *dp;
3258         xfs_trans_t     *tp = NULL;
3259         int             error = 0;
3260         uint            lock_mode;
3261
3262         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3263                                                (inst_t *)__return_address);
3264         dp = XFS_BHVTOI(dir_bdp);
3265
3266         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
3267                 return XFS_ERROR(EIO);
3268
3269         lock_mode = xfs_ilock_map_shared(dp);
3270         error = xfs_dir_getdents(tp, dp, uiop, eofp);
3271         xfs_iunlock_map_shared(dp, lock_mode);
3272         return error;
3273 }
3274
3275
3276 STATIC int
3277 xfs_symlink(
3278         bhv_desc_t              *dir_bdp,
3279         bhv_vname_t             *dentry,
3280         bhv_vattr_t             *vap,
3281         char                    *target_path,
3282         bhv_vnode_t             **vpp,
3283         cred_t                  *credp)
3284 {
3285         xfs_trans_t             *tp;
3286         xfs_mount_t             *mp;
3287         xfs_inode_t             *dp;
3288         xfs_inode_t             *ip;
3289         int                     error;
3290         int                     pathlen;
3291         xfs_bmap_free_t         free_list;
3292         xfs_fsblock_t           first_block;
3293         boolean_t               dp_joined_to_trans;
3294         bhv_vnode_t             *dir_vp;
3295         uint                    cancel_flags;
3296         int                     committed;
3297         xfs_fileoff_t           first_fsb;
3298         xfs_filblks_t           fs_blocks;
3299         int                     nmaps;
3300         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3301         xfs_daddr_t             d;
3302         char                    *cur_chunk;
3303         int                     byte_cnt;
3304         int                     n;
3305         xfs_buf_t               *bp;
3306         xfs_prid_t              prid;
3307         struct xfs_dquot        *udqp, *gdqp;
3308         uint                    resblks;
3309         char                    *link_name = VNAME(dentry);
3310         int                     link_namelen;
3311
3312         *vpp = NULL;
3313         dir_vp = BHV_TO_VNODE(dir_bdp);
3314         dp = XFS_BHVTOI(dir_bdp);
3315         dp_joined_to_trans = B_FALSE;
3316         error = 0;
3317         ip = NULL;
3318         tp = NULL;
3319
3320         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3321
3322         mp = dp->i_mount;
3323
3324         if (XFS_FORCED_SHUTDOWN(mp))
3325                 return XFS_ERROR(EIO);
3326
3327         link_namelen = VNAMELEN(dentry);
3328
3329         /*
3330          * Check component lengths of the target path name.
3331          */
3332         pathlen = strlen(target_path);
3333         if (pathlen >= MAXPATHLEN)      /* total string too long */
3334                 return XFS_ERROR(ENAMETOOLONG);
3335         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3336                 int len, total;
3337                 char *path;
3338
3339                 for (total = 0, path = target_path; total < pathlen;) {
3340                         /*
3341                          * Skip any slashes.
3342                          */
3343                         while(*path == '/') {
3344                                 total++;
3345                                 path++;
3346                         }
3347
3348                         /*
3349                          * Count up to the next slash or end of path.
3350                          * Error out if the component is bigger than MAXNAMELEN.
3351                          */
3352                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3353                                 if (++len >= MAXNAMELEN) {
3354                                         error = ENAMETOOLONG;
3355                                         return error;
3356                                 }
3357                         }
3358                 }
3359         }
3360
3361         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3362                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3363                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3364                                         link_name, target_path, 0, 0, 0);
3365                 if (error)
3366                         return error;
3367         }
3368
3369         /* Return through std_return after this point. */
3370
3371         udqp = gdqp = NULL;
3372         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3373                 prid = dp->i_d.di_projid;
3374         else if (vap->va_mask & XFS_AT_PROJID)
3375                 prid = (xfs_prid_t)vap->va_projid;
3376         else
3377                 prid = (xfs_prid_t)dfltprid;
3378
3379         /*
3380          * Make sure that we have allocated dquot(s) on disk.
3381          */
3382         error = XFS_QM_DQVOPALLOC(mp, dp,
3383                         current_fsuid(credp), current_fsgid(credp), prid,
3384                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3385         if (error)
3386                 goto std_return;
3387
3388         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3389         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3390         /*
3391          * The symlink will fit into the inode data fork?
3392          * There can't be any attributes so we get the whole variable part.
3393          */
3394         if (pathlen <= XFS_LITINO(mp))
3395                 fs_blocks = 0;
3396         else
3397                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3398         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3399         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3400                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3401         if (error == ENOSPC && fs_blocks == 0) {
3402                 resblks = 0;
3403                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3404                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3405         }
3406         if (error) {
3407                 cancel_flags = 0;
3408                 dp = NULL;
3409                 goto error_return;
3410         }
3411
3412         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3413
3414         /*
3415          * Check whether the directory allows new symlinks or not.
3416          */
3417         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3418                 error = XFS_ERROR(EPERM);
3419                 goto error_return;
3420         }
3421
3422         /*
3423          * Reserve disk quota : blocks and inode.
3424          */
3425         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3426         if (error)
3427                 goto error_return;
3428
3429         /*
3430          * Check for ability to enter directory entry, if no space reserved.
3431          */
3432         if (resblks == 0 &&
3433             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3434                 goto error_return;
3435         /*
3436          * Initialize the bmap freelist prior to calling either
3437          * bmapi or the directory create code.
3438          */
3439         XFS_BMAP_INIT(&free_list, &first_block);
3440
3441         /*
3442          * Allocate an inode for the symlink.
3443          */
3444         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3445                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3446         if (error) {
3447                 if (error == ENOSPC)
3448                         goto error_return;
3449                 goto error1;
3450         }
3451         ITRACE(ip);
3452
3453         VN_HOLD(dir_vp);
3454         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3455         dp_joined_to_trans = B_TRUE;
3456
3457         /*
3458          * Also attach the dquot(s) to it, if applicable.
3459          */
3460         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3461
3462         if (resblks)
3463                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3464         /*
3465          * If the symlink will fit into the inode, write it inline.
3466          */
3467         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3468                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3469                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3470                 ip->i_d.di_size = pathlen;
3471
3472                 /*
3473                  * The inode was initially created in extent format.
3474                  */
3475                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3476                 ip->i_df.if_flags |= XFS_IFINLINE;
3477
3478                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3479                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3480
3481         } else {
3482                 first_fsb = 0;
3483                 nmaps = SYMLINK_MAPS;
3484
3485                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3486                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3487                                   &first_block, resblks, mval, &nmaps,
3488                                   &free_list, NULL);
3489                 if (error) {
3490                         goto error1;
3491                 }
3492
3493                 if (resblks)
3494                         resblks -= fs_blocks;
3495                 ip->i_d.di_size = pathlen;
3496                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3497
3498                 cur_chunk = target_path;
3499                 for (n = 0; n < nmaps; n++) {
3500                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3501                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3502                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3503                                                BTOBB(byte_cnt), 0);
3504                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3505                         if (pathlen < byte_cnt) {
3506                                 byte_cnt = pathlen;
3507                         }
3508                         pathlen -= byte_cnt;
3509
3510                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3511                         cur_chunk += byte_cnt;
3512
3513                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3514                 }
3515         }
3516
3517         /*
3518          * Create the directory entry for the symlink.
3519          */
3520         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3521                                    &first_block, &free_list, resblks);
3522         if (error)
3523                 goto error1;
3524         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3525         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3526
3527         /*
3528          * Bump the in memory version number of the parent directory
3529          * so that other processes accessing it will recognize that
3530          * the directory has changed.
3531          */
3532         dp->i_gen++;
3533
3534         /*
3535          * If this is a synchronous mount, make sure that the
3536          * symlink transaction goes to disk before returning to
3537          * the user.
3538          */
3539         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3540                 xfs_trans_set_sync(tp);
3541         }
3542
3543         /*
3544          * xfs_trans_commit normally decrements the vnode ref count
3545          * when it unlocks the inode. Since we want to return the
3546          * vnode to the caller, we bump the vnode ref count now.
3547          */
3548         IHOLD(ip);
3549
3550         error = xfs_bmap_finish(&tp, &free_list, &committed);
3551         if (error) {
3552                 goto error2;
3553         }
3554         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3555         XFS_QM_DQRELE(mp, udqp);
3556         XFS_QM_DQRELE(mp, gdqp);
3557
3558         /* Fall through to std_return with error = 0 or errno from
3559          * xfs_trans_commit     */
3560 std_return:
3561         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3562                              DM_EVENT_POSTSYMLINK)) {
3563                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3564                                         dir_vp, DM_RIGHT_NULL,
3565                                         error ? NULL : XFS_ITOV(ip),
3566                                         DM_RIGHT_NULL, link_name, target_path,
3567                                         0, error, 0);
3568         }
3569
3570         if (!error) {
3571                 bhv_vnode_t *vp;
3572
3573                 ASSERT(ip);
3574                 vp = XFS_ITOV(ip);
3575                 *vpp = vp;
3576         }
3577         return error;
3578
3579  error2:
3580         IRELE(ip);
3581  error1:
3582         xfs_bmap_cancel(&free_list);
3583         cancel_flags |= XFS_TRANS_ABORT;
3584  error_return:
3585         xfs_trans_cancel(tp, cancel_flags);
3586         XFS_QM_DQRELE(mp, udqp);
3587         XFS_QM_DQRELE(mp, gdqp);
3588
3589         if (!dp_joined_to_trans && (dp != NULL)) {
3590                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3591         }
3592
3593         goto std_return;
3594 }
3595
3596
3597 /*
3598  * xfs_fid2
3599  *
3600  * A fid routine that takes a pointer to a previously allocated
3601  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3602  */
3603 STATIC int
3604 xfs_fid2(
3605         bhv_desc_t      *bdp,
3606         fid_t           *fidp)
3607 {
3608         xfs_inode_t     *ip;
3609         xfs_fid2_t      *xfid;
3610
3611         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3612                                        (inst_t *)__return_address);
3613         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3614
3615         xfid = (xfs_fid2_t *)fidp;
3616         ip = XFS_BHVTOI(bdp);
3617         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3618         xfid->fid_pad = 0;
3619         /*
3620          * use memcpy because the inode is a long long and there's no
3621          * assurance that xfid->fid_ino is properly aligned.
3622          */
3623         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3624         xfid->fid_gen = ip->i_d.di_gen;
3625
3626         return 0;
3627 }
3628
3629
3630 /*
3631  * xfs_rwlock
3632  */
3633 int
3634 xfs_rwlock(
3635         bhv_desc_t      *bdp,
3636         bhv_vrwlock_t   locktype)
3637 {
3638         xfs_inode_t     *ip;
3639         bhv_vnode_t     *vp;
3640
3641         vp = BHV_TO_VNODE(bdp);
3642         if (VN_ISDIR(vp))
3643                 return 1;
3644         ip = XFS_BHVTOI(bdp);
3645         if (locktype == VRWLOCK_WRITE) {
3646                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3647         } else if (locktype == VRWLOCK_TRY_READ) {
3648                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3649         } else if (locktype == VRWLOCK_TRY_WRITE) {
3650                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3651         } else {
3652                 ASSERT((locktype == VRWLOCK_READ) ||
3653                        (locktype == VRWLOCK_WRITE_DIRECT));
3654                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3655         }
3656
3657         return 1;
3658 }
3659
3660
3661 /*
3662  * xfs_rwunlock
3663  */
3664 void
3665 xfs_rwunlock(
3666         bhv_desc_t      *bdp,
3667         bhv_vrwlock_t   locktype)
3668 {
3669         xfs_inode_t     *ip;
3670         bhv_vnode_t     *vp;
3671
3672         vp = BHV_TO_VNODE(bdp);
3673         if (VN_ISDIR(vp))
3674                 return;
3675         ip = XFS_BHVTOI(bdp);
3676         if (locktype == VRWLOCK_WRITE) {
3677                 /*
3678                  * In the write case, we may have added a new entry to
3679                  * the reference cache.  This might store a pointer to
3680                  * an inode to be released in this inode.  If it is there,
3681                  * clear the pointer and release the inode after unlocking
3682                  * this one.
3683                  */
3684                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3685         } else {
3686                 ASSERT((locktype == VRWLOCK_READ) ||
3687                        (locktype == VRWLOCK_WRITE_DIRECT));
3688                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3689         }
3690         return;
3691 }
3692
3693 STATIC int
3694 xfs_inode_flush(
3695         bhv_desc_t      *bdp,
3696         int             flags)
3697 {
3698         xfs_inode_t     *ip;
3699         xfs_mount_t     *mp;
3700         xfs_inode_log_item_t *iip;
3701         int             error = 0;
3702
3703         ip = XFS_BHVTOI(bdp);
3704         mp = ip->i_mount;
3705         iip = ip->i_itemp;
3706
3707         if (XFS_FORCED_SHUTDOWN(mp))
3708                 return XFS_ERROR(EIO);
3709
3710         /*
3711          * Bypass inodes which have already been cleaned by
3712          * the inode flush clustering code inside xfs_iflush
3713          */
3714         if ((ip->i_update_core == 0) &&
3715             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3716                 return 0;
3717
3718         if (flags & FLUSH_LOG) {
3719                 if (iip && iip->ili_last_lsn) {
3720                         xlog_t          *log = mp->m_log;
3721                         xfs_lsn_t       sync_lsn;
3722                         int             s, log_flags = XFS_LOG_FORCE;
3723
3724                         s = GRANT_LOCK(log);
3725                         sync_lsn = log->l_last_sync_lsn;
3726                         GRANT_UNLOCK(log, s);
3727
3728                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3729                                 return 0;
3730
3731                         if (flags & FLUSH_SYNC)
3732                                 log_flags |= XFS_LOG_SYNC;
3733                         return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3734                 }
3735         }
3736
3737         /*
3738          * We make this non-blocking if the inode is contended,
3739          * return EAGAIN to indicate to the caller that they
3740          * did not succeed. This prevents the flush path from
3741          * blocking on inodes inside another operation right
3742          * now, they get caught later by xfs_sync.
3743          */
3744         if (flags & FLUSH_INODE) {
3745                 int     flush_flags;
3746
3747                 if (xfs_ipincount(ip))
3748                         return EAGAIN;
3749
3750                 if (flags & FLUSH_SYNC) {
3751                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3752                         xfs_iflock(ip);
3753                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3754                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3755                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3756                                 return EAGAIN;
3757                         }
3758                 } else {
3759                         return EAGAIN;
3760                 }
3761
3762                 if (flags & FLUSH_SYNC)
3763                         flush_flags = XFS_IFLUSH_SYNC;
3764                 else
3765                         flush_flags = XFS_IFLUSH_ASYNC;
3766
3767                 error = xfs_iflush(ip, flush_flags);
3768                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3769         }
3770
3771         return error;
3772 }
3773
3774 int
3775 xfs_set_dmattrs (
3776         bhv_desc_t      *bdp,
3777         u_int           evmask,
3778         u_int16_t       state,
3779         cred_t          *credp)
3780 {
3781         xfs_inode_t     *ip;
3782         xfs_trans_t     *tp;
3783         xfs_mount_t     *mp;
3784         int             error;
3785
3786         if (!capable(CAP_SYS_ADMIN))
3787                 return XFS_ERROR(EPERM);
3788
3789         ip = XFS_BHVTOI(bdp);
3790         mp = ip->i_mount;
3791
3792         if (XFS_FORCED_SHUTDOWN(mp))
3793                 return XFS_ERROR(EIO);
3794
3795         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3796         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3797         if (error) {
3798                 xfs_trans_cancel(tp, 0);
3799                 return error;
3800         }
3801         xfs_ilock(ip, XFS_ILOCK_EXCL);
3802         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3803
3804         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3805         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3806
3807         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3808         IHOLD(ip);
3809         error = xfs_trans_commit(tp, 0);
3810
3811         return error;
3812 }
3813
3814 STATIC int
3815 xfs_reclaim(
3816         bhv_desc_t      *bdp)
3817 {
3818         xfs_inode_t     *ip;
3819         bhv_vnode_t     *vp;
3820
3821         vp = BHV_TO_VNODE(bdp);
3822         ip = XFS_BHVTOI(bdp);
3823
3824         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3825
3826         ASSERT(!VN_MAPPED(vp));
3827
3828         /* bad inode, get out here ASAP */
3829         if (VN_BAD(vp)) {
3830                 xfs_ireclaim(ip);
3831                 return 0;
3832         }
3833
3834         vn_iowait(vp);
3835
3836         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3837
3838         /*
3839          * Make sure the atime in the XFS inode is correct before freeing the
3840          * Linux inode.
3841          */
3842         xfs_synchronize_atime(ip);
3843
3844         /*
3845          * If we have nothing to flush with this inode then complete the
3846          * teardown now, otherwise break the link between the xfs inode and the
3847          * linux inode and clean up the xfs inode later. This avoids flushing
3848          * the inode to disk during the delete operation itself.
3849          *
3850          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3851          * first to ensure that xfs_iunpin() will never see an xfs inode
3852          * that has a linux inode being reclaimed. Synchronisation is provided
3853          * by the i_flags_lock.
3854          */
3855         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3856                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3857                 xfs_iflock(ip);
3858                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3859         } else {
3860                 xfs_mount_t     *mp = ip->i_mount;
3861
3862                 /* Protect sync and unpin from us */
3863                 XFS_MOUNT_ILOCK(mp);
3864                 spin_lock(&ip->i_flags_lock);
3865                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3866                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3867                 spin_unlock(&ip->i_flags_lock);
3868                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3869                 XFS_MOUNT_IUNLOCK(mp);
3870         }
3871         return 0;
3872 }
3873
3874 int
3875 xfs_finish_reclaim(
3876         xfs_inode_t     *ip,
3877         int             locked,
3878         int             sync_mode)
3879 {
3880         xfs_ihash_t     *ih = ip->i_hash;
3881         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3882         int             error;
3883
3884         if (vp && VN_BAD(vp))
3885                 goto reclaim;
3886
3887         /* The hash lock here protects a thread in xfs_iget_core from
3888          * racing with us on linking the inode back with a vnode.
3889          * Once we have the XFS_IRECLAIM flag set it will not touch
3890          * us.
3891          */
3892         write_lock(&ih->ih_lock);
3893         spin_lock(&ip->i_flags_lock);
3894         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3895             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3896                 spin_unlock(&ip->i_flags_lock);
3897                 write_unlock(&ih->ih_lock);
3898                 if (locked) {
3899                         xfs_ifunlock(ip);
3900                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3901                 }
3902                 return 1;
3903         }
3904         __xfs_iflags_set(ip, XFS_IRECLAIM);
3905         spin_unlock(&ip->i_flags_lock);
3906         write_unlock(&ih->ih_lock);
3907
3908         /*
3909          * If the inode is still dirty, then flush it out.  If the inode
3910          * is not in the AIL, then it will be OK to flush it delwri as
3911          * long as xfs_iflush() does not keep any references to the inode.
3912          * We leave that decision up to xfs_iflush() since it has the
3913          * knowledge of whether it's OK to simply do a delwri flush of
3914          * the inode or whether we need to wait until the inode is
3915          * pulled from the AIL.
3916          * We get the flush lock regardless, though, just to make sure
3917          * we don't free it while it is being flushed.
3918          */
3919         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3920                 if (!locked) {
3921                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3922                         xfs_iflock(ip);
3923                 }
3924
3925                 if (ip->i_update_core ||
3926                     ((ip->i_itemp != NULL) &&
3927                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3928                         error = xfs_iflush(ip, sync_mode);
3929                         /*
3930                          * If we hit an error, typically because of filesystem
3931                          * shutdown, we don't need to let vn_reclaim to know
3932                          * because we're gonna reclaim the inode anyway.
3933                          */
3934                         if (error) {
3935                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3936                                 goto reclaim;
3937                         }
3938                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3939                 }
3940
3941                 ASSERT(ip->i_update_core == 0);
3942                 ASSERT(ip->i_itemp == NULL ||
3943                        ip->i_itemp->ili_format.ilf_fields == 0);
3944                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3945         } else if (locked) {
3946                 /*
3947                  * We are not interested in doing an iflush if we're
3948                  * in the process of shutting down the filesystem forcibly.
3949                  * So, just reclaim the inode.
3950                  */
3951                 xfs_ifunlock(ip);
3952                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3953         }
3954
3955  reclaim:
3956         xfs_ireclaim(ip);
3957         return 0;
3958 }
3959
3960 int
3961 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3962 {
3963         int             purged;
3964         xfs_inode_t     *ip, *n;
3965         int             done = 0;
3966
3967         while (!done) {
3968                 purged = 0;
3969                 XFS_MOUNT_ILOCK(mp);
3970                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3971                         if (noblock) {
3972                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3973                                         continue;
3974                                 if (xfs_ipincount(ip) ||
3975                                     !xfs_iflock_nowait(ip)) {
3976                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3977                                         continue;
3978                                 }
3979                         }
3980                         XFS_MOUNT_IUNLOCK(mp);
3981                         if (xfs_finish_reclaim(ip, noblock,
3982                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3983                                 delay(1);
3984                         purged = 1;
3985                         break;
3986                 }
3987
3988                 done = !purged;
3989         }
3990
3991         XFS_MOUNT_IUNLOCK(mp);
3992         return 0;
3993 }
3994
3995 /*
3996  * xfs_alloc_file_space()
3997  *      This routine allocates disk space for the given file.
3998  *
3999  *      If alloc_type == 0, this request is for an ALLOCSP type
4000  *      request which will change the file size.  In this case, no
4001  *      DMAPI event will be generated by the call.  A TRUNCATE event
4002  *      will be generated later by xfs_setattr.
4003  *
4004  *      If alloc_type != 0, this request is for a RESVSP type
4005  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
4006  *      lower block boundary byte address is less than the file's
4007  *      length.
4008  *
4009  * RETURNS:
4010  *       0 on success
4011  *      errno on error
4012  *
4013  */
4014 STATIC int
4015 xfs_alloc_file_space(
4016         xfs_inode_t             *ip,
4017         xfs_off_t               offset,
4018         xfs_off_t               len,
4019         int                     alloc_type,
4020         int                     attr_flags)
4021 {
4022         xfs_mount_t             *mp = ip->i_mount;
4023         xfs_off_t               count;
4024         xfs_filblks_t           allocated_fsb;
4025         xfs_filblks_t           allocatesize_fsb;
4026         xfs_extlen_t            extsz, temp;
4027         xfs_fileoff_t           startoffset_fsb;
4028         xfs_fsblock_t           firstfsb;
4029         int                     nimaps;
4030         int                     bmapi_flag;
4031         int                     quota_flag;
4032         int                     rt;
4033         xfs_trans_t             *tp;
4034         xfs_bmbt_irec_t         imaps[1], *imapp;
4035         xfs_bmap_free_t         free_list;
4036         uint                    qblocks, resblks, resrtextents;
4037         int                     committed;
4038         int                     error;
4039
4040         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4041
4042         if (XFS_FORCED_SHUTDOWN(mp))
4043                 return XFS_ERROR(EIO);
4044
4045         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4046                 return error;
4047
4048         if (len <= 0)
4049                 return XFS_ERROR(EINVAL);
4050
4051         rt = XFS_IS_REALTIME_INODE(ip);
4052         extsz = xfs_get_extsz_hint(ip);
4053
4054         count = len;
4055         imapp = &imaps[0];
4056         nimaps = 1;
4057         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4058         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4059         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4060
4061         /*      Generate a DMAPI event if needed.       */
4062         if (alloc_type != 0 && offset < ip->i_size &&
4063                         (attr_flags&ATTR_DMI) == 0  &&
4064                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4065                 xfs_off_t           end_dmi_offset;
4066
4067                 end_dmi_offset = offset+len;
4068                 if (end_dmi_offset > ip->i_size)
4069                         end_dmi_offset = ip->i_size;
4070                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4071                         offset, end_dmi_offset - offset,
4072                         0, NULL);
4073                 if (error)
4074                         return error;
4075         }
4076
4077         /*
4078          * Allocate file space until done or until there is an error
4079          */
4080 retry:
4081         while (allocatesize_fsb && !error) {
4082                 xfs_fileoff_t   s, e;
4083
4084                 /*
4085                  * Determine space reservations for data/realtime.
4086                  */
4087                 if (unlikely(extsz)) {
4088                         s = startoffset_fsb;
4089                         do_div(s, extsz);
4090                         s *= extsz;
4091                         e = startoffset_fsb + allocatesize_fsb;
4092                         if ((temp = do_mod(startoffset_fsb, extsz)))
4093                                 e += temp;
4094                         if ((temp = do_mod(e, extsz)))
4095                                 e += extsz - temp;
4096                 } else {
4097                         s = 0;
4098                         e = allocatesize_fsb;
4099                 }
4100
4101                 if (unlikely(rt)) {
4102                         resrtextents = qblocks = (uint)(e - s);
4103                         resrtextents /= mp->m_sb.sb_rextsize;
4104                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4105                         quota_flag = XFS_QMOPT_RES_RTBLKS;
4106                 } else {
4107                         resrtextents = 0;
4108                         resblks = qblocks = \
4109                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4110                         quota_flag = XFS_QMOPT_RES_REGBLKS;
4111                 }
4112
4113                 /*
4114                  * Allocate and setup the transaction.
4115                  */
4116                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4117                 error = xfs_trans_reserve(tp, resblks,
4118                                           XFS_WRITE_LOG_RES(mp), resrtextents,
4119                                           XFS_TRANS_PERM_LOG_RES,
4120                                           XFS_WRITE_LOG_COUNT);
4121                 /*
4122                  * Check for running out of space
4123                  */
4124                 if (error) {
4125                         /*
4126                          * Free the transaction structure.
4127                          */
4128                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4129                         xfs_trans_cancel(tp, 0);
4130                         break;
4131                 }
4132                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4133                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4134                                                       qblocks, 0, quota_flag);
4135                 if (error)
4136                         goto error1;
4137
4138                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4139                 xfs_trans_ihold(tp, ip);
4140
4141                 /*
4142                  * Issue the xfs_bmapi() call to allocate the blocks
4143                  */
4144                 XFS_BMAP_INIT(&free_list, &firstfsb);
4145                 error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4146                                   allocatesize_fsb, bmapi_flag,
4147                                   &firstfsb, 0, imapp, &nimaps,
4148                                   &free_list, NULL);
4149                 if (error) {
4150                         goto error0;
4151                 }
4152
4153                 /*
4154                  * Complete the transaction
4155                  */
4156                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4157                 if (error) {
4158                         goto error0;
4159                 }
4160
4161                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4162                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4163                 if (error) {
4164                         break;
4165                 }
4166
4167                 allocated_fsb = imapp->br_blockcount;
4168
4169                 if (nimaps == 0) {
4170                         error = XFS_ERROR(ENOSPC);
4171                         break;
4172                 }
4173
4174                 startoffset_fsb += allocated_fsb;
4175                 allocatesize_fsb -= allocated_fsb;
4176         }
4177 dmapi_enospc_check:
4178         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4179             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4180
4181                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4182                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4183                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4184                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4185                 if (error == 0)
4186                         goto retry;     /* Maybe DMAPI app. has made space */
4187                 /* else fall through with error from XFS_SEND_DATA */
4188         }
4189
4190         return error;
4191
4192 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4193         xfs_bmap_cancel(&free_list);
4194         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4195
4196 error1: /* Just cancel transaction */
4197         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4198         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4199         goto dmapi_enospc_check;
4200 }
4201
4202 /*
4203  * Zero file bytes between startoff and endoff inclusive.
4204  * The iolock is held exclusive and no blocks are buffered.
4205  */
4206 STATIC int
4207 xfs_zero_remaining_bytes(
4208         xfs_inode_t             *ip,
4209         xfs_off_t               startoff,
4210         xfs_off_t               endoff)
4211 {
4212         xfs_bmbt_irec_t         imap;
4213         xfs_fileoff_t           offset_fsb;
4214         xfs_off_t               lastoffset;
4215         xfs_off_t               offset;
4216         xfs_buf_t               *bp;
4217         xfs_mount_t             *mp = ip->i_mount;
4218         int                     nimap;
4219         int                     error = 0;
4220
4221         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4222                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4223                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4224
4225         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4226                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4227                 nimap = 1;
4228                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4229                         NULL, 0, &imap, &nimap, NULL, NULL);
4230                 if (error || nimap < 1)
4231                         break;
4232                 ASSERT(imap.br_blockcount >= 1);
4233                 ASSERT(imap.br_startoff == offset_fsb);
4234                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4235                 if (lastoffset > endoff)
4236                         lastoffset = endoff;
4237                 if (imap.br_startblock == HOLESTARTBLOCK)
4238                         continue;
4239                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4240                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4241                         continue;
4242                 XFS_BUF_UNDONE(bp);
4243                 XFS_BUF_UNWRITE(bp);
4244                 XFS_BUF_READ(bp);
4245                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4246                 xfsbdstrat(mp, bp);
4247                 if ((error = xfs_iowait(bp))) {
4248                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4249                                           mp, bp, XFS_BUF_ADDR(bp));
4250                         break;
4251                 }
4252                 memset(XFS_BUF_PTR(bp) +
4253                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4254                       0, lastoffset - offset + 1);
4255                 XFS_BUF_UNDONE(bp);
4256                 XFS_BUF_UNREAD(bp);
4257                 XFS_BUF_WRITE(bp);
4258                 xfsbdstrat(mp, bp);
4259                 if ((error = xfs_iowait(bp))) {
4260                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4261                                           mp, bp, XFS_BUF_ADDR(bp));
4262                         break;
4263                 }
4264         }
4265         xfs_buf_free(bp);
4266         return error;
4267 }
4268
4269 /*
4270  * xfs_free_file_space()
4271  *      This routine frees disk space for the given file.
4272  *
4273  *      This routine is only called by xfs_change_file_space
4274  *      for an UNRESVSP type call.
4275  *
4276  * RETURNS:
4277  *       0 on success
4278  *      errno on error
4279  *
4280  */
4281 STATIC int
4282 xfs_free_file_space(
4283         xfs_inode_t             *ip,
4284         xfs_off_t               offset,
4285         xfs_off_t               len,
4286         int                     attr_flags)
4287 {
4288         bhv_vnode_t             *vp;
4289         int                     committed;
4290         int                     done;
4291         xfs_off_t               end_dmi_offset;
4292         xfs_fileoff_t           endoffset_fsb;
4293         int                     error;
4294         xfs_fsblock_t           firstfsb;
4295         xfs_bmap_free_t         free_list;
4296         xfs_bmbt_irec_t         imap;
4297         xfs_off_t               ioffset;
4298         xfs_extlen_t            mod=0;
4299         xfs_mount_t             *mp;
4300         int                     nimap;
4301         uint                    resblks;
4302         uint                    rounding;
4303         int                     rt;
4304         xfs_fileoff_t           startoffset_fsb;
4305         xfs_trans_t             *tp;
4306         int                     need_iolock = 1;
4307
4308         vp = XFS_ITOV(ip);
4309         mp = ip->i_mount;
4310
4311         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4312
4313         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4314                 return error;
4315
4316         error = 0;
4317         if (len <= 0)   /* if nothing being freed */
4318                 return error;
4319         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4320         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4321         end_dmi_offset = offset + len;
4322         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4323
4324         if (offset < ip->i_size &&
4325             (attr_flags & ATTR_DMI) == 0 &&
4326             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4327                 if (end_dmi_offset > ip->i_size)
4328                         end_dmi_offset = ip->i_size;
4329                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4330                                 offset, end_dmi_offset - offset,
4331                                 AT_DELAY_FLAG(attr_flags), NULL);
4332                 if (error)
4333                         return error;
4334         }
4335
4336         if (attr_flags & ATTR_NOLOCK)
4337                 need_iolock = 0;
4338         if (need_iolock) {
4339                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4340                 vn_iowait(vp);  /* wait for the completion of any pending DIOs */
4341         }
4342
4343         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4344         ioffset = offset & ~(rounding - 1);
4345
4346         if (VN_CACHED(vp) != 0) {
4347                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4348                                 ctooff(offtoct(ioffset)), -1);
4349                 error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
4350                                 -1, FI_REMAPF_LOCKED);
4351                 if (error)
4352                         goto out_unlock_iolock;
4353         }
4354
4355         /*
4356          * Need to zero the stuff we're not freeing, on disk.
4357          * If its a realtime file & can't use unwritten extents then we
4358          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4359          * will take care of it for us.
4360          */
4361         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4362                 nimap = 1;
4363                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4364                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4365                 if (error)
4366                         goto out_unlock_iolock;
4367                 ASSERT(nimap == 0 || nimap == 1);
4368                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4369                         xfs_daddr_t     block;
4370
4371                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4372                         block = imap.br_startblock;
4373                         mod = do_div(block, mp->m_sb.sb_rextsize);
4374                         if (mod)
4375                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4376                 }
4377                 nimap = 1;
4378                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4379                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4380                 if (error)
4381                         goto out_unlock_iolock;
4382                 ASSERT(nimap == 0 || nimap == 1);
4383                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4384                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4385                         mod++;
4386                         if (mod && (mod != mp->m_sb.sb_rextsize))
4387                                 endoffset_fsb -= mod;
4388                 }
4389         }
4390         if ((done = (endoffset_fsb <= startoffset_fsb)))
4391                 /*
4392                  * One contiguous piece to clear
4393                  */
4394                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4395         else {
4396                 /*
4397                  * Some full blocks, possibly two pieces to clear
4398                  */
4399                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4400                         error = xfs_zero_remaining_bytes(ip, offset,
4401                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4402                 if (!error &&
4403                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4404                         error = xfs_zero_remaining_bytes(ip,
4405                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4406                                 offset + len - 1);
4407         }
4408
4409         /*
4410          * free file space until done or until there is an error
4411          */
4412         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4413         while (!error && !done) {
4414
4415                 /*
4416                  * allocate and setup the transaction
4417                  */
4418                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4419                 error = xfs_trans_reserve(tp,
4420                                           resblks,
4421                                           XFS_WRITE_LOG_RES(mp),
4422                                           0,
4423                                           XFS_TRANS_PERM_LOG_RES,
4424                                           XFS_WRITE_LOG_COUNT);
4425
4426                 /*
4427                  * check for running out of space
4428                  */
4429                 if (error) {
4430                         /*
4431                          * Free the transaction structure.
4432                          */
4433                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4434                         xfs_trans_cancel(tp, 0);
4435                         break;
4436                 }
4437                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4438                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4439                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4440                                 XFS_QMOPT_RES_REGBLKS);
4441                 if (error)
4442                         goto error1;
4443
4444                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4445                 xfs_trans_ihold(tp, ip);
4446
4447                 /*
4448                  * issue the bunmapi() call to free the blocks
4449                  */
4450                 XFS_BMAP_INIT(&free_list, &firstfsb);
4451                 error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4452                                   endoffset_fsb - startoffset_fsb,
4453                                   0, 2, &firstfsb, &free_list, NULL, &done);
4454                 if (error) {
4455                         goto error0;
4456                 }
4457
4458                 /*
4459                  * complete the transaction
4460                  */
4461                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4462                 if (error) {
4463                         goto error0;
4464                 }
4465
4466                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4467                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4468         }
4469
4470  out_unlock_iolock:
4471         if (need_iolock)
4472                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4473         return error;
4474
4475  error0:
4476         xfs_bmap_cancel(&free_list);
4477  error1:
4478         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4479         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4480                     XFS_ILOCK_EXCL);
4481         return error;
4482 }
4483
4484 /*
4485  * xfs_change_file_space()
4486  *      This routine allocates or frees disk space for the given file.
4487  *      The user specified parameters are checked for alignment and size
4488  *      limitations.
4489  *
4490  * RETURNS:
4491  *       0 on success
4492  *      errno on error
4493  *
4494  */
4495 int
4496 xfs_change_file_space(
4497         bhv_desc_t      *bdp,
4498         int             cmd,
4499         xfs_flock64_t   *bf,
4500         xfs_off_t       offset,
4501         cred_t          *credp,
4502         int             attr_flags)
4503 {
4504         int             clrprealloc;
4505         int             error;
4506         xfs_fsize_t     fsize;
4507         xfs_inode_t     *ip;
4508         xfs_mount_t     *mp;
4509         int             setprealloc;
4510         xfs_off_t       startoffset;
4511         xfs_off_t       llen;
4512         xfs_trans_t     *tp;
4513         bhv_vattr_t     va;
4514         bhv_vnode_t     *vp;
4515
4516         vp = BHV_TO_VNODE(bdp);
4517         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4518
4519         ip = XFS_BHVTOI(bdp);
4520         mp = ip->i_mount;
4521
4522         /*
4523          * must be a regular file and have write permission
4524          */
4525         if (!VN_ISREG(vp))
4526                 return XFS_ERROR(EINVAL);
4527
4528         xfs_ilock(ip, XFS_ILOCK_SHARED);
4529
4530         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4531                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4532                 return error;
4533         }
4534
4535         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4536
4537         switch (bf->l_whence) {
4538         case 0: /*SEEK_SET*/
4539                 break;
4540         case 1: /*SEEK_CUR*/
4541                 bf->l_start += offset;
4542                 break;
4543         case 2: /*SEEK_END*/
4544                 bf->l_start += ip->i_size;
4545                 break;
4546         default:
4547                 return XFS_ERROR(EINVAL);
4548         }
4549
4550         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4551
4552         if (   (bf->l_start < 0)
4553             || (bf->l_start > XFS_MAXIOFFSET(mp))
4554             || (bf->l_start + llen < 0)
4555             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4556                 return XFS_ERROR(EINVAL);
4557
4558         bf->l_whence = 0;
4559
4560         startoffset = bf->l_start;
4561         fsize = ip->i_size;
4562
4563         /*
4564          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4565          * file space.
4566          * These calls do NOT zero the data space allocated to the file,
4567          * nor do they change the file size.
4568          *
4569          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4570          * space.
4571          * These calls cause the new file data to be zeroed and the file
4572          * size to be changed.
4573          */
4574         setprealloc = clrprealloc = 0;
4575
4576         switch (cmd) {
4577         case XFS_IOC_RESVSP:
4578         case XFS_IOC_RESVSP64:
4579                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4580                                                                 1, attr_flags);
4581                 if (error)
4582                         return error;
4583                 setprealloc = 1;
4584                 break;
4585
4586         case XFS_IOC_UNRESVSP:
4587         case XFS_IOC_UNRESVSP64:
4588                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4589                                                                 attr_flags)))
4590                         return error;
4591                 break;
4592
4593         case XFS_IOC_ALLOCSP:
4594         case XFS_IOC_ALLOCSP64:
4595         case XFS_IOC_FREESP:
4596         case XFS_IOC_FREESP64:
4597                 if (startoffset > fsize) {
4598                         error = xfs_alloc_file_space(ip, fsize,
4599                                         startoffset - fsize, 0, attr_flags);
4600                         if (error)
4601                                 break;
4602                 }
4603
4604                 va.va_mask = XFS_AT_SIZE;
4605                 va.va_size = startoffset;
4606
4607                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4608
4609                 if (error)
4610                         return error;
4611
4612                 clrprealloc = 1;
4613                 break;
4614
4615         default:
4616                 ASSERT(0);
4617                 return XFS_ERROR(EINVAL);
4618         }
4619
4620         /*
4621          * update the inode timestamp, mode, and prealloc flag bits
4622          */
4623         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4624
4625         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4626                                       0, 0, 0))) {
4627                 /* ASSERT(0); */
4628                 xfs_trans_cancel(tp, 0);
4629                 return error;
4630         }
4631
4632         xfs_ilock(ip, XFS_ILOCK_EXCL);
4633
4634         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4635         xfs_trans_ihold(tp, ip);
4636
4637         if ((attr_flags & ATTR_DMI) == 0) {
4638                 ip->i_d.di_mode &= ~S_ISUID;
4639
4640                 /*
4641                  * Note that we don't have to worry about mandatory
4642                  * file locking being disabled here because we only
4643                  * clear the S_ISGID bit if the Group execute bit is
4644                  * on, but if it was on then mandatory locking wouldn't
4645                  * have been enabled.
4646                  */
4647                 if (ip->i_d.di_mode & S_IXGRP)
4648                         ip->i_d.di_mode &= ~S_ISGID;
4649
4650                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4651         }
4652         if (setprealloc)
4653                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4654         else if (clrprealloc)
4655                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4656
4657         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4658         xfs_trans_set_sync(tp);
4659
4660         error = xfs_trans_commit(tp, 0);
4661
4662         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4663
4664         return error;
4665 }
4666
4667 bhv_vnodeops_t xfs_vnodeops = {
4668         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4669         .vop_open               = xfs_open,
4670         .vop_read               = xfs_read,
4671 #ifdef HAVE_SPLICE
4672         .vop_splice_read        = xfs_splice_read,
4673         .vop_splice_write       = xfs_splice_write,
4674 #endif
4675         .vop_write              = xfs_write,
4676         .vop_ioctl              = xfs_ioctl,
4677         .vop_getattr            = xfs_getattr,
4678         .vop_setattr            = xfs_setattr,
4679         .vop_access             = xfs_access,
4680         .vop_lookup             = xfs_lookup,
4681         .vop_create             = xfs_create,
4682         .vop_remove             = xfs_remove,
4683         .vop_link               = xfs_link,
4684         .vop_rename             = xfs_rename,
4685         .vop_mkdir              = xfs_mkdir,
4686         .vop_rmdir              = xfs_rmdir,
4687         .vop_readdir            = xfs_readdir,
4688         .vop_symlink            = xfs_symlink,
4689         .vop_readlink           = xfs_readlink,
4690         .vop_fsync              = xfs_fsync,
4691         .vop_inactive           = xfs_inactive,
4692         .vop_fid2               = xfs_fid2,
4693         .vop_rwlock             = xfs_rwlock,
4694         .vop_rwunlock           = xfs_rwunlock,
4695         .vop_bmap               = xfs_bmap,
4696         .vop_reclaim            = xfs_reclaim,
4697         .vop_attr_get           = xfs_attr_get,
4698         .vop_attr_set           = xfs_attr_set,
4699         .vop_attr_remove        = xfs_attr_remove,
4700         .vop_attr_list          = xfs_attr_list,
4701         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4702         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4703         .vop_tosspages          = fs_tosspages,
4704         .vop_flushinval_pages   = fs_flushinval_pages,
4705         .vop_flush_pages        = fs_flush_pages,
4706         .vop_release            = xfs_release,
4707         .vop_iflush             = xfs_inode_flush,
4708 };