]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blob - fs/xfs/xfs_vnodeops.c
[XFS] Now that xfs_setattr is only used for attributes set from ->setattr
[linux-2.6-omap-h63xx.git] / fs / xfs / xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h"
31 #include "xfs_da_btree.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h"
38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h"
40 #include "xfs_itable.h"
41 #include "xfs_btree.h"
42 #include "xfs_ialloc.h"
43 #include "xfs_alloc.h"
44 #include "xfs_bmap.h"
45 #include "xfs_attr.h"
46 #include "xfs_rw.h"
47 #include "xfs_error.h"
48 #include "xfs_quota.h"
49 #include "xfs_utils.h"
50 #include "xfs_rtalloc.h"
51 #include "xfs_trans_space.h"
52 #include "xfs_log_priv.h"
53 #include "xfs_filestream.h"
54 #include "xfs_vnodeops.h"
55
56 int
57 xfs_open(
58         xfs_inode_t     *ip)
59 {
60         int             mode;
61
62         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
63                 return XFS_ERROR(EIO);
64
65         /*
66          * If it's a directory with any blocks, read-ahead block 0
67          * as we're almost certain to have the next operation be a read there.
68          */
69         if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
70                 mode = xfs_ilock_map_shared(ip);
71                 if (ip->i_d.di_nextents > 0)
72                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
73                 xfs_iunlock(ip, mode);
74         }
75         return 0;
76 }
77
78 int
79 xfs_setattr(
80         struct xfs_inode        *ip,
81         struct iattr            *iattr,
82         int                     flags,
83         cred_t                  *credp)
84 {
85         xfs_mount_t             *mp = ip->i_mount;
86         int                     mask = iattr->ia_valid;
87         xfs_trans_t             *tp;
88         int                     code;
89         uint                    lock_flags;
90         uint                    commit_flags=0;
91         uid_t                   uid=0, iuid=0;
92         gid_t                   gid=0, igid=0;
93         int                     timeflags = 0;
94         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
95         int                     file_owner;
96         int                     need_iolock = 1;
97
98         xfs_itrace_entry(ip);
99
100         if (mp->m_flags & XFS_MOUNT_RDONLY)
101                 return XFS_ERROR(EROFS);
102
103         if (XFS_FORCED_SHUTDOWN(mp))
104                 return XFS_ERROR(EIO);
105
106         olddquot1 = olddquot2 = NULL;
107         udqp = gdqp = NULL;
108
109         /*
110          * If disk quotas is on, we make sure that the dquots do exist on disk,
111          * before we start any other transactions. Trying to do this later
112          * is messy. We don't care to take a readlock to look at the ids
113          * in inode here, because we can't hold it across the trans_reserve.
114          * If the IDs do change before we take the ilock, we're covered
115          * because the i_*dquot fields will get updated anyway.
116          */
117         if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
118                 uint    qflags = 0;
119
120                 if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
121                         uid = iattr->ia_uid;
122                         qflags |= XFS_QMOPT_UQUOTA;
123                 } else {
124                         uid = ip->i_d.di_uid;
125                 }
126                 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
127                         gid = iattr->ia_gid;
128                         qflags |= XFS_QMOPT_GQUOTA;
129                 }  else {
130                         gid = ip->i_d.di_gid;
131                 }
132
133                 /*
134                  * We take a reference when we initialize udqp and gdqp,
135                  * so it is important that we never blindly double trip on
136                  * the same variable. See xfs_create() for an example.
137                  */
138                 ASSERT(udqp == NULL);
139                 ASSERT(gdqp == NULL);
140                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
141                                          qflags, &udqp, &gdqp);
142                 if (code)
143                         return code;
144         }
145
146         /*
147          * For the other attributes, we acquire the inode lock and
148          * first do an error checking pass.
149          */
150         tp = NULL;
151         lock_flags = XFS_ILOCK_EXCL;
152         if (flags & XFS_ATTR_NOLOCK)
153                 need_iolock = 0;
154         if (!(mask & ATTR_SIZE)) {
155                 if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
156                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
157                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
158                         commit_flags = 0;
159                         if ((code = xfs_trans_reserve(tp, 0,
160                                                      XFS_ICHANGE_LOG_RES(mp), 0,
161                                                      0, 0))) {
162                                 lock_flags = 0;
163                                 goto error_return;
164                         }
165                 }
166         } else {
167                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
168                     !(flags & XFS_ATTR_DMI)) {
169                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
170                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
171                                 iattr->ia_size, 0, dmflags, NULL);
172                         if (code) {
173                                 lock_flags = 0;
174                                 goto error_return;
175                         }
176                 }
177                 if (need_iolock)
178                         lock_flags |= XFS_IOLOCK_EXCL;
179         }
180
181         xfs_ilock(ip, lock_flags);
182
183         /* boolean: are we the file owner? */
184         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
185
186         /*
187          * Change various properties of a file.
188          * Only the owner or users with CAP_FOWNER
189          * capability may do these things.
190          */
191         if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
192                 /*
193                  * CAP_FOWNER overrides the following restrictions:
194                  *
195                  * The user ID of the calling process must be equal
196                  * to the file owner ID, except in cases where the
197                  * CAP_FSETID capability is applicable.
198                  */
199                 if (!file_owner && !capable(CAP_FOWNER)) {
200                         code = XFS_ERROR(EPERM);
201                         goto error_return;
202                 }
203
204                 /*
205                  * CAP_FSETID overrides the following restrictions:
206                  *
207                  * The effective user ID of the calling process shall match
208                  * the file owner when setting the set-user-ID and
209                  * set-group-ID bits on that file.
210                  *
211                  * The effective group ID or one of the supplementary group
212                  * IDs of the calling process shall match the group owner of
213                  * the file when setting the set-group-ID bit on that file
214                  */
215                 if (mask & ATTR_MODE) {
216                         mode_t m = 0;
217
218                         if ((iattr->ia_mode & S_ISUID) && !file_owner)
219                                 m |= S_ISUID;
220                         if ((iattr->ia_mode & S_ISGID) &&
221                             !in_group_p((gid_t)ip->i_d.di_gid))
222                                 m |= S_ISGID;
223 #if 0
224                         /* Linux allows this, Irix doesn't. */
225                         if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
226                                 m |= S_ISVTX;
227 #endif
228                         if (m && !capable(CAP_FSETID))
229                                 iattr->ia_mode &= ~m;
230                 }
231         }
232
233         /*
234          * Change file ownership.  Must be the owner or privileged.
235          * If the system was configured with the "restricted_chown"
236          * option, the owner is not permitted to give away the file,
237          * and can change the group id only to a group of which he
238          * or she is a member.
239          */
240         if (mask & (ATTR_UID|ATTR_GID)) {
241                 /*
242                  * These IDs could have changed since we last looked at them.
243                  * But, we're assured that if the ownership did change
244                  * while we didn't have the inode locked, inode's dquot(s)
245                  * would have changed also.
246                  */
247                 iuid = ip->i_d.di_uid;
248                 igid = ip->i_d.di_gid;
249                 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
250                 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
251
252                 /*
253                  * CAP_CHOWN overrides the following restrictions:
254                  *
255                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
256                  * shall override the restriction that a process cannot
257                  * change the user ID of a file it owns and the restriction
258                  * that the group ID supplied to the chown() function
259                  * shall be equal to either the group ID or one of the
260                  * supplementary group IDs of the calling process.
261                  */
262                 if (restricted_chown &&
263                     (iuid != uid || (igid != gid &&
264                                      !in_group_p((gid_t)gid))) &&
265                     !capable(CAP_CHOWN)) {
266                         code = XFS_ERROR(EPERM);
267                         goto error_return;
268                 }
269                 /*
270                  * Do a quota reservation only if uid/gid is actually
271                  * going to change.
272                  */
273                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
274                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
275                         ASSERT(tp);
276                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
277                                                 capable(CAP_FOWNER) ?
278                                                 XFS_QMOPT_FORCE_RES : 0);
279                         if (code)       /* out of quota */
280                                 goto error_return;
281                 }
282         }
283
284         /*
285          * Truncate file.  Must have write permission and not be a directory.
286          */
287         if (mask & ATTR_SIZE) {
288                 /* Short circuit the truncate case for zero length files */
289                 if (iattr->ia_size == 0 &&
290                     ip->i_size == 0 && ip->i_d.di_nextents == 0) {
291                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
292                         lock_flags &= ~XFS_ILOCK_EXCL;
293                         if (mask & ATTR_CTIME)
294                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
295                         code = 0;
296                         goto error_return;
297                 }
298
299                 if (S_ISDIR(ip->i_d.di_mode)) {
300                         code = XFS_ERROR(EISDIR);
301                         goto error_return;
302                 } else if (!S_ISREG(ip->i_d.di_mode)) {
303                         code = XFS_ERROR(EINVAL);
304                         goto error_return;
305                 }
306                 /*
307                  * Make sure that the dquots are attached to the inode.
308                  */
309                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
310                         goto error_return;
311         }
312
313         /*
314          * Change file access or modified times.
315          */
316         if (mask & (ATTR_ATIME|ATTR_MTIME)) {
317                 if (!file_owner) {
318                         if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
319                             !capable(CAP_FOWNER)) {
320                                 code = XFS_ERROR(EPERM);
321                                 goto error_return;
322                         }
323                 }
324         }
325
326         /*
327          * Now we can make the changes.  Before we join the inode
328          * to the transaction, if ATTR_SIZE is set then take care of
329          * the part of the truncation that must be done without the
330          * inode lock.  This needs to be done before joining the inode
331          * to the transaction, because the inode cannot be unlocked
332          * once it is a part of the transaction.
333          */
334         if (mask & ATTR_SIZE) {
335                 code = 0;
336                 if (iattr->ia_size > ip->i_size) {
337                         /*
338                          * Do the first part of growing a file: zero any data
339                          * in the last block that is beyond the old EOF.  We
340                          * need to do this before the inode is joined to the
341                          * transaction to modify the i_size.
342                          */
343                         code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
344                 }
345                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
346
347                 /*
348                  * We are going to log the inode size change in this
349                  * transaction so any previous writes that are beyond the on
350                  * disk EOF and the new EOF that have not been written out need
351                  * to be written here. If we do not write the data out, we
352                  * expose ourselves to the null files problem.
353                  *
354                  * Only flush from the on disk size to the smaller of the in
355                  * memory file size or the new size as that's the range we
356                  * really care about here and prevents waiting for other data
357                  * not within the range we care about here.
358                  */
359                 if (!code &&
360                     ip->i_size != ip->i_d.di_size &&
361                     iattr->ia_size > ip->i_d.di_size) {
362                         code = xfs_flush_pages(ip,
363                                         ip->i_d.di_size, iattr->ia_size,
364                                         XFS_B_ASYNC, FI_NONE);
365                 }
366
367                 /* wait for all I/O to complete */
368                 vn_iowait(ip);
369
370                 if (!code)
371                         code = xfs_itruncate_data(ip, iattr->ia_size);
372                 if (code) {
373                         ASSERT(tp == NULL);
374                         lock_flags &= ~XFS_ILOCK_EXCL;
375                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
376                         goto error_return;
377                 }
378                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
379                 if ((code = xfs_trans_reserve(tp, 0,
380                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
381                                              XFS_TRANS_PERM_LOG_RES,
382                                              XFS_ITRUNCATE_LOG_COUNT))) {
383                         xfs_trans_cancel(tp, 0);
384                         if (need_iolock)
385                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
386                         return code;
387                 }
388                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
389                 xfs_ilock(ip, XFS_ILOCK_EXCL);
390         }
391
392         if (tp) {
393                 xfs_trans_ijoin(tp, ip, lock_flags);
394                 xfs_trans_ihold(tp, ip);
395         }
396
397         /*
398          * Truncate file.  Must have write permission and not be a directory.
399          */
400         if (mask & ATTR_SIZE) {
401                 /*
402                  * Only change the c/mtime if we are changing the size
403                  * or we are explicitly asked to change it. This handles
404                  * the semantic difference between truncate() and ftruncate()
405                  * as implemented in the VFS.
406                  */
407                 if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
408                         timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
409
410                 if (iattr->ia_size > ip->i_size) {
411                         ip->i_d.di_size = iattr->ia_size;
412                         ip->i_size = iattr->ia_size;
413                         if (!(flags & XFS_ATTR_DMI))
414                                 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
415                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
416                 } else if (iattr->ia_size <= ip->i_size ||
417                            (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
418                         /*
419                          * signal a sync transaction unless
420                          * we're truncating an already unlinked
421                          * file on a wsync filesystem
422                          */
423                         code = xfs_itruncate_finish(&tp, ip, iattr->ia_size,
424                                             XFS_DATA_FORK,
425                                             ((ip->i_d.di_nlink != 0 ||
426                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
427                                              ? 1 : 0));
428                         if (code)
429                                 goto abort_return;
430                         /*
431                          * Truncated "down", so we're removing references
432                          * to old data here - if we now delay flushing for
433                          * a long time, we expose ourselves unduly to the
434                          * notorious NULL files problem.  So, we mark this
435                          * vnode and flush it when the file is closed, and
436                          * do not wait the usual (long) time for writeout.
437                          */
438                         xfs_iflags_set(ip, XFS_ITRUNCATED);
439                 }
440         }
441
442         /*
443          * Change file access modes.
444          */
445         if (mask & ATTR_MODE) {
446                 ip->i_d.di_mode &= S_IFMT;
447                 ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
448
449                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
450                 timeflags |= XFS_ICHGTIME_CHG;
451         }
452
453         /*
454          * Change file ownership.  Must be the owner or privileged.
455          * If the system was configured with the "restricted_chown"
456          * option, the owner is not permitted to give away the file,
457          * and can change the group id only to a group of which he
458          * or she is a member.
459          */
460         if (mask & (ATTR_UID|ATTR_GID)) {
461                 /*
462                  * CAP_FSETID overrides the following restrictions:
463                  *
464                  * The set-user-ID and set-group-ID bits of a file will be
465                  * cleared upon successful return from chown()
466                  */
467                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
468                     !capable(CAP_FSETID)) {
469                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
470                 }
471
472                 /*
473                  * Change the ownerships and register quota modifications
474                  * in the transaction.
475                  */
476                 if (iuid != uid) {
477                         if (XFS_IS_UQUOTA_ON(mp)) {
478                                 ASSERT(mask & ATTR_UID);
479                                 ASSERT(udqp);
480                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
481                                                         &ip->i_udquot, udqp);
482                         }
483                         ip->i_d.di_uid = uid;
484                 }
485                 if (igid != gid) {
486                         if (XFS_IS_GQUOTA_ON(mp)) {
487                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
488                                 ASSERT(mask & ATTR_GID);
489                                 ASSERT(gdqp);
490                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
491                                                         &ip->i_gdquot, gdqp);
492                         }
493                         ip->i_d.di_gid = gid;
494                 }
495
496                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
497                 timeflags |= XFS_ICHGTIME_CHG;
498         }
499
500
501         /*
502          * Change file access or modified times.
503          */
504         if (mask & (ATTR_ATIME|ATTR_MTIME)) {
505                 if (mask & ATTR_ATIME) {
506                         ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
507                         ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
508                         ip->i_update_core = 1;
509                         timeflags &= ~XFS_ICHGTIME_ACC;
510                 }
511                 if (mask & ATTR_MTIME) {
512                         ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
513                         ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
514                         timeflags &= ~XFS_ICHGTIME_MOD;
515                         timeflags |= XFS_ICHGTIME_CHG;
516                 }
517                 if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
518                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
519         }
520
521         /*
522          * Change file inode change time only if ATTR_CTIME set
523          * AND we have been called by a DMI function.
524          */
525
526         if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
527                 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
528                 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
529                 ip->i_update_core = 1;
530                 timeflags &= ~XFS_ICHGTIME_CHG;
531         }
532
533         /*
534          * Send out timestamp changes that need to be set to the
535          * current time.  Not done when called by a DMI function.
536          */
537         if (timeflags && !(flags & XFS_ATTR_DMI))
538                 xfs_ichgtime(ip, timeflags);
539
540         XFS_STATS_INC(xs_ig_attrchg);
541
542         /*
543          * If this is a synchronous mount, make sure that the
544          * transaction goes to disk before returning to the user.
545          * This is slightly sub-optimal in that truncates require
546          * two sync transactions instead of one for wsync filesystems.
547          * One for the truncate and one for the timestamps since we
548          * don't want to change the timestamps unless we're sure the
549          * truncate worked.  Truncates are less than 1% of the laddis
550          * mix so this probably isn't worth the trouble to optimize.
551          */
552         code = 0;
553         if (tp) {
554                 if (mp->m_flags & XFS_MOUNT_WSYNC)
555                         xfs_trans_set_sync(tp);
556
557                 code = xfs_trans_commit(tp, commit_flags);
558         }
559
560         xfs_iunlock(ip, lock_flags);
561
562         /*
563          * Release any dquot(s) the inode had kept before chown.
564          */
565         XFS_QM_DQRELE(mp, olddquot1);
566         XFS_QM_DQRELE(mp, olddquot2);
567         XFS_QM_DQRELE(mp, udqp);
568         XFS_QM_DQRELE(mp, gdqp);
569
570         if (code) {
571                 return code;
572         }
573
574         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
575             !(flags & XFS_ATTR_DMI)) {
576                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
577                                         NULL, DM_RIGHT_NULL, NULL, NULL,
578                                         0, 0, AT_DELAY_FLAG(flags));
579         }
580         return 0;
581
582  abort_return:
583         commit_flags |= XFS_TRANS_ABORT;
584         /* FALLTHROUGH */
585  error_return:
586         XFS_QM_DQRELE(mp, udqp);
587         XFS_QM_DQRELE(mp, gdqp);
588         if (tp) {
589                 xfs_trans_cancel(tp, commit_flags);
590         }
591         if (lock_flags != 0) {
592                 xfs_iunlock(ip, lock_flags);
593         }
594         return code;
595 }
596
597 /*
598  * The maximum pathlen is 1024 bytes. Since the minimum file system
599  * blocksize is 512 bytes, we can get a max of 2 extents back from
600  * bmapi.
601  */
602 #define SYMLINK_MAPS 2
603
604 STATIC int
605 xfs_readlink_bmap(
606         xfs_inode_t     *ip,
607         char            *link)
608 {
609         xfs_mount_t     *mp = ip->i_mount;
610         int             pathlen = ip->i_d.di_size;
611         int             nmaps = SYMLINK_MAPS;
612         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
613         xfs_daddr_t     d;
614         int             byte_cnt;
615         int             n;
616         xfs_buf_t       *bp;
617         int             error = 0;
618
619         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
620                         mval, &nmaps, NULL, NULL);
621         if (error)
622                 goto out;
623
624         for (n = 0; n < nmaps; n++) {
625                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
626                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
627
628                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
629                 error = XFS_BUF_GETERROR(bp);
630                 if (error) {
631                         xfs_ioerror_alert("xfs_readlink",
632                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
633                         xfs_buf_relse(bp);
634                         goto out;
635                 }
636                 if (pathlen < byte_cnt)
637                         byte_cnt = pathlen;
638                 pathlen -= byte_cnt;
639
640                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
641                 xfs_buf_relse(bp);
642         }
643
644         link[ip->i_d.di_size] = '\0';
645         error = 0;
646
647  out:
648         return error;
649 }
650
651 int
652 xfs_readlink(
653         xfs_inode_t     *ip,
654         char            *link)
655 {
656         xfs_mount_t     *mp = ip->i_mount;
657         int             pathlen;
658         int             error = 0;
659
660         xfs_itrace_entry(ip);
661
662         if (XFS_FORCED_SHUTDOWN(mp))
663                 return XFS_ERROR(EIO);
664
665         xfs_ilock(ip, XFS_ILOCK_SHARED);
666
667         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
668         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
669
670         pathlen = ip->i_d.di_size;
671         if (!pathlen)
672                 goto out;
673
674         if (ip->i_df.if_flags & XFS_IFINLINE) {
675                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
676                 link[pathlen] = '\0';
677         } else {
678                 error = xfs_readlink_bmap(ip, link);
679         }
680
681  out:
682         xfs_iunlock(ip, XFS_ILOCK_SHARED);
683         return error;
684 }
685
686 /*
687  * xfs_fsync
688  *
689  * This is called to sync the inode and its data out to disk.  We need to hold
690  * the I/O lock while flushing the data, and the inode lock while flushing the
691  * inode.  The inode lock CANNOT be held while flushing the data, so acquire
692  * after we're done with that.
693  */
694 int
695 xfs_fsync(
696         xfs_inode_t     *ip)
697 {
698         xfs_trans_t     *tp;
699         int             error;
700         int             log_flushed = 0, changed = 1;
701
702         xfs_itrace_entry(ip);
703
704         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
705                 return XFS_ERROR(EIO);
706
707         /* capture size updates in I/O completion before writing the inode. */
708         error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
709         if (error)
710                 return XFS_ERROR(error);
711
712         /*
713          * We always need to make sure that the required inode state is safe on
714          * disk.  The vnode might be clean but we still might need to force the
715          * log because of committed transactions that haven't hit the disk yet.
716          * Likewise, there could be unflushed non-transactional changes to the
717          * inode core that have to go to disk and this requires us to issue
718          * a synchronous transaction to capture these changes correctly.
719          *
720          * This code relies on the assumption that if the update_* fields
721          * of the inode are clear and the inode is unpinned then it is clean
722          * and no action is required.
723          */
724         xfs_ilock(ip, XFS_ILOCK_SHARED);
725
726         if (!(ip->i_update_size || ip->i_update_core)) {
727                 /*
728                  * Timestamps/size haven't changed since last inode flush or
729                  * inode transaction commit.  That means either nothing got
730                  * written or a transaction committed which caught the updates.
731                  * If the latter happened and the transaction hasn't hit the
732                  * disk yet, the inode will be still be pinned.  If it is,
733                  * force the log.
734                  */
735
736                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
737
738                 if (xfs_ipincount(ip)) {
739                         error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
740                                       XFS_LOG_FORCE | XFS_LOG_SYNC,
741                                       &log_flushed);
742                 } else {
743                         /*
744                          * If the inode is not pinned and nothing has changed
745                          * we don't need to flush the cache.
746                          */
747                         changed = 0;
748                 }
749         } else  {
750                 /*
751                  * Kick off a transaction to log the inode core to get the
752                  * updates.  The sync transaction will also force the log.
753                  */
754                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
755                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
756                 error = xfs_trans_reserve(tp, 0,
757                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
758                 if (error) {
759                         xfs_trans_cancel(tp, 0);
760                         return error;
761                 }
762                 xfs_ilock(ip, XFS_ILOCK_EXCL);
763
764                 /*
765                  * Note - it's possible that we might have pushed ourselves out
766                  * of the way during trans_reserve which would flush the inode.
767                  * But there's no guarantee that the inode buffer has actually
768                  * gone out yet (it's delwri).  Plus the buffer could be pinned
769                  * anyway if it's part of an inode in another recent
770                  * transaction.  So we play it safe and fire off the
771                  * transaction anyway.
772                  */
773                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
774                 xfs_trans_ihold(tp, ip);
775                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
776                 xfs_trans_set_sync(tp);
777                 error = _xfs_trans_commit(tp, 0, &log_flushed);
778
779                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
780         }
781
782         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
783                 /*
784                  * If the log write didn't issue an ordered tag we need
785                  * to flush the disk cache for the data device now.
786                  */
787                 if (!log_flushed)
788                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
789
790                 /*
791                  * If this inode is on the RT dev we need to flush that
792                  * cache as well.
793                  */
794                 if (XFS_IS_REALTIME_INODE(ip))
795                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
796         }
797
798         return error;
799 }
800
801 /*
802  * This is called by xfs_inactive to free any blocks beyond eof
803  * when the link count isn't zero and by xfs_dm_punch_hole() when
804  * punching a hole to EOF.
805  */
806 int
807 xfs_free_eofblocks(
808         xfs_mount_t     *mp,
809         xfs_inode_t     *ip,
810         int             flags)
811 {
812         xfs_trans_t     *tp;
813         int             error;
814         xfs_fileoff_t   end_fsb;
815         xfs_fileoff_t   last_fsb;
816         xfs_filblks_t   map_len;
817         int             nimaps;
818         xfs_bmbt_irec_t imap;
819         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
820
821         /*
822          * Figure out if there are any blocks beyond the end
823          * of the file.  If not, then there is nothing to do.
824          */
825         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
826         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
827         map_len = last_fsb - end_fsb;
828         if (map_len <= 0)
829                 return 0;
830
831         nimaps = 1;
832         xfs_ilock(ip, XFS_ILOCK_SHARED);
833         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
834                           NULL, 0, &imap, &nimaps, NULL, NULL);
835         xfs_iunlock(ip, XFS_ILOCK_SHARED);
836
837         if (!error && (nimaps != 0) &&
838             (imap.br_startblock != HOLESTARTBLOCK ||
839              ip->i_delayed_blks)) {
840                 /*
841                  * Attach the dquots to the inode up front.
842                  */
843                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
844                         return error;
845
846                 /*
847                  * There are blocks after the end of file.
848                  * Free them up now by truncating the file to
849                  * its current size.
850                  */
851                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
852
853                 /*
854                  * Do the xfs_itruncate_start() call before
855                  * reserving any log space because
856                  * itruncate_start will call into the buffer
857                  * cache and we can't
858                  * do that within a transaction.
859                  */
860                 if (use_iolock)
861                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
862                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
863                                     ip->i_size);
864                 if (error) {
865                         xfs_trans_cancel(tp, 0);
866                         if (use_iolock)
867                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
868                         return error;
869                 }
870
871                 error = xfs_trans_reserve(tp, 0,
872                                           XFS_ITRUNCATE_LOG_RES(mp),
873                                           0, XFS_TRANS_PERM_LOG_RES,
874                                           XFS_ITRUNCATE_LOG_COUNT);
875                 if (error) {
876                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
877                         xfs_trans_cancel(tp, 0);
878                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
879                         return error;
880                 }
881
882                 xfs_ilock(ip, XFS_ILOCK_EXCL);
883                 xfs_trans_ijoin(tp, ip,
884                                 XFS_IOLOCK_EXCL |
885                                 XFS_ILOCK_EXCL);
886                 xfs_trans_ihold(tp, ip);
887
888                 error = xfs_itruncate_finish(&tp, ip,
889                                              ip->i_size,
890                                              XFS_DATA_FORK,
891                                              0);
892                 /*
893                  * If we get an error at this point we
894                  * simply don't bother truncating the file.
895                  */
896                 if (error) {
897                         xfs_trans_cancel(tp,
898                                          (XFS_TRANS_RELEASE_LOG_RES |
899                                           XFS_TRANS_ABORT));
900                 } else {
901                         error = xfs_trans_commit(tp,
902                                                 XFS_TRANS_RELEASE_LOG_RES);
903                 }
904                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
905                                             : XFS_ILOCK_EXCL));
906         }
907         return error;
908 }
909
910 /*
911  * Free a symlink that has blocks associated with it.
912  */
913 STATIC int
914 xfs_inactive_symlink_rmt(
915         xfs_inode_t     *ip,
916         xfs_trans_t     **tpp)
917 {
918         xfs_buf_t       *bp;
919         int             committed;
920         int             done;
921         int             error;
922         xfs_fsblock_t   first_block;
923         xfs_bmap_free_t free_list;
924         int             i;
925         xfs_mount_t     *mp;
926         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
927         int             nmaps;
928         xfs_trans_t     *ntp;
929         int             size;
930         xfs_trans_t     *tp;
931
932         tp = *tpp;
933         mp = ip->i_mount;
934         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
935         /*
936          * We're freeing a symlink that has some
937          * blocks allocated to it.  Free the
938          * blocks here.  We know that we've got
939          * either 1 or 2 extents and that we can
940          * free them all in one bunmapi call.
941          */
942         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
943         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
944                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
945                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
946                 xfs_trans_cancel(tp, 0);
947                 *tpp = NULL;
948                 return error;
949         }
950         /*
951          * Lock the inode, fix the size, and join it to the transaction.
952          * Hold it so in the normal path, we still have it locked for
953          * the second transaction.  In the error paths we need it
954          * held so the cancel won't rele it, see below.
955          */
956         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
957         size = (int)ip->i_d.di_size;
958         ip->i_d.di_size = 0;
959         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
960         xfs_trans_ihold(tp, ip);
961         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
962         /*
963          * Find the block(s) so we can inval and unmap them.
964          */
965         done = 0;
966         XFS_BMAP_INIT(&free_list, &first_block);
967         nmaps = ARRAY_SIZE(mval);
968         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
969                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
970                         &free_list, NULL)))
971                 goto error0;
972         /*
973          * Invalidate the block(s).
974          */
975         for (i = 0; i < nmaps; i++) {
976                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
977                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
978                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
979                 xfs_trans_binval(tp, bp);
980         }
981         /*
982          * Unmap the dead block(s) to the free_list.
983          */
984         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
985                         &first_block, &free_list, NULL, &done)))
986                 goto error1;
987         ASSERT(done);
988         /*
989          * Commit the first transaction.  This logs the EFI and the inode.
990          */
991         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
992                 goto error1;
993         /*
994          * The transaction must have been committed, since there were
995          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
996          * The new tp has the extent freeing and EFDs.
997          */
998         ASSERT(committed);
999         /*
1000          * The first xact was committed, so add the inode to the new one.
1001          * Mark it dirty so it will be logged and moved forward in the log as
1002          * part of every commit.
1003          */
1004         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1005         xfs_trans_ihold(tp, ip);
1006         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1007         /*
1008          * Get a new, empty transaction to return to our caller.
1009          */
1010         ntp = xfs_trans_dup(tp);
1011         /*
1012          * Commit the transaction containing extent freeing and EFDs.
1013          * If we get an error on the commit here or on the reserve below,
1014          * we need to unlock the inode since the new transaction doesn't
1015          * have the inode attached.
1016          */
1017         error = xfs_trans_commit(tp, 0);
1018         tp = ntp;
1019         if (error) {
1020                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1021                 goto error0;
1022         }
1023         /*
1024          * Remove the memory for extent descriptions (just bookkeeping).
1025          */
1026         if (ip->i_df.if_bytes)
1027                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1028         ASSERT(ip->i_df.if_bytes == 0);
1029         /*
1030          * Put an itruncate log reservation in the new transaction
1031          * for our caller.
1032          */
1033         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1034                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1035                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1036                 goto error0;
1037         }
1038         /*
1039          * Return with the inode locked but not joined to the transaction.
1040          */
1041         *tpp = tp;
1042         return 0;
1043
1044  error1:
1045         xfs_bmap_cancel(&free_list);
1046  error0:
1047         /*
1048          * Have to come here with the inode locked and either
1049          * (held and in the transaction) or (not in the transaction).
1050          * If the inode isn't held then cancel would iput it, but
1051          * that's wrong since this is inactive and the vnode ref
1052          * count is 0 already.
1053          * Cancel won't do anything to the inode if held, but it still
1054          * needs to be locked until the cancel is done, if it was
1055          * joined to the transaction.
1056          */
1057         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1058         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1059         *tpp = NULL;
1060         return error;
1061
1062 }
1063
1064 STATIC int
1065 xfs_inactive_symlink_local(
1066         xfs_inode_t     *ip,
1067         xfs_trans_t     **tpp)
1068 {
1069         int             error;
1070
1071         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1072         /*
1073          * We're freeing a symlink which fit into
1074          * the inode.  Just free the memory used
1075          * to hold the old symlink.
1076          */
1077         error = xfs_trans_reserve(*tpp, 0,
1078                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1079                                   0, XFS_TRANS_PERM_LOG_RES,
1080                                   XFS_ITRUNCATE_LOG_COUNT);
1081
1082         if (error) {
1083                 xfs_trans_cancel(*tpp, 0);
1084                 *tpp = NULL;
1085                 return error;
1086         }
1087         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1088
1089         /*
1090          * Zero length symlinks _can_ exist.
1091          */
1092         if (ip->i_df.if_bytes > 0) {
1093                 xfs_idata_realloc(ip,
1094                                   -(ip->i_df.if_bytes),
1095                                   XFS_DATA_FORK);
1096                 ASSERT(ip->i_df.if_bytes == 0);
1097         }
1098         return 0;
1099 }
1100
1101 STATIC int
1102 xfs_inactive_attrs(
1103         xfs_inode_t     *ip,
1104         xfs_trans_t     **tpp)
1105 {
1106         xfs_trans_t     *tp;
1107         int             error;
1108         xfs_mount_t     *mp;
1109
1110         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1111         tp = *tpp;
1112         mp = ip->i_mount;
1113         ASSERT(ip->i_d.di_forkoff != 0);
1114         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1115         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1116         if (error)
1117                 goto error_unlock;
1118
1119         error = xfs_attr_inactive(ip);
1120         if (error)
1121                 goto error_unlock;
1122
1123         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1124         error = xfs_trans_reserve(tp, 0,
1125                                   XFS_IFREE_LOG_RES(mp),
1126                                   0, XFS_TRANS_PERM_LOG_RES,
1127                                   XFS_INACTIVE_LOG_COUNT);
1128         if (error)
1129                 goto error_cancel;
1130
1131         xfs_ilock(ip, XFS_ILOCK_EXCL);
1132         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1133         xfs_trans_ihold(tp, ip);
1134         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1135
1136         ASSERT(ip->i_d.di_anextents == 0);
1137
1138         *tpp = tp;
1139         return 0;
1140
1141 error_cancel:
1142         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1143         xfs_trans_cancel(tp, 0);
1144 error_unlock:
1145         *tpp = NULL;
1146         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1147         return error;
1148 }
1149
1150 int
1151 xfs_release(
1152         xfs_inode_t     *ip)
1153 {
1154         bhv_vnode_t     *vp = XFS_ITOV(ip);
1155         xfs_mount_t     *mp = ip->i_mount;
1156         int             error;
1157
1158         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1159                 return 0;
1160
1161         /* If this is a read-only mount, don't do this (would generate I/O) */
1162         if (mp->m_flags & XFS_MOUNT_RDONLY)
1163                 return 0;
1164
1165         if (!XFS_FORCED_SHUTDOWN(mp)) {
1166                 int truncated;
1167
1168                 /*
1169                  * If we are using filestreams, and we have an unlinked
1170                  * file that we are processing the last close on, then nothing
1171                  * will be able to reopen and write to this file. Purge this
1172                  * inode from the filestreams cache so that it doesn't delay
1173                  * teardown of the inode.
1174                  */
1175                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1176                         xfs_filestream_deassociate(ip);
1177
1178                 /*
1179                  * If we previously truncated this file and removed old data
1180                  * in the process, we want to initiate "early" writeout on
1181                  * the last close.  This is an attempt to combat the notorious
1182                  * NULL files problem which is particularly noticable from a
1183                  * truncate down, buffered (re-)write (delalloc), followed by
1184                  * a crash.  What we are effectively doing here is
1185                  * significantly reducing the time window where we'd otherwise
1186                  * be exposed to that problem.
1187                  */
1188                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1189                 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1190                         xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1191         }
1192
1193         if (ip->i_d.di_nlink != 0) {
1194                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1195                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1196                        ip->i_delayed_blks > 0)) &&
1197                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1198                     (!(ip->i_d.di_flags &
1199                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1200                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1201                         if (error)
1202                                 return error;
1203                 }
1204         }
1205
1206         return 0;
1207 }
1208
1209 /*
1210  * xfs_inactive
1211  *
1212  * This is called when the vnode reference count for the vnode
1213  * goes to zero.  If the file has been unlinked, then it must
1214  * now be truncated.  Also, we clear all of the read-ahead state
1215  * kept for the inode here since the file is now closed.
1216  */
1217 int
1218 xfs_inactive(
1219         xfs_inode_t     *ip)
1220 {
1221         bhv_vnode_t     *vp = XFS_ITOV(ip);
1222         xfs_bmap_free_t free_list;
1223         xfs_fsblock_t   first_block;
1224         int             committed;
1225         xfs_trans_t     *tp;
1226         xfs_mount_t     *mp;
1227         int             error;
1228         int             truncate;
1229
1230         xfs_itrace_entry(ip);
1231
1232         /*
1233          * If the inode is already free, then there can be nothing
1234          * to clean up here.
1235          */
1236         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1237                 ASSERT(ip->i_df.if_real_bytes == 0);
1238                 ASSERT(ip->i_df.if_broot_bytes == 0);
1239                 return VN_INACTIVE_CACHE;
1240         }
1241
1242         /*
1243          * Only do a truncate if it's a regular file with
1244          * some actual space in it.  It's OK to look at the
1245          * inode's fields without the lock because we're the
1246          * only one with a reference to the inode.
1247          */
1248         truncate = ((ip->i_d.di_nlink == 0) &&
1249             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1250              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1251             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1252
1253         mp = ip->i_mount;
1254
1255         if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
1256                 XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
1257
1258         error = 0;
1259
1260         /* If this is a read-only mount, don't do this (would generate I/O) */
1261         if (mp->m_flags & XFS_MOUNT_RDONLY)
1262                 goto out;
1263
1264         if (ip->i_d.di_nlink != 0) {
1265                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1266                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1267                        ip->i_delayed_blks > 0)) &&
1268                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1269                      (!(ip->i_d.di_flags &
1270                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1271                       (ip->i_delayed_blks != 0)))) {
1272                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1273                         if (error)
1274                                 return VN_INACTIVE_CACHE;
1275                 }
1276                 goto out;
1277         }
1278
1279         ASSERT(ip->i_d.di_nlink == 0);
1280
1281         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1282                 return VN_INACTIVE_CACHE;
1283
1284         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1285         if (truncate) {
1286                 /*
1287                  * Do the xfs_itruncate_start() call before
1288                  * reserving any log space because itruncate_start
1289                  * will call into the buffer cache and we can't
1290                  * do that within a transaction.
1291                  */
1292                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1293
1294                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1295                 if (error) {
1296                         xfs_trans_cancel(tp, 0);
1297                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1298                         return VN_INACTIVE_CACHE;
1299                 }
1300
1301                 error = xfs_trans_reserve(tp, 0,
1302                                           XFS_ITRUNCATE_LOG_RES(mp),
1303                                           0, XFS_TRANS_PERM_LOG_RES,
1304                                           XFS_ITRUNCATE_LOG_COUNT);
1305                 if (error) {
1306                         /* Don't call itruncate_cleanup */
1307                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1308                         xfs_trans_cancel(tp, 0);
1309                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1310                         return VN_INACTIVE_CACHE;
1311                 }
1312
1313                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1314                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1315                 xfs_trans_ihold(tp, ip);
1316
1317                 /*
1318                  * normally, we have to run xfs_itruncate_finish sync.
1319                  * But if filesystem is wsync and we're in the inactive
1320                  * path, then we know that nlink == 0, and that the
1321                  * xaction that made nlink == 0 is permanently committed
1322                  * since xfs_remove runs as a synchronous transaction.
1323                  */
1324                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1325                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1326
1327                 if (error) {
1328                         xfs_trans_cancel(tp,
1329                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1330                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1331                         return VN_INACTIVE_CACHE;
1332                 }
1333         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1334
1335                 /*
1336                  * If we get an error while cleaning up a
1337                  * symlink we bail out.
1338                  */
1339                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1340                         xfs_inactive_symlink_rmt(ip, &tp) :
1341                         xfs_inactive_symlink_local(ip, &tp);
1342
1343                 if (error) {
1344                         ASSERT(tp == NULL);
1345                         return VN_INACTIVE_CACHE;
1346                 }
1347
1348                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1349                 xfs_trans_ihold(tp, ip);
1350         } else {
1351                 error = xfs_trans_reserve(tp, 0,
1352                                           XFS_IFREE_LOG_RES(mp),
1353                                           0, XFS_TRANS_PERM_LOG_RES,
1354                                           XFS_INACTIVE_LOG_COUNT);
1355                 if (error) {
1356                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1357                         xfs_trans_cancel(tp, 0);
1358                         return VN_INACTIVE_CACHE;
1359                 }
1360
1361                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1362                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1363                 xfs_trans_ihold(tp, ip);
1364         }
1365
1366         /*
1367          * If there are attributes associated with the file
1368          * then blow them away now.  The code calls a routine
1369          * that recursively deconstructs the attribute fork.
1370          * We need to just commit the current transaction
1371          * because we can't use it for xfs_attr_inactive().
1372          */
1373         if (ip->i_d.di_anextents > 0) {
1374                 error = xfs_inactive_attrs(ip, &tp);
1375                 /*
1376                  * If we got an error, the transaction is already
1377                  * cancelled, and the inode is unlocked. Just get out.
1378                  */
1379                  if (error)
1380                          return VN_INACTIVE_CACHE;
1381         } else if (ip->i_afp) {
1382                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1383         }
1384
1385         /*
1386          * Free the inode.
1387          */
1388         XFS_BMAP_INIT(&free_list, &first_block);
1389         error = xfs_ifree(tp, ip, &free_list);
1390         if (error) {
1391                 /*
1392                  * If we fail to free the inode, shut down.  The cancel
1393                  * might do that, we need to make sure.  Otherwise the
1394                  * inode might be lost for a long time or forever.
1395                  */
1396                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1397                         cmn_err(CE_NOTE,
1398                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1399                                 error, mp->m_fsname);
1400                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1401                 }
1402                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1403         } else {
1404                 /*
1405                  * Credit the quota account(s). The inode is gone.
1406                  */
1407                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1408
1409                 /*
1410                  * Just ignore errors at this point.  There is nothing we can
1411                  * do except to try to keep going. Make sure it's not a silent
1412                  * error.
1413                  */
1414                 error = xfs_bmap_finish(&tp,  &free_list, &committed);
1415                 if (error)
1416                         xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1417                                 "xfs_bmap_finish() returned error %d", error);
1418                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1419                 if (error)
1420                         xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1421                                 "xfs_trans_commit() returned error %d", error);
1422         }
1423         /*
1424          * Release the dquots held by inode, if any.
1425          */
1426         XFS_QM_DQDETACH(mp, ip);
1427
1428         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1429
1430  out:
1431         return VN_INACTIVE_CACHE;
1432 }
1433
1434 /*
1435  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
1436  * is allowed, otherwise it has to be an exact match. If a CI match is found,
1437  * ci_name->name will point to a the actual name (caller must free) or
1438  * will be set to NULL if an exact match is found.
1439  */
1440 int
1441 xfs_lookup(
1442         xfs_inode_t             *dp,
1443         struct xfs_name         *name,
1444         xfs_inode_t             **ipp,
1445         struct xfs_name         *ci_name)
1446 {
1447         xfs_ino_t               inum;
1448         int                     error;
1449         uint                    lock_mode;
1450
1451         xfs_itrace_entry(dp);
1452
1453         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1454                 return XFS_ERROR(EIO);
1455
1456         lock_mode = xfs_ilock_map_shared(dp);
1457         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
1458         xfs_iunlock_map_shared(dp, lock_mode);
1459
1460         if (error)
1461                 goto out;
1462
1463         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
1464         if (error)
1465                 goto out_free_name;
1466
1467         xfs_itrace_ref(*ipp);
1468         return 0;
1469
1470 out_free_name:
1471         if (ci_name)
1472                 kmem_free(ci_name->name);
1473 out:
1474         *ipp = NULL;
1475         return error;
1476 }
1477
1478 int
1479 xfs_create(
1480         xfs_inode_t             *dp,
1481         struct xfs_name         *name,
1482         mode_t                  mode,
1483         xfs_dev_t               rdev,
1484         xfs_inode_t             **ipp,
1485         cred_t                  *credp)
1486 {
1487         xfs_mount_t             *mp = dp->i_mount;
1488         xfs_inode_t             *ip;
1489         xfs_trans_t             *tp;
1490         int                     error;
1491         xfs_bmap_free_t         free_list;
1492         xfs_fsblock_t           first_block;
1493         boolean_t               unlock_dp_on_error = B_FALSE;
1494         int                     dm_event_sent = 0;
1495         uint                    cancel_flags;
1496         int                     committed;
1497         xfs_prid_t              prid;
1498         struct xfs_dquot        *udqp, *gdqp;
1499         uint                    resblks;
1500
1501         ASSERT(!*ipp);
1502         xfs_itrace_entry(dp);
1503
1504         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1505                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1506                                 dp, DM_RIGHT_NULL, NULL,
1507                                 DM_RIGHT_NULL, name->name, NULL,
1508                                 mode, 0, 0);
1509
1510                 if (error)
1511                         return error;
1512                 dm_event_sent = 1;
1513         }
1514
1515         if (XFS_FORCED_SHUTDOWN(mp))
1516                 return XFS_ERROR(EIO);
1517
1518         /* Return through std_return after this point. */
1519
1520         udqp = gdqp = NULL;
1521         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1522                 prid = dp->i_d.di_projid;
1523         else
1524                 prid = (xfs_prid_t)dfltprid;
1525
1526         /*
1527          * Make sure that we have allocated dquot(s) on disk.
1528          */
1529         error = XFS_QM_DQVOPALLOC(mp, dp,
1530                         current_fsuid(credp), current_fsgid(credp), prid,
1531                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1532         if (error)
1533                 goto std_return;
1534
1535         ip = NULL;
1536
1537         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1538         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1539         resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1540         /*
1541          * Initially assume that the file does not exist and
1542          * reserve the resources for that case.  If that is not
1543          * the case we'll drop the one we have and get a more
1544          * appropriate transaction later.
1545          */
1546         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1547                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1548         if (error == ENOSPC) {
1549                 resblks = 0;
1550                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1551                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1552         }
1553         if (error) {
1554                 cancel_flags = 0;
1555                 goto error_return;
1556         }
1557
1558         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1559         unlock_dp_on_error = B_TRUE;
1560
1561         XFS_BMAP_INIT(&free_list, &first_block);
1562
1563         ASSERT(ip == NULL);
1564
1565         /*
1566          * Reserve disk quota and the inode.
1567          */
1568         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1569         if (error)
1570                 goto error_return;
1571
1572         error = xfs_dir_canenter(tp, dp, name, resblks);
1573         if (error)
1574                 goto error_return;
1575         error = xfs_dir_ialloc(&tp, dp, mode, 1,
1576                         rdev, credp, prid, resblks > 0,
1577                         &ip, &committed);
1578         if (error) {
1579                 if (error == ENOSPC)
1580                         goto error_return;
1581                 goto abort_return;
1582         }
1583         xfs_itrace_ref(ip);
1584
1585         /*
1586          * At this point, we've gotten a newly allocated inode.
1587          * It is locked (and joined to the transaction).
1588          */
1589
1590         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1591
1592         /*
1593          * Now we join the directory inode to the transaction.  We do not do it
1594          * earlier because xfs_dir_ialloc might commit the previous transaction
1595          * (and release all the locks).  An error from here on will result in
1596          * the transaction cancel unlocking dp so don't do it explicitly in the
1597          * error path.
1598          */
1599         IHOLD(dp);
1600         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1601         unlock_dp_on_error = B_FALSE;
1602
1603         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1604                                         &first_block, &free_list, resblks ?
1605                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1606         if (error) {
1607                 ASSERT(error != ENOSPC);
1608                 goto abort_return;
1609         }
1610         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1611         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1612
1613         /*
1614          * If this is a synchronous mount, make sure that the
1615          * create transaction goes to disk before returning to
1616          * the user.
1617          */
1618         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1619                 xfs_trans_set_sync(tp);
1620         }
1621
1622         dp->i_gen++;
1623
1624         /*
1625          * Attach the dquot(s) to the inodes and modify them incore.
1626          * These ids of the inode couldn't have changed since the new
1627          * inode has been locked ever since it was created.
1628          */
1629         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
1630
1631         /*
1632          * xfs_trans_commit normally decrements the vnode ref count
1633          * when it unlocks the inode. Since we want to return the
1634          * vnode to the caller, we bump the vnode ref count now.
1635          */
1636         IHOLD(ip);
1637
1638         error = xfs_bmap_finish(&tp, &free_list, &committed);
1639         if (error) {
1640                 xfs_bmap_cancel(&free_list);
1641                 goto abort_rele;
1642         }
1643
1644         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1645         if (error) {
1646                 IRELE(ip);
1647                 tp = NULL;
1648                 goto error_return;
1649         }
1650
1651         XFS_QM_DQRELE(mp, udqp);
1652         XFS_QM_DQRELE(mp, gdqp);
1653
1654         *ipp = ip;
1655
1656         /* Fallthrough to std_return with error = 0  */
1657
1658 std_return:
1659         if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
1660             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1661                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
1662                         dp, DM_RIGHT_NULL,
1663                         *ipp ? ip : NULL,
1664                         DM_RIGHT_NULL, name->name, NULL,
1665                         mode, error, 0);
1666         }
1667         return error;
1668
1669  abort_return:
1670         cancel_flags |= XFS_TRANS_ABORT;
1671         /* FALLTHROUGH */
1672
1673  error_return:
1674         if (tp != NULL)
1675                 xfs_trans_cancel(tp, cancel_flags);
1676
1677         XFS_QM_DQRELE(mp, udqp);
1678         XFS_QM_DQRELE(mp, gdqp);
1679
1680         if (unlock_dp_on_error)
1681                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1682
1683         goto std_return;
1684
1685  abort_rele:
1686         /*
1687          * Wait until after the current transaction is aborted to
1688          * release the inode.  This prevents recursive transactions
1689          * and deadlocks from xfs_inactive.
1690          */
1691         cancel_flags |= XFS_TRANS_ABORT;
1692         xfs_trans_cancel(tp, cancel_flags);
1693         IRELE(ip);
1694
1695         XFS_QM_DQRELE(mp, udqp);
1696         XFS_QM_DQRELE(mp, gdqp);
1697
1698         goto std_return;
1699 }
1700
1701 #ifdef DEBUG
1702 /*
1703  * Debug-only counters showing whether (and how often) the deadlock
1704  * prevention code paths in xfs_lock_dir_and_entry() are being hit.
1705  */
1706
1707 int xfs_rm_locks;          /* calls to xfs_lock_dir_and_entry() */
1708 int xfs_rm_lock_delays;    /* delay(1) back-offs taken between retries */
1709 int xfs_rm_attempts;       /* failed xfs_ilock_nowait() tries on the entry */
1710 #endif
1711
1712 /*
1713  * The following routine will lock the inodes associated with the
1714  * directory and the named entry in the directory. The locks are
1715  * acquired in increasing inode number.
1716  *
1717  * If the entry is "..", then only the directory is locked. The
1718  * vnode ref count will still include that from the .. entry in
1719  * this case.
1720  *
1721  * There is a deadlock we need to worry about. If the locked directory is
1722  * in the AIL, it might be blocking up the log. The next inode we lock
1723  * could be already locked by another thread waiting for log space (e.g.
1724  * a permanent log reservation with a long running transaction (see
1725  * xfs_itruncate_finish)). To solve this, we must check if the directory
1726  * is in the AIL and use xfs_ilock_nowait. If we can't lock, we need to
1727  * drop the inode lock on the directory and try again. xfs_iunlock will
1728  * potentially push the tail if we were holding up the log.
      *
      * dp is the directory inode and ip is the inode of the entry.  Every
      * path through the function returns 0.
1729  */
1730 STATIC int
1731 xfs_lock_dir_and_entry(
1732         xfs_inode_t     *dp,
1733         xfs_inode_t     *ip)    /* inode of entry 'name' */
1734 {
1735         int             attempts;
1736         xfs_ino_t       e_inum;
1737         xfs_inode_t     *ips[2];
1738         xfs_log_item_t  *lp;
1739
1740 #ifdef DEBUG
1741         xfs_rm_locks++;
1742 #endif
1743         attempts = 0;
1744
1745 again:
             /* Every retry starts over by taking the directory lock first. */
1746         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1747
1748         e_inum = ip->i_ino;
1749
1750         xfs_itrace_ref(ip);
1751
1752         /*
1753          * We want to lock in increasing inum. Since we've already
1754          * acquired the lock on the directory, we may need to release
1755          * it if the inum of the entry turns out to be less.
1756          */
1757         if (e_inum > dp->i_ino) {
1758                 /*
1759                  * We are already in the right order, so just
1760                  * lock on the inode of the entry.
1761                  * We need to use nowait if dp is in the AIL.
1762                  */
1763
1764                 lp = (xfs_log_item_t *)dp->i_itemp;
1765                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1766                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1767                                 attempts++;
1768 #ifdef DEBUG
1769                                 xfs_rm_attempts++;
1770 #endif
1771
1772                                 /*
1773                                  * Unlock dp and try again.
1774                                  * xfs_iunlock will try to push the tail
1775                                  * if the inode is in the AIL.
1776                                  */
1777
1778                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1779
                                     /* Back off on every fifth failed attempt. */
1780                                 if ((attempts % 5) == 0) {
1781                                         delay(1); /* Don't just spin the CPU */
1782 #ifdef DEBUG
1783                                         xfs_rm_lock_delays++;
1784 #endif
1785                                 }
1786                                 goto again;
1787                         }
1788                 } else {
                             /* dp is not in the AIL, so a blocking lock is used. */
1789                         xfs_ilock(ip, XFS_ILOCK_EXCL);
1790                 }
1791         } else if (e_inum < dp->i_ino) {
                     /*
                      * Wrong order: drop the directory lock and take both
                      * locks again through xfs_lock_inodes(), entry first.
                      */
1792                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1793
1794                 ips[0] = ip;
1795                 ips[1] = dp;
1796                 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
1797         }
1798         /* else  e_inum == dp->i_ino */
1799         /*     This can happen if we're asked to lock /x/..
1800          *     the entry is "..", which is also the parent directory.
1801          */
1802
1803         return 0;
1804 }
1805
1806 #ifdef DEBUG
     /* Debug-only statistics maintained by xfs_lock_inodes(). */
1807 int xfs_locked_n;          /* calls that finished without any retry */
1808 int xfs_small_retries;     /* calls that needed 1-4 retries */
1809 int xfs_middle_retries;    /* calls that needed 5-99 retries */
1810 int xfs_lots_retries;      /* calls that needed 100 or more retries */
1811 int xfs_lock_delays;       /* delay(1) back-offs taken between retries */
1812 #endif
1813
1814 /*
1815  * Bump the subclass so xfs_lock_inodes() acquires each lock with
1816  * a different value
1817  */
1818 static inline int
1819 xfs_lock_inumorder(int lock_mode, int subclass)
1820 {
1821         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1822                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
1823         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
1824                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
1825
1826         return lock_mode;
1827 }
1828
1829 /*
1830  * The following routine will lock n inodes using the caller's lock_mode
      * (the callers visible in this part of the file pass XFS_ILOCK_EXCL).
1831  * We assume the caller calls us with the inodes in i_ino order.
1832  *
1833  * We need to detect deadlock where an inode that we lock
1834  * is in the AIL and we start waiting for another inode that is locked
1835  * by a thread in a long running transaction (such as truncate). This can
1836  * result in deadlock since the long running trans might need to wait
1837  * for the inode we just locked in order to push the tail and free space
1838  * in the log.
      *
      * ips        - array of inode pointers; an entry equal to its
      *              predecessor is treated as already locked and skipped
      * inodes     - number of entries in ips, at least 2
      * lock_mode  - lock flags; each lock is requested with a per-index
      *              subclass added by xfs_lock_inumorder()
1839  */
1840 void
1841 xfs_lock_inodes(
1842         xfs_inode_t     **ips,
1843         int             inodes,
1844         uint            lock_mode)
1845 {
1846         int             attempts = 0, i, j, try_lock;
1847         xfs_log_item_t  *lp;
1848
1849         ASSERT(ips && (inodes >= 2)); /* we need at least two */
1850
1851         try_lock = 0;
1852         i = 0;
1853
1854 again:
1855         for (; i < inodes; i++) {
1856                 ASSERT(ips[i]);
1857
1858                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
1859                         continue;
1860
1861                 /*
1862                  * If try_lock is not set yet, make sure all locked inodes
1863                  * are not in the AIL.
1864                  * If any are, set try_lock to be used later.
1865                  */
1866
1867                 if (!try_lock) {
1868                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
1869                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
1870                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1871                                         try_lock++;
1872                                 }
1873                         }
1874                 }
1875
1876                 /*
1877                  * If any of the previous locks we have locked is in the AIL,
1878                  * we must TRY to get the second and subsequent locks. If
1879                  * we can't get any, we must release all we have
1880                  * and try again.
1881                  */
1882
1883                 if (try_lock) {
1884                         /* try_lock must be 0 if i is 0. */
1885                         /*
1886                          * try_lock means we have an inode locked
1887                          * that is in the AIL.
1888                          */
1889                         ASSERT(i != 0);
1890                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
1891                                 attempts++;
1892
1893                                 /*
1894                                  * Unlock all previous guys and try again.
1895                                  * xfs_iunlock will try to push the tail
1896                                  * if the inode is in the AIL.
1897                                  */
1898
1899                                 for(j = i - 1; j >= 0; j--) {
1900
1901                                         /*
1902                                          * Check to see if we've already
1903                                          * unlocked this one.
1904                                          * Not the first one going back,
1905                                          * and the inode ptr is the same.
1906                                          */
1907                                         if ((j != (i - 1)) && ips[j] ==
1908                                                                 ips[j+1])
1909                                                 continue;
1910
1911                                         xfs_iunlock(ips[j], lock_mode);
1912                                 }
1913
                                     /* Back off on every fifth failed attempt. */
1914                                 if ((attempts % 5) == 0) {
1915                                         delay(1); /* Don't just spin the CPU */
1916 #ifdef DEBUG
1917                                         xfs_lock_delays++;
1918 #endif
1919                                 }
                                     /* Restart from the first inode with a clean slate. */
1920                                 i = 0;
1921                                 try_lock = 0;
1922                                 goto again;
1923                         }
1924                 } else {
                             /* Nothing held so far is in the AIL: blocking is safe. */
1925                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
1926                 }
1927         }
1928
1929 #ifdef DEBUG
             /* Classify this call by how many retries it needed. */
1930         if (attempts) {
1931                 if (attempts < 5) xfs_small_retries++;
1932                 else if (attempts < 100) xfs_middle_retries++;
1933                 else xfs_lots_retries++;
1934         } else {
1935                 xfs_locked_n++;
1936         }
1937 #endif
1938 }
1939
/*
 * Remove the directory entry 'name', which refers to inode ip, from the
 * directory dp.  Both files and directories are handled: the transaction
 * type and log count are chosen from ip's mode, and a directory is only
 * removed when its link count is exactly 2 and xfs_dir_isempty() agrees.
 *
 * dp    - parent directory inode
 * name  - name of the entry to remove
 * ip    - inode the entry refers to
 *
 * Returns 0 on success or a positive errno value (EIO on a shut down
 * filesystem, ENOTEMPTY for a non-empty directory, otherwise whatever a
 * helper returned).  Once the DM_EVENT_REMOVE pre-event has been handled,
 * every exit goes through std_return, where DM_EVENT_POSTREMOVE is
 * reported together with the final error code.
 */
1940 int
1941 xfs_remove(
1942         xfs_inode_t             *dp,
1943         struct xfs_name         *name,
1944         xfs_inode_t             *ip)
1945 {
1946         xfs_mount_t             *mp = dp->i_mount;
1947         xfs_trans_t             *tp = NULL;
1948         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
1949         int                     error = 0;
1950         xfs_bmap_free_t         free_list;
1951         xfs_fsblock_t           first_block;
1952         int                     cancel_flags;
1953         int                     committed;
1954         int                     link_zero;
1955         uint                    resblks;
1956         uint                    log_count;
1957
1958         xfs_itrace_entry(dp);
1959         xfs_itrace_entry(ip);
1960
1961         if (XFS_FORCED_SHUTDOWN(mp))
1962                 return XFS_ERROR(EIO);
1963
1964         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
1965                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
1966                                         NULL, DM_RIGHT_NULL, name->name, NULL,
1967                                         ip->i_d.di_mode, 0, 0);
1968                 if (error)
1969                         return error;
1970         }
1971
1972         error = XFS_QM_DQATTACH(mp, dp, 0);
1973         if (error)
1974                 goto std_return;
1975
1976         error = XFS_QM_DQATTACH(mp, ip, 0);
1977         if (error)
1978                 goto std_return;
1979
1980         if (is_dir) {
1981                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
1982                 log_count = XFS_DEFAULT_LOG_COUNT;
1983         } else {
1984                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
1985                 log_count = XFS_REMOVE_LOG_COUNT;
1986         }
1987         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1988
1989         /*
1990          * We try to get the real space reservation first,
1991          * allowing for directory btree deletion(s) implying
1992          * possible bmap insert(s).  If we can't get the space
1993          * reservation then we use 0 instead, and avoid the bmap
1994          * btree insert(s) in the directory code by, if the bmap
1995          * insert tries to happen, instead trimming the LAST
1996          * block from the directory.
1997          */
1998         resblks = XFS_REMOVE_SPACE_RES(mp);
1999         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2000                                   XFS_TRANS_PERM_LOG_RES, log_count);
2001         if (error == ENOSPC) {
2002                 resblks = 0;
2003                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2004                                           XFS_TRANS_PERM_LOG_RES, log_count);
2005         }
2006         if (error) {
2007                 ASSERT(error != ENOSPC);
                     /* Nothing was reserved, so there is nothing to release. */
2008                 cancel_flags = 0;
2009                 goto out_trans_cancel;
2010         }
2011
2012         error = xfs_lock_dir_and_entry(dp, ip);
2013         if (error)
2014                 goto out_trans_cancel;
2015
2016         /*
2017          * At this point, we've gotten both the directory and the entry
2018          * inodes locked.
              *
              * NOTE(review): the hold and join statements below are paired
              * crosswise (hold ip / join dp, then hold dp / join ip).  The net
              * effect is still one IHOLD and one xfs_trans_ijoin per inode.
2019          */
2020         IHOLD(ip);
2021         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2022
2023         IHOLD(dp);
2024         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2025
2026         /*
2027          * If we're removing a directory perform some additional validation.
2028          */
2029         if (is_dir) {
2030                 ASSERT(ip->i_d.di_nlink >= 2);
2031                 if (ip->i_d.di_nlink != 2) {
2032                         error = XFS_ERROR(ENOTEMPTY);
2033                         goto out_trans_cancel;
2034                 }
2035                 if (!xfs_dir_isempty(ip)) {
2036                         error = XFS_ERROR(ENOTEMPTY);
2037                         goto out_trans_cancel;
2038                 }
2039         }
2040
2041         /*
2042          * The caller handed us the entry's inode, so the name is expected
              * to exist in dp; ENOENT from the removal is not expected here
              * (see the ASSERT).  xfs_lock_dir_and_entry itself does no lookup.
2043          */
2044         XFS_BMAP_INIT(&free_list, &first_block);
2045         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2046                                         &first_block, &free_list, resblks);
2047         if (error) {
2048                 ASSERT(error != ENOENT);
2049                 goto out_bmap_cancel;
2050         }
2051         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2052
2053         /*
2054          * Bump the in memory generation count on the parent
2055          * directory so that others can know that it has changed.
2056          */
2057         dp->i_gen++;
2058         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2059
2060         if (is_dir) {
2061                 /*
2062                  * Drop the link from ip's "..".
2063                  */
2064                 error = xfs_droplink(tp, dp);
2065                 if (error)
2066                         goto out_bmap_cancel;
2067
2068                 /*
2069                  * Drop the link from dp to ip.
2070                  */
2071                 error = xfs_droplink(tp, ip);
2072                 if (error)
2073                         goto out_bmap_cancel;
2074         } else {
2075                 /*
2076                  * When removing a non-directory we need to log the parent
2077                  * inode here for the i_gen update.  For a directory this is
2078                  * done implicitly by the xfs_droplink call for the ".." entry.
                      *
                      * NOTE(review): dp was already logged right after the
                      * i_gen bump above, so this call looks redundant - confirm.
2079                  */
2080                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2081         }
2082
2083         /*
2084          * Drop the remaining link on ip: for a directory this is its "."
              * self-link, for anything else it is the link from dp.
2085          */
2086         error = xfs_droplink(tp, ip);
2087         if (error)
2088                 goto out_bmap_cancel;
2089
2090         /*
2091          * Determine if this is the last link while
2092          * we are in the transaction.
2093          */
2094         link_zero = (ip->i_d.di_nlink == 0);
2095
2096         /*
2097          * If this is a synchronous mount, make sure that the
2098          * remove transaction goes to disk before returning to
2099          * the user.
2100          */
2101         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2102                 xfs_trans_set_sync(tp);
2103
2104         error = xfs_bmap_finish(&tp, &free_list, &committed);
2105         if (error)
2106                 goto out_bmap_cancel;
2107
2108         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2109         if (error)
2110                 goto std_return;
2111
2112         /*
2113          * If we are using filestreams, kill the stream association.
2114          * If the file is still open it may get a new one but that
2115          * will get killed on last close in xfs_close() so we don't
2116          * have to worry about that.
2117          */
2118         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2119                 xfs_filestream_deassociate(ip);
2120
2121         xfs_itrace_exit(ip);
2122         xfs_itrace_exit(dp);
2123
2124  std_return:
             /* Report the outcome (error may be 0) to any post-remove listener. */
2125         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2126                 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
2127                                 NULL, DM_RIGHT_NULL, name->name, NULL,
2128                                 ip->i_d.di_mode, error, 0);
2129         }
2130
2131         return error;
2132
2133  out_bmap_cancel:
             /* The transaction may already be dirty here, so cancel with abort. */
2134         xfs_bmap_cancel(&free_list);
2135         cancel_flags |= XFS_TRANS_ABORT;
2136  out_trans_cancel:
2137         xfs_trans_cancel(tp, cancel_flags);
2138         goto std_return;
2139 }
2140
/*
 * Create a hard link named 'target_name' in the directory tdp that refers
 * to the existing, non-directory inode sip.
 *
 * tdp          - directory that receives the new entry
 * sip          - inode being linked to; must not be a directory (ASSERT)
 * target_name  - name of the new directory entry
 *
 * Returns 0 on success or a positive errno value: EIO on a shut down
 * filesystem, EMLINK when sip already has XFS_MAXLINK links, EXDEV when
 * tdp has XFS_DIFLAG_PROJINHERIT set and the project ids differ, or
 * whatever a helper returned.  After the DM_EVENT_LINK pre-event has been
 * handled, every exit goes through std_return, where the post-link event
 * is reported with the final error code.
 */
2141 int
2142 xfs_link(
2143         xfs_inode_t             *tdp,
2144         xfs_inode_t             *sip,
2145         struct xfs_name         *target_name)
2146 {
2147         xfs_mount_t             *mp = tdp->i_mount;
2148         xfs_trans_t             *tp;
2149         xfs_inode_t             *ips[2];
2150         int                     error;
2151         xfs_bmap_free_t         free_list;
2152         xfs_fsblock_t           first_block;
2153         int                     cancel_flags;
2154         int                     committed;
2155         int                     resblks;
2156
2157         xfs_itrace_entry(tdp);
2158         xfs_itrace_entry(sip);
2159
2160         ASSERT(!S_ISDIR(sip->i_d.di_mode));
2161
2162         if (XFS_FORCED_SHUTDOWN(mp))
2163                 return XFS_ERROR(EIO);
2164
2165         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2166                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2167                                         tdp, DM_RIGHT_NULL,
2168                                         sip, DM_RIGHT_NULL,
2169                                         target_name->name, NULL, 0, 0, 0);
2170                 if (error)
2171                         return error;
2172         }
2173
2174         /* Return through std_return after this point. */
2175
2176         error = XFS_QM_DQATTACH(mp, sip, 0);
2177         if (!error && sip != tdp)
2178                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2179         if (error)
2180                 goto std_return;
2181
2182         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2183         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2184         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
2185         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2186                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2187         if (error == ENOSPC) {
                     /* No room for a block reservation: retry with none. */
2188                 resblks = 0;
2189                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2190                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2191         }
2192         if (error) {
                     /* Nothing was reserved, so there is nothing to release. */
2193                 cancel_flags = 0;
2194                 goto error_return;
2195         }
2196
             /* xfs_lock_inodes() expects its array in increasing i_ino order. */
2197         if (sip->i_ino < tdp->i_ino) {
2198                 ips[0] = sip;
2199                 ips[1] = tdp;
2200         } else {
2201                 ips[0] = tdp;
2202                 ips[1] = sip;
2203         }
2204
2205         xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
2206
2207         /*
2208          * Increment vnode ref counts since xfs_trans_commit &
2209          * xfs_trans_cancel will both unlock the inodes and
2210          * decrement the associated ref counts.
2211          */
2212         IHOLD(sip);
2213         IHOLD(tdp);
2214         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2215         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2216
2217         /*
2218          * If the source has too many links, we can't make any more to it.
2219          */
2220         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2221                 error = XFS_ERROR(EMLINK);
2222                 goto error_return;
2223         }
2224
2225         /*
2226          * If we are using project inheritance, we only allow hard link
2227          * creation in our tree when the project IDs are the same; else
2228          * the tree quota mechanism could be circumvented.
2229          */
2230         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2231                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2232                 error = XFS_ERROR(EXDEV);
2233                 goto error_return;
2234         }
2235
2236         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
2237         if (error)
2238                 goto error_return;
2239
2240         XFS_BMAP_INIT(&free_list, &first_block);
2241
2242         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2243                                         &first_block, &free_list, resblks);
2244         if (error)
2245                 goto abort_return;
2246         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2247         tdp->i_gen++;
2248         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2249
2250         error = xfs_bumplink(tp, sip);
2251         if (error)
2252                 goto abort_return;
2253
2254         /*
2255          * If this is a synchronous mount, make sure that the
2256          * link transaction goes to disk before returning to
2257          * the user.
2258          */
2259         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2260                 xfs_trans_set_sync(tp);
2261         }
2262
2263         error = xfs_bmap_finish (&tp, &free_list, &committed);
2264         if (error) {
2265                 xfs_bmap_cancel(&free_list);
2266                 goto abort_return;
2267         }
2268
2269         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2270         if (error)
2271                 goto std_return;
2272
2273         /* Fall through to std_return with error = 0. */
2274 std_return:
             /*
              * NOTE(review): the pre-event above is gated on tdp while this
              * post-event is gated on sip - confirm that is intended.
              */
2275         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2276                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2277                                 tdp, DM_RIGHT_NULL,
2278                                 sip, DM_RIGHT_NULL,
2279                                 target_name->name, NULL, 0, error, 0);
2280         }
2281         return error;
2282
2283  abort_return:
2284         cancel_flags |= XFS_TRANS_ABORT;
2285         /* FALLTHROUGH */
2286
2287  error_return:
2288         xfs_trans_cancel(tp, cancel_flags);
2289         goto std_return;
2290 }
2291
2292
2293 int
2294 xfs_mkdir(
2295         xfs_inode_t             *dp,
2296         struct xfs_name         *dir_name,
2297         mode_t                  mode,
2298         xfs_inode_t             **ipp,
2299         cred_t                  *credp)
2300 {
2301         xfs_mount_t             *mp = dp->i_mount;
2302         xfs_inode_t             *cdp;   /* inode of created dir */
2303         xfs_trans_t             *tp;
2304         int                     cancel_flags;
2305         int                     error;
2306         int                     committed;
2307         xfs_bmap_free_t         free_list;
2308         xfs_fsblock_t           first_block;
2309         boolean_t               unlock_dp_on_error = B_FALSE;
2310         boolean_t               created = B_FALSE;
2311         int                     dm_event_sent = 0;
2312         xfs_prid_t              prid;
2313         struct xfs_dquot        *udqp, *gdqp;
2314         uint                    resblks;
2315
2316         if (XFS_FORCED_SHUTDOWN(mp))
2317                 return XFS_ERROR(EIO);
2318
2319         tp = NULL;
2320
2321         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2322                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2323                                         dp, DM_RIGHT_NULL, NULL,
2324                                         DM_RIGHT_NULL, dir_name->name, NULL,
2325                                         mode, 0, 0);
2326                 if (error)
2327                         return error;
2328                 dm_event_sent = 1;
2329         }
2330
2331         /* Return through std_return after this point. */
2332
2333         xfs_itrace_entry(dp);
2334
2335         mp = dp->i_mount;
2336         udqp = gdqp = NULL;
2337         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2338                 prid = dp->i_d.di_projid;
2339         else
2340                 prid = (xfs_prid_t)dfltprid;
2341
2342         /*
2343          * Make sure that we have allocated dquot(s) on disk.
2344          */
2345         error = XFS_QM_DQVOPALLOC(mp, dp,
2346                         current_fsuid(credp), current_fsgid(credp), prid,
2347                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2348         if (error)
2349                 goto std_return;
2350
2351         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2352         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2353         resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2354         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2355                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2356         if (error == ENOSPC) {
2357                 resblks = 0;
2358                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2359                                           XFS_TRANS_PERM_LOG_RES,
2360                                           XFS_MKDIR_LOG_COUNT);
2361         }
2362         if (error) {
2363                 cancel_flags = 0;
2364                 goto error_return;
2365         }
2366
2367         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2368         unlock_dp_on_error = B_TRUE;
2369
2370         /*
2371          * Check for directory link count overflow.
2372          */
2373         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2374                 error = XFS_ERROR(EMLINK);
2375                 goto error_return;
2376         }
2377
2378         /*
2379          * Reserve disk quota and the inode.
2380          */
2381         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2382         if (error)
2383                 goto error_return;
2384
2385         error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2386         if (error)
2387                 goto error_return;
2388         /*
2389          * create the directory inode.
2390          */
2391         error = xfs_dir_ialloc(&tp, dp, mode, 2,
2392                         0, credp, prid, resblks > 0,
2393                 &cdp, NULL);
2394         if (error) {
2395                 if (error == ENOSPC)
2396                         goto error_return;
2397                 goto abort_return;
2398         }
2399         xfs_itrace_ref(cdp);
2400
2401         /*
2402          * Now we add the directory inode to the transaction.
2403          * We waited until now since xfs_dir_ialloc might start
2404          * a new transaction.  Had we joined the transaction
2405          * earlier, the locks might have gotten released. An error
2406          * from here on will result in the transaction cancel
2407          * unlocking dp so don't do it explicitly in the error path.
2408          */
2409         IHOLD(dp);
2410         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2411         unlock_dp_on_error = B_FALSE;
2412
2413         XFS_BMAP_INIT(&free_list, &first_block);
2414
2415         error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2416                                         &first_block, &free_list, resblks ?
2417                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2418         if (error) {
2419                 ASSERT(error != ENOSPC);
2420                 goto error1;
2421         }
2422         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2423
2424         /*
2425          * Bump the in memory version number of the parent directory
2426          * so that other processes accessing it will recognize that
2427          * the directory has changed.
2428          */
2429         dp->i_gen++;
2430
2431         error = xfs_dir_init(tp, cdp, dp);
2432         if (error)
2433                 goto error2;
2434
2435         cdp->i_gen = 1;
2436         error = xfs_bumplink(tp, dp);
2437         if (error)
2438                 goto error2;
2439
2440         created = B_TRUE;
2441
2442         *ipp = cdp;
2443         IHOLD(cdp);
2444
2445         /*
2446          * Attach the dquots to the new inode and modify the icount incore.
2447          */
2448         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2449
2450         /*
2451          * If this is a synchronous mount, make sure that the
2452          * mkdir transaction goes to disk before returning to
2453          * the user.
2454          */
2455         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2456                 xfs_trans_set_sync(tp);
2457         }
2458
2459         error = xfs_bmap_finish(&tp, &free_list, &committed);
2460         if (error) {
2461                 IRELE(cdp);
2462                 goto error2;
2463         }
2464
2465         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2466         XFS_QM_DQRELE(mp, udqp);
2467         XFS_QM_DQRELE(mp, gdqp);
2468         if (error) {
2469                 IRELE(cdp);
2470         }
2471
2472         /* Fall through to std_return with error = 0 or errno from
2473          * xfs_trans_commit. */
2474
2475 std_return:
2476         if ((created || (error != 0 && dm_event_sent != 0)) &&
2477             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2478                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2479                                         dp, DM_RIGHT_NULL,
2480                                         created ? cdp : NULL,
2481                                         DM_RIGHT_NULL,
2482                                         dir_name->name, NULL,
2483                                         mode, error, 0);
2484         }
2485         return error;
2486
2487  error2:
2488  error1:
2489         xfs_bmap_cancel(&free_list);
2490  abort_return:
2491         cancel_flags |= XFS_TRANS_ABORT;
2492  error_return:
2493         xfs_trans_cancel(tp, cancel_flags);
2494         XFS_QM_DQRELE(mp, udqp);
2495         XFS_QM_DQRELE(mp, gdqp);
2496
2497         if (unlock_dp_on_error)
2498                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2499
2500         goto std_return;
2501 }
2502
2503 int
2504 xfs_symlink(
2505         xfs_inode_t             *dp,
2506         struct xfs_name         *link_name,
2507         const char              *target_path,
2508         mode_t                  mode,
2509         xfs_inode_t             **ipp,
2510         cred_t                  *credp)
2511 {
2512         xfs_mount_t             *mp = dp->i_mount;
2513         xfs_trans_t             *tp;
2514         xfs_inode_t             *ip;
2515         int                     error;
2516         int                     pathlen;
2517         xfs_bmap_free_t         free_list;
2518         xfs_fsblock_t           first_block;
2519         boolean_t               unlock_dp_on_error = B_FALSE;
2520         uint                    cancel_flags;
2521         int                     committed;
2522         xfs_fileoff_t           first_fsb;
2523         xfs_filblks_t           fs_blocks;
2524         int                     nmaps;
2525         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
2526         xfs_daddr_t             d;
2527         const char              *cur_chunk;
2528         int                     byte_cnt;
2529         int                     n;
2530         xfs_buf_t               *bp;
2531         xfs_prid_t              prid;
2532         struct xfs_dquot        *udqp, *gdqp;
2533         uint                    resblks;
2534
2535         *ipp = NULL;
2536         error = 0;
2537         ip = NULL;
2538         tp = NULL;
2539
2540         xfs_itrace_entry(dp);
2541
2542         if (XFS_FORCED_SHUTDOWN(mp))
2543                 return XFS_ERROR(EIO);
2544
2545         /*
2546          * Check component lengths of the target path name.
2547          */
2548         pathlen = strlen(target_path);
2549         if (pathlen >= MAXPATHLEN)      /* total string too long */
2550                 return XFS_ERROR(ENAMETOOLONG);
2551
2552         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2553                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2554                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2555                                         link_name->name, target_path, 0, 0, 0);
2556                 if (error)
2557                         return error;
2558         }
2559
2560         /* Return through std_return after this point. */
2561
2562         udqp = gdqp = NULL;
2563         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2564                 prid = dp->i_d.di_projid;
2565         else
2566                 prid = (xfs_prid_t)dfltprid;
2567
2568         /*
2569          * Make sure that we have allocated dquot(s) on disk.
2570          */
2571         error = XFS_QM_DQVOPALLOC(mp, dp,
2572                         current_fsuid(credp), current_fsgid(credp), prid,
2573                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2574         if (error)
2575                 goto std_return;
2576
2577         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
2578         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2579         /*
2580          * The symlink will fit into the inode data fork?
2581          * There can't be any attributes so we get the whole variable part.
2582          */
2583         if (pathlen <= XFS_LITINO(mp))
2584                 fs_blocks = 0;
2585         else
2586                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
2587         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
2588         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
2589                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
2590         if (error == ENOSPC && fs_blocks == 0) {
2591                 resblks = 0;
2592                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
2593                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
2594         }
2595         if (error) {
2596                 cancel_flags = 0;
2597                 goto error_return;
2598         }
2599
2600         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2601         unlock_dp_on_error = B_TRUE;
2602
2603         /*
2604          * Check whether the directory allows new symlinks or not.
2605          */
2606         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
2607                 error = XFS_ERROR(EPERM);
2608                 goto error_return;
2609         }
2610
2611         /*
2612          * Reserve disk quota : blocks and inode.
2613          */
2614         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2615         if (error)
2616                 goto error_return;
2617
2618         /*
2619          * Check for ability to enter directory entry, if no space reserved.
2620          */
2621         error = xfs_dir_canenter(tp, dp, link_name, resblks);
2622         if (error)
2623                 goto error_return;
2624         /*
2625          * Initialize the bmap freelist prior to calling either
2626          * bmapi or the directory create code.
2627          */
2628         XFS_BMAP_INIT(&free_list, &first_block);
2629
2630         /*
2631          * Allocate an inode for the symlink.
2632          */
2633         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
2634                                1, 0, credp, prid, resblks > 0, &ip, NULL);
2635         if (error) {
2636                 if (error == ENOSPC)
2637                         goto error_return;
2638                 goto error1;
2639         }
2640         xfs_itrace_ref(ip);
2641
2642         /*
2643          * An error after we've joined dp to the transaction will result in the
2644          * transaction cancel unlocking dp so don't do it explicitly in the
2645          * error path.
2646          */
2647         IHOLD(dp);
2648         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2649         unlock_dp_on_error = B_FALSE;
2650
2651         /*
2652          * Also attach the dquot(s) to it, if applicable.
2653          */
2654         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2655
2656         if (resblks)
2657                 resblks -= XFS_IALLOC_SPACE_RES(mp);
2658         /*
2659          * If the symlink will fit into the inode, write it inline.
2660          */
2661         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
2662                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
2663                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
2664                 ip->i_d.di_size = pathlen;
2665
2666                 /*
2667                  * The inode was initially created in extent format.
2668                  */
2669                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
2670                 ip->i_df.if_flags |= XFS_IFINLINE;
2671
2672                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
2673                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
2674
2675         } else {
2676                 first_fsb = 0;
2677                 nmaps = SYMLINK_MAPS;
2678
2679                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
2680                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2681                                   &first_block, resblks, mval, &nmaps,
2682                                   &free_list, NULL);
2683                 if (error) {
2684                         goto error1;
2685                 }
2686
2687                 if (resblks)
2688                         resblks -= fs_blocks;
2689                 ip->i_d.di_size = pathlen;
2690                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2691
2692                 cur_chunk = target_path;
2693                 for (n = 0; n < nmaps; n++) {
2694                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
2695                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
2696                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
2697                                                BTOBB(byte_cnt), 0);
2698                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
2699                         if (pathlen < byte_cnt) {
2700                                 byte_cnt = pathlen;
2701                         }
2702                         pathlen -= byte_cnt;
2703
2704                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
2705                         cur_chunk += byte_cnt;
2706
2707                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
2708                 }
2709         }
2710
2711         /*
2712          * Create the directory entry for the symlink.
2713          */
2714         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2715                                         &first_block, &free_list, resblks);
2716         if (error)
2717                 goto error1;
2718         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2719         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2720
2721         /*
2722          * Bump the in memory version number of the parent directory
2723          * so that other processes accessing it will recognize that
2724          * the directory has changed.
2725          */
2726         dp->i_gen++;
2727
2728         /*
2729          * If this is a synchronous mount, make sure that the
2730          * symlink transaction goes to disk before returning to
2731          * the user.
2732          */
2733         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2734                 xfs_trans_set_sync(tp);
2735         }
2736
2737         /*
2738          * xfs_trans_commit normally decrements the vnode ref count
2739          * when it unlocks the inode. Since we want to return the
2740          * vnode to the caller, we bump the vnode ref count now.
2741          */
2742         IHOLD(ip);
2743
2744         error = xfs_bmap_finish(&tp, &free_list, &committed);
2745         if (error) {
2746                 goto error2;
2747         }
2748         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2749         XFS_QM_DQRELE(mp, udqp);
2750         XFS_QM_DQRELE(mp, gdqp);
2751
2752         /* Fall through to std_return with error = 0 or errno from
2753          * xfs_trans_commit     */
2754 std_return:
2755         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
2756                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
2757                                         dp, DM_RIGHT_NULL,
2758                                         error ? NULL : ip,
2759                                         DM_RIGHT_NULL, link_name->name,
2760                                         target_path, 0, error, 0);
2761         }
2762
2763         if (!error)
2764                 *ipp = ip;
2765         return error;
2766
2767  error2:
2768         IRELE(ip);
2769  error1:
2770         xfs_bmap_cancel(&free_list);
2771         cancel_flags |= XFS_TRANS_ABORT;
2772  error_return:
2773         xfs_trans_cancel(tp, cancel_flags);
2774         XFS_QM_DQRELE(mp, udqp);
2775         XFS_QM_DQRELE(mp, gdqp);
2776
2777         if (unlock_dp_on_error)
2778                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2779
2780         goto std_return;
2781 }
2782
2783 int
2784 xfs_inode_flush(
2785         xfs_inode_t     *ip,
2786         int             flags)
2787 {
2788         xfs_mount_t     *mp = ip->i_mount;
2789         int             error = 0;
2790
2791         if (XFS_FORCED_SHUTDOWN(mp))
2792                 return XFS_ERROR(EIO);
2793
2794         /*
2795          * Bypass inodes which have already been cleaned by
2796          * the inode flush clustering code inside xfs_iflush
2797          */
2798         if (xfs_inode_clean(ip))
2799                 return 0;
2800
2801         /*
2802          * We make this non-blocking if the inode is contended,
2803          * return EAGAIN to indicate to the caller that they
2804          * did not succeed. This prevents the flush path from
2805          * blocking on inodes inside another operation right
2806          * now, they get caught later by xfs_sync.
2807          */
2808         if (flags & FLUSH_SYNC) {
2809                 xfs_ilock(ip, XFS_ILOCK_SHARED);
2810                 xfs_iflock(ip);
2811         } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
2812                 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
2813                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2814                         return EAGAIN;
2815                 }
2816         } else {
2817                 return EAGAIN;
2818         }
2819
2820         error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
2821                                                     : XFS_IFLUSH_ASYNC_NOBLOCK);
2822         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2823
2824         return error;
2825 }
2826
2827
2828 int
2829 xfs_set_dmattrs(
2830         xfs_inode_t     *ip,
2831         u_int           evmask,
2832         u_int16_t       state)
2833 {
2834         xfs_mount_t     *mp = ip->i_mount;
2835         xfs_trans_t     *tp;
2836         int             error;
2837
2838         if (!capable(CAP_SYS_ADMIN))
2839                 return XFS_ERROR(EPERM);
2840
2841         if (XFS_FORCED_SHUTDOWN(mp))
2842                 return XFS_ERROR(EIO);
2843
2844         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
2845         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
2846         if (error) {
2847                 xfs_trans_cancel(tp, 0);
2848                 return error;
2849         }
2850         xfs_ilock(ip, XFS_ILOCK_EXCL);
2851         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2852
2853         ip->i_d.di_dmevmask = evmask;
2854         ip->i_d.di_dmstate  = state;
2855
2856         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2857         IHOLD(ip);
2858         error = xfs_trans_commit(tp, 0);
2859
2860         return error;
2861 }
2862
2863 int
2864 xfs_reclaim(
2865         xfs_inode_t     *ip)
2866 {
2867         bhv_vnode_t     *vp = XFS_ITOV(ip);
2868
2869         xfs_itrace_entry(ip);
2870
2871         ASSERT(!VN_MAPPED(vp));
2872
2873         /* bad inode, get out here ASAP */
2874         if (VN_BAD(vp)) {
2875                 xfs_ireclaim(ip);
2876                 return 0;
2877         }
2878
2879         vn_iowait(ip);
2880
2881         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2882
2883         /*
2884          * Make sure the atime in the XFS inode is correct before freeing the
2885          * Linux inode.
2886          */
2887         xfs_synchronize_atime(ip);
2888
2889         /*
2890          * If we have nothing to flush with this inode then complete the
2891          * teardown now, otherwise break the link between the xfs inode and the
2892          * linux inode and clean up the xfs inode later. This avoids flushing
2893          * the inode to disk during the delete operation itself.
2894          *
2895          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
2896          * first to ensure that xfs_iunpin() will never see an xfs inode
2897          * that has a linux inode being reclaimed. Synchronisation is provided
2898          * by the i_flags_lock.
2899          */
2900         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2901                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2902                 xfs_iflock(ip);
2903                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2904         } else {
2905                 xfs_mount_t     *mp = ip->i_mount;
2906
2907                 /* Protect sync and unpin from us */
2908                 XFS_MOUNT_ILOCK(mp);
2909                 spin_lock(&ip->i_flags_lock);
2910                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2911                 vn_to_inode(vp)->i_private = NULL;
2912                 ip->i_vnode = NULL;
2913                 spin_unlock(&ip->i_flags_lock);
2914                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
2915                 XFS_MOUNT_IUNLOCK(mp);
2916         }
2917         return 0;
2918 }
2919
2920 int
2921 xfs_finish_reclaim(
2922         xfs_inode_t     *ip,
2923         int             locked,
2924         int             sync_mode)
2925 {
2926         xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2927         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
2928
2929         if (vp && VN_BAD(vp))
2930                 goto reclaim;
2931
2932         /* The hash lock here protects a thread in xfs_iget_core from
2933          * racing with us on linking the inode back with a vnode.
2934          * Once we have the XFS_IRECLAIM flag set it will not touch
2935          * us.
2936          */
2937         write_lock(&pag->pag_ici_lock);
2938         spin_lock(&ip->i_flags_lock);
2939         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
2940             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
2941                 spin_unlock(&ip->i_flags_lock);
2942                 write_unlock(&pag->pag_ici_lock);
2943                 if (locked) {
2944                         xfs_ifunlock(ip);
2945                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2946                 }
2947                 return 1;
2948         }
2949         __xfs_iflags_set(ip, XFS_IRECLAIM);
2950         spin_unlock(&ip->i_flags_lock);
2951         write_unlock(&pag->pag_ici_lock);
2952         xfs_put_perag(ip->i_mount, pag);
2953
2954         /*
2955          * If the inode is still dirty, then flush it out.  If the inode
2956          * is not in the AIL, then it will be OK to flush it delwri as
2957          * long as xfs_iflush() does not keep any references to the inode.
2958          * We leave that decision up to xfs_iflush() since it has the
2959          * knowledge of whether it's OK to simply do a delwri flush of
2960          * the inode or whether we need to wait until the inode is
2961          * pulled from the AIL.
2962          * We get the flush lock regardless, though, just to make sure
2963          * we don't free it while it is being flushed.
2964          */
2965         if (!locked) {
2966                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2967                 xfs_iflock(ip);
2968         }
2969
2970         /*
2971          * In the case of a forced shutdown we rely on xfs_iflush() to
2972          * wait for the inode to be unpinned before returning an error.
2973          */
2974         if (xfs_iflush(ip, sync_mode) == 0) {
2975                 /* synchronize with xfs_iflush_done */
2976                 xfs_iflock(ip);
2977                 xfs_ifunlock(ip);
2978         }
2979
2980         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2981
2982  reclaim:
2983         xfs_ireclaim(ip);
2984         return 0;
2985 }
2986
2987 int
2988 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
2989 {
2990         int             purged;
2991         xfs_inode_t     *ip, *n;
2992         int             done = 0;
2993
2994         while (!done) {
2995                 purged = 0;
2996                 XFS_MOUNT_ILOCK(mp);
2997                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
2998                         if (noblock) {
2999                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3000                                         continue;
3001                                 if (xfs_ipincount(ip) ||
3002                                     !xfs_iflock_nowait(ip)) {
3003                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3004                                         continue;
3005                                 }
3006                         }
3007                         XFS_MOUNT_IUNLOCK(mp);
3008                         if (xfs_finish_reclaim(ip, noblock,
3009                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3010                                 delay(1);
3011                         purged = 1;
3012                         break;
3013                 }
3014
3015                 done = !purged;
3016         }
3017
3018         XFS_MOUNT_IUNLOCK(mp);
3019         return 0;
3020 }
3021
3022 /*
3023  * xfs_alloc_file_space()
3024  *      This routine allocates disk space for the given file.
3025  *
3026  *      If alloc_type == 0, this request is for an ALLOCSP type
3027  *      request which will change the file size.  In this case, no
3028  *      DMAPI event will be generated by the call.  A TRUNCATE event
3029  *      will be generated later by xfs_setattr.
3030  *
3031  *      If alloc_type != 0, this request is for a RESVSP type
3032  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3033  *      lower block boundary byte address is less than the file's
3034  *      length.
3035  *
3036  * RETURNS:
3037  *       0 on success
3038  *      errno on error
3039  *
3040  */
3041 STATIC int
3042 xfs_alloc_file_space(
3043         xfs_inode_t             *ip,
3044         xfs_off_t               offset,
3045         xfs_off_t               len,
3046         int                     alloc_type,
3047         int                     attr_flags)
3048 {
3049         xfs_mount_t             *mp = ip->i_mount;
3050         xfs_off_t               count;
3051         xfs_filblks_t           allocated_fsb;
3052         xfs_filblks_t           allocatesize_fsb;
3053         xfs_extlen_t            extsz, temp;
3054         xfs_fileoff_t           startoffset_fsb;
3055         xfs_fsblock_t           firstfsb;
3056         int                     nimaps;
3057         int                     bmapi_flag;
3058         int                     quota_flag;
3059         int                     rt;
3060         xfs_trans_t             *tp;
3061         xfs_bmbt_irec_t         imaps[1], *imapp;
3062         xfs_bmap_free_t         free_list;
3063         uint                    qblocks, resblks, resrtextents;
3064         int                     committed;
3065         int                     error;
3066
3067         xfs_itrace_entry(ip);
3068
3069         if (XFS_FORCED_SHUTDOWN(mp))
3070                 return XFS_ERROR(EIO);
3071
3072         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3073                 return error;
3074
3075         if (len <= 0)
3076                 return XFS_ERROR(EINVAL);
3077
3078         rt = XFS_IS_REALTIME_INODE(ip);
3079         extsz = xfs_get_extsz_hint(ip);
3080
3081         count = len;
3082         imapp = &imaps[0];
3083         nimaps = 1;
3084         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3085         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3086         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3087
3088         /*      Generate a DMAPI event if needed.       */
3089         if (alloc_type != 0 && offset < ip->i_size &&
3090                         (attr_flags & XFS_ATTR_DMI) == 0  &&
3091                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3092                 xfs_off_t           end_dmi_offset;
3093
3094                 end_dmi_offset = offset+len;
3095                 if (end_dmi_offset > ip->i_size)
3096                         end_dmi_offset = ip->i_size;
3097                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
3098                                       end_dmi_offset - offset, 0, NULL);
3099                 if (error)
3100                         return error;
3101         }
3102
3103         /*
3104          * Allocate file space until done or until there is an error
3105          */
3106 retry:
3107         while (allocatesize_fsb && !error) {
3108                 xfs_fileoff_t   s, e;
3109
3110                 /*
3111                  * Determine space reservations for data/realtime.
3112                  */
3113                 if (unlikely(extsz)) {
3114                         s = startoffset_fsb;
3115                         do_div(s, extsz);
3116                         s *= extsz;
3117                         e = startoffset_fsb + allocatesize_fsb;
3118                         if ((temp = do_mod(startoffset_fsb, extsz)))
3119                                 e += temp;
3120                         if ((temp = do_mod(e, extsz)))
3121                                 e += extsz - temp;
3122                 } else {
3123                         s = 0;
3124                         e = allocatesize_fsb;
3125                 }
3126
3127                 if (unlikely(rt)) {
3128                         resrtextents = qblocks = (uint)(e - s);
3129                         resrtextents /= mp->m_sb.sb_rextsize;
3130                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3131                         quota_flag = XFS_QMOPT_RES_RTBLKS;
3132                 } else {
3133                         resrtextents = 0;
3134                         resblks = qblocks = \
3135                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3136                         quota_flag = XFS_QMOPT_RES_REGBLKS;
3137                 }
3138
3139                 /*
3140                  * Allocate and setup the transaction.
3141                  */
3142                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3143                 error = xfs_trans_reserve(tp, resblks,
3144                                           XFS_WRITE_LOG_RES(mp), resrtextents,
3145                                           XFS_TRANS_PERM_LOG_RES,
3146                                           XFS_WRITE_LOG_COUNT);
3147                 /*
3148                  * Check for running out of space
3149                  */
3150                 if (error) {
3151                         /*
3152                          * Free the transaction structure.
3153                          */
3154                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3155                         xfs_trans_cancel(tp, 0);
3156                         break;
3157                 }
3158                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3159                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3160                                                       qblocks, 0, quota_flag);
3161                 if (error)
3162                         goto error1;
3163
3164                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3165                 xfs_trans_ihold(tp, ip);
3166
3167                 /*
3168                  * Issue the xfs_bmapi() call to allocate the blocks
3169                  */
3170                 XFS_BMAP_INIT(&free_list, &firstfsb);
3171                 error = xfs_bmapi(tp, ip, startoffset_fsb,
3172                                   allocatesize_fsb, bmapi_flag,
3173                                   &firstfsb, 0, imapp, &nimaps,
3174                                   &free_list, NULL);
3175                 if (error) {
3176                         goto error0;
3177                 }
3178
3179                 /*
3180                  * Complete the transaction
3181                  */
3182                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3183                 if (error) {
3184                         goto error0;
3185                 }
3186
3187                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3188                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3189                 if (error) {
3190                         break;
3191                 }
3192
3193                 allocated_fsb = imapp->br_blockcount;
3194
3195                 if (nimaps == 0) {
3196                         error = XFS_ERROR(ENOSPC);
3197                         break;
3198                 }
3199
3200                 startoffset_fsb += allocated_fsb;
3201                 allocatesize_fsb -= allocated_fsb;
3202         }
3203 dmapi_enospc_check:
3204         if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
3205             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3206                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3207                                 ip, DM_RIGHT_NULL,
3208                                 ip, DM_RIGHT_NULL,
3209                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3210                 if (error == 0)
3211                         goto retry;     /* Maybe DMAPI app. has made space */
3212                 /* else fall through with error from XFS_SEND_DATA */
3213         }
3214
3215         return error;
3216
3217 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
3218         xfs_bmap_cancel(&free_list);
3219         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
3220
3221 error1: /* Just cancel transaction */
3222         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3223         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3224         goto dmapi_enospc_check;
3225 }
3226
3227 /*
3228  * Zero file bytes between startoff and endoff inclusive.
3229  * The iolock is held exclusive and no blocks are buffered.
3230  */
3231 STATIC int
3232 xfs_zero_remaining_bytes(
3233         xfs_inode_t             *ip,
3234         xfs_off_t               startoff,
3235         xfs_off_t               endoff)
3236 {
3237         xfs_bmbt_irec_t         imap;
3238         xfs_fileoff_t           offset_fsb;
3239         xfs_off_t               lastoffset;
3240         xfs_off_t               offset;
3241         xfs_buf_t               *bp;
3242         xfs_mount_t             *mp = ip->i_mount;
3243         int                     nimap;
3244         int                     error = 0;
3245
3246         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3247                                 XFS_IS_REALTIME_INODE(ip) ?
3248                                 mp->m_rtdev_targp : mp->m_ddev_targp);
3249
3250         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3251                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
3252                 nimap = 1;
3253                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
3254                         NULL, 0, &imap, &nimap, NULL, NULL);
3255                 if (error || nimap < 1)
3256                         break;
3257                 ASSERT(imap.br_blockcount >= 1);
3258                 ASSERT(imap.br_startoff == offset_fsb);
3259                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
3260                 if (lastoffset > endoff)
3261                         lastoffset = endoff;
3262                 if (imap.br_startblock == HOLESTARTBLOCK)
3263                         continue;
3264                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3265                 if (imap.br_state == XFS_EXT_UNWRITTEN)
3266                         continue;
3267                 XFS_BUF_UNDONE(bp);
3268                 XFS_BUF_UNWRITE(bp);
3269                 XFS_BUF_READ(bp);
3270                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
3271                 xfsbdstrat(mp, bp);
3272                 error = xfs_iowait(bp);
3273                 if (error) {
3274                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
3275                                           mp, bp, XFS_BUF_ADDR(bp));
3276                         break;
3277                 }
3278                 memset(XFS_BUF_PTR(bp) +
3279                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
3280                       0, lastoffset - offset + 1);
3281                 XFS_BUF_UNDONE(bp);
3282                 XFS_BUF_UNREAD(bp);
3283                 XFS_BUF_WRITE(bp);
3284                 xfsbdstrat(mp, bp);
3285                 error = xfs_iowait(bp);
3286                 if (error) {
3287                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
3288                                           mp, bp, XFS_BUF_ADDR(bp));
3289                         break;
3290                 }
3291         }
3292         xfs_buf_free(bp);
3293         return error;
3294 }
3295
3296 /*
3297  * xfs_free_file_space()
3298  *      This routine frees disk space for the given file.
3299  *
3300  *      This routine is only called by xfs_change_file_space
3301  *      for an UNRESVSP type call.
3302  *
3303  * RETURNS:
3304  *       0 on success
3305  *      errno on error
3306  *
3307  */
3308 STATIC int
3309 xfs_free_file_space(
3310         xfs_inode_t             *ip,
3311         xfs_off_t               offset,
3312         xfs_off_t               len,
3313         int                     attr_flags)
3314 {
3315         bhv_vnode_t             *vp;
3316         int                     committed;
3317         int                     done;
3318         xfs_off_t               end_dmi_offset;
3319         xfs_fileoff_t           endoffset_fsb;
3320         int                     error;
3321         xfs_fsblock_t           firstfsb;
3322         xfs_bmap_free_t         free_list;
3323         xfs_bmbt_irec_t         imap;
3324         xfs_off_t               ioffset;
3325         xfs_extlen_t            mod=0;
3326         xfs_mount_t             *mp;
3327         int                     nimap;
3328         uint                    resblks;
3329         uint                    rounding;
3330         int                     rt;
3331         xfs_fileoff_t           startoffset_fsb;
3332         xfs_trans_t             *tp;
3333         int                     need_iolock = 1;
3334
3335         vp = XFS_ITOV(ip);
3336         mp = ip->i_mount;
3337
3338         xfs_itrace_entry(ip);
3339
3340         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3341                 return error;
3342
3343         error = 0;
3344         if (len <= 0)   /* if nothing being freed */
3345                 return error;
3346         rt = XFS_IS_REALTIME_INODE(ip);
3347         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
3348         end_dmi_offset = offset + len;
3349         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
3350
3351         if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
3352             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3353                 if (end_dmi_offset > ip->i_size)
3354                         end_dmi_offset = ip->i_size;
3355                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
3356                                 offset, end_dmi_offset - offset,
3357                                 AT_DELAY_FLAG(attr_flags), NULL);
3358                 if (error)
3359                         return error;
3360         }
3361
3362         if (attr_flags & XFS_ATTR_NOLOCK)
3363                 need_iolock = 0;
3364         if (need_iolock) {
3365                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3366                 vn_iowait(ip);  /* wait for the completion of any pending DIOs */
3367         }
3368
3369         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
3370         ioffset = offset & ~(rounding - 1);
3371
3372         if (VN_CACHED(vp) != 0) {
3373                 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
3374                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
3375                 if (error)
3376                         goto out_unlock_iolock;
3377         }
3378
3379         /*
3380          * Need to zero the stuff we're not freeing, on disk.
3381          * If its a realtime file & can't use unwritten extents then we
3382          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
3383          * will take care of it for us.
3384          */
3385         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
3386                 nimap = 1;
3387                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
3388                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3389                 if (error)
3390                         goto out_unlock_iolock;
3391                 ASSERT(nimap == 0 || nimap == 1);
3392                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3393                         xfs_daddr_t     block;
3394
3395                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3396                         block = imap.br_startblock;
3397                         mod = do_div(block, mp->m_sb.sb_rextsize);
3398                         if (mod)
3399                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
3400                 }
3401                 nimap = 1;
3402                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
3403                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3404                 if (error)
3405                         goto out_unlock_iolock;
3406                 ASSERT(nimap == 0 || nimap == 1);
3407                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3408                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3409                         mod++;
3410                         if (mod && (mod != mp->m_sb.sb_rextsize))
3411                                 endoffset_fsb -= mod;
3412                 }
3413         }
3414         if ((done = (endoffset_fsb <= startoffset_fsb)))
3415                 /*
3416                  * One contiguous piece to clear
3417                  */
3418                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
3419         else {
3420                 /*
3421                  * Some full blocks, possibly two pieces to clear
3422                  */
3423                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
3424                         error = xfs_zero_remaining_bytes(ip, offset,
3425                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
3426                 if (!error &&
3427                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
3428                         error = xfs_zero_remaining_bytes(ip,
3429                                 XFS_FSB_TO_B(mp, endoffset_fsb),
3430                                 offset + len - 1);
3431         }
3432
3433         /*
3434          * free file space until done or until there is an error
3435          */
3436         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3437         while (!error && !done) {
3438
3439                 /*
3440                  * allocate and setup the transaction. Allow this
3441                  * transaction to dip into the reserve blocks to ensure
3442                  * the freeing of the space succeeds at ENOSPC.
3443                  */
3444                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3445                 tp->t_flags |= XFS_TRANS_RESERVE;
3446                 error = xfs_trans_reserve(tp,
3447                                           resblks,
3448                                           XFS_WRITE_LOG_RES(mp),
3449                                           0,
3450                                           XFS_TRANS_PERM_LOG_RES,
3451                                           XFS_WRITE_LOG_COUNT);
3452
3453                 /*
3454                  * check for running out of space
3455                  */
3456                 if (error) {
3457                         /*
3458                          * Free the transaction structure.
3459                          */
3460                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3461                         xfs_trans_cancel(tp, 0);
3462                         break;
3463                 }
3464                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3465                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
3466                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
3467                                 XFS_QMOPT_RES_REGBLKS);
3468                 if (error)
3469                         goto error1;
3470
3471                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3472                 xfs_trans_ihold(tp, ip);
3473
3474                 /*
3475                  * issue the bunmapi() call to free the blocks
3476                  */
3477                 XFS_BMAP_INIT(&free_list, &firstfsb);
3478                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
3479                                   endoffset_fsb - startoffset_fsb,
3480                                   0, 2, &firstfsb, &free_list, NULL, &done);
3481                 if (error) {
3482                         goto error0;
3483                 }
3484
3485                 /*
3486                  * complete the transaction
3487                  */
3488                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3489                 if (error) {
3490                         goto error0;
3491                 }
3492
3493                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3494                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3495         }
3496
3497  out_unlock_iolock:
3498         if (need_iolock)
3499                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3500         return error;
3501
3502  error0:
3503         xfs_bmap_cancel(&free_list);
3504  error1:
3505         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3506         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
3507                     XFS_ILOCK_EXCL);
3508         return error;
3509 }
3510
3511 /*
3512  * xfs_change_file_space()
3513  *      This routine allocates or frees disk space for the given file.
3514  *      The user specified parameters are checked for alignment and size
3515  *      limitations.
3516  *
3517  * RETURNS:
3518  *       0 on success
3519  *      errno on error
3520  *
3521  */
3522 int
3523 xfs_change_file_space(
3524         xfs_inode_t     *ip,
3525         int             cmd,
3526         xfs_flock64_t   *bf,
3527         xfs_off_t       offset,
3528         cred_t          *credp,
3529         int             attr_flags)
3530 {
3531         xfs_mount_t     *mp = ip->i_mount;
3532         int             clrprealloc;
3533         int             error;
3534         xfs_fsize_t     fsize;
3535         int             setprealloc;
3536         xfs_off_t       startoffset;
3537         xfs_off_t       llen;
3538         xfs_trans_t     *tp;
3539         struct iattr    iattr;
3540
3541         xfs_itrace_entry(ip);
3542
3543         if (!S_ISREG(ip->i_d.di_mode))
3544                 return XFS_ERROR(EINVAL);
3545
3546         switch (bf->l_whence) {
3547         case 0: /*SEEK_SET*/
3548                 break;
3549         case 1: /*SEEK_CUR*/
3550                 bf->l_start += offset;
3551                 break;
3552         case 2: /*SEEK_END*/
3553                 bf->l_start += ip->i_size;
3554                 break;
3555         default:
3556                 return XFS_ERROR(EINVAL);
3557         }
3558
3559         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
3560
3561         if (   (bf->l_start < 0)
3562             || (bf->l_start > XFS_MAXIOFFSET(mp))
3563             || (bf->l_start + llen < 0)
3564             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
3565                 return XFS_ERROR(EINVAL);
3566
3567         bf->l_whence = 0;
3568
3569         startoffset = bf->l_start;
3570         fsize = ip->i_size;
3571
3572         /*
3573          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
3574          * file space.
3575          * These calls do NOT zero the data space allocated to the file,
3576          * nor do they change the file size.
3577          *
3578          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
3579          * space.
3580          * These calls cause the new file data to be zeroed and the file
3581          * size to be changed.
3582          */
3583         setprealloc = clrprealloc = 0;
3584
3585         switch (cmd) {
3586         case XFS_IOC_RESVSP:
3587         case XFS_IOC_RESVSP64:
3588                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
3589                                                                 1, attr_flags);
3590                 if (error)
3591                         return error;
3592                 setprealloc = 1;
3593                 break;
3594
3595         case XFS_IOC_UNRESVSP:
3596         case XFS_IOC_UNRESVSP64:
3597                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
3598                                                                 attr_flags)))
3599                         return error;
3600                 break;
3601
3602         case XFS_IOC_ALLOCSP:
3603         case XFS_IOC_ALLOCSP64:
3604         case XFS_IOC_FREESP:
3605         case XFS_IOC_FREESP64:
3606                 if (startoffset > fsize) {
3607                         error = xfs_alloc_file_space(ip, fsize,
3608                                         startoffset - fsize, 0, attr_flags);
3609                         if (error)
3610                                 break;
3611                 }
3612
3613                 iattr.ia_valid = ATTR_SIZE;
3614                 iattr.ia_size = startoffset;
3615
3616                 error = xfs_setattr(ip, &iattr, attr_flags, credp);
3617
3618                 if (error)
3619                         return error;
3620
3621                 clrprealloc = 1;
3622                 break;
3623
3624         default:
3625                 ASSERT(0);
3626                 return XFS_ERROR(EINVAL);
3627         }
3628
3629         /*
3630          * update the inode timestamp, mode, and prealloc flag bits
3631          */
3632         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
3633
3634         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
3635                                       0, 0, 0))) {
3636                 /* ASSERT(0); */
3637                 xfs_trans_cancel(tp, 0);
3638                 return error;
3639         }
3640
3641         xfs_ilock(ip, XFS_ILOCK_EXCL);
3642
3643         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3644         xfs_trans_ihold(tp, ip);
3645
3646         if ((attr_flags & XFS_ATTR_DMI) == 0) {
3647                 ip->i_d.di_mode &= ~S_ISUID;
3648
3649                 /*
3650                  * Note that we don't have to worry about mandatory
3651                  * file locking being disabled here because we only
3652                  * clear the S_ISGID bit if the Group execute bit is
3653                  * on, but if it was on then mandatory locking wouldn't
3654                  * have been enabled.
3655                  */
3656                 if (ip->i_d.di_mode & S_IXGRP)
3657                         ip->i_d.di_mode &= ~S_ISGID;
3658
3659                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3660         }
3661         if (setprealloc)
3662                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
3663         else if (clrprealloc)
3664                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
3665
3666         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3667         xfs_trans_set_sync(tp);
3668
3669         error = xfs_trans_commit(tp, 0);
3670
3671         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3672
3673         return error;
3674 }