]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blob - fs/xfs/xfs_vnodeops.c
[XFS] xfs_setattr currently doesn't just handle the attributes set through
[linux-2.6-omap-h63xx.git] / fs / xfs / xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h"
31 #include "xfs_da_btree.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h"
38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h"
40 #include "xfs_itable.h"
41 #include "xfs_btree.h"
42 #include "xfs_ialloc.h"
43 #include "xfs_alloc.h"
44 #include "xfs_bmap.h"
45 #include "xfs_attr.h"
46 #include "xfs_rw.h"
47 #include "xfs_error.h"
48 #include "xfs_quota.h"
49 #include "xfs_utils.h"
50 #include "xfs_rtalloc.h"
51 #include "xfs_trans_space.h"
52 #include "xfs_log_priv.h"
53 #include "xfs_filestream.h"
54 #include "xfs_vnodeops.h"
55
56 int
57 xfs_open(
58         xfs_inode_t     *ip)
59 {
60         int             mode;
61
62         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
63                 return XFS_ERROR(EIO);
64
65         /*
66          * If it's a directory with any blocks, read-ahead block 0
67          * as we're almost certain to have the next operation be a read there.
68          */
69         if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
70                 mode = xfs_ilock_map_shared(ip);
71                 if (ip->i_d.di_nextents > 0)
72                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
73                 xfs_iunlock(ip, mode);
74         }
75         return 0;
76 }
77
78 /*
79  * xfs_setattr
80  */
81 int
82 xfs_setattr(
83         xfs_inode_t             *ip,
84         bhv_vattr_t             *vap,
85         int                     flags,
86         cred_t                  *credp)
87 {
88         xfs_mount_t             *mp = ip->i_mount;
89         xfs_trans_t             *tp;
90         int                     mask;
91         int                     code;
92         uint                    lock_flags;
93         uint                    commit_flags=0;
94         uid_t                   uid=0, iuid=0;
95         gid_t                   gid=0, igid=0;
96         int                     timeflags = 0;
97         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
98         int                     file_owner;
99         int                     need_iolock = 1;
100
101         xfs_itrace_entry(ip);
102
103         if (mp->m_flags & XFS_MOUNT_RDONLY)
104                 return XFS_ERROR(EROFS);
105
106         /*
107          * Cannot set certain attributes.
108          */
109         mask = vap->va_mask;
110         if (mask & XFS_AT_NOSET) {
111                 return XFS_ERROR(EINVAL);
112         }
113
114         if (XFS_FORCED_SHUTDOWN(mp))
115                 return XFS_ERROR(EIO);
116
117         /*
118          * Timestamps do not need to be logged and hence do not
119          * need to be done within a transaction.
120          */
121         if (mask & XFS_AT_UPDTIMES) {
122                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
123                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
124                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
125                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
126                 xfs_ichgtime(ip, timeflags);
127                 return 0;
128         }
129
130         olddquot1 = olddquot2 = NULL;
131         udqp = gdqp = NULL;
132
133         /*
134          * If disk quotas is on, we make sure that the dquots do exist on disk,
135          * before we start any other transactions. Trying to do this later
136          * is messy. We don't care to take a readlock to look at the ids
137          * in inode here, because we can't hold it across the trans_reserve.
138          * If the IDs do change before we take the ilock, we're covered
139          * because the i_*dquot fields will get updated anyway.
140          */
141         if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) {
142                 uint    qflags = 0;
143
144                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
145                         uid = vap->va_uid;
146                         qflags |= XFS_QMOPT_UQUOTA;
147                 } else {
148                         uid = ip->i_d.di_uid;
149                 }
150                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
151                         gid = vap->va_gid;
152                         qflags |= XFS_QMOPT_GQUOTA;
153                 }  else {
154                         gid = ip->i_d.di_gid;
155                 }
156
157                 /*
158                  * We take a reference when we initialize udqp and gdqp,
159                  * so it is important that we never blindly double trip on
160                  * the same variable. See xfs_create() for an example.
161                  */
162                 ASSERT(udqp == NULL);
163                 ASSERT(gdqp == NULL);
164                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
165                                          qflags, &udqp, &gdqp);
166                 if (code)
167                         return code;
168         }
169
170         /*
171          * For the other attributes, we acquire the inode lock and
172          * first do an error checking pass.
173          */
174         tp = NULL;
175         lock_flags = XFS_ILOCK_EXCL;
176         if (flags & ATTR_NOLOCK)
177                 need_iolock = 0;
178         if (!(mask & XFS_AT_SIZE)) {
179                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
180                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
181                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
182                         commit_flags = 0;
183                         if ((code = xfs_trans_reserve(tp, 0,
184                                                      XFS_ICHANGE_LOG_RES(mp), 0,
185                                                      0, 0))) {
186                                 lock_flags = 0;
187                                 goto error_return;
188                         }
189                 }
190         } else {
191                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
192                     !(flags & ATTR_DMI)) {
193                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
194                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
195                                 vap->va_size, 0, dmflags, NULL);
196                         if (code) {
197                                 lock_flags = 0;
198                                 goto error_return;
199                         }
200                 }
201                 if (need_iolock)
202                         lock_flags |= XFS_IOLOCK_EXCL;
203         }
204
205         xfs_ilock(ip, lock_flags);
206
207         /* boolean: are we the file owner? */
208         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
209
210         /*
211          * Change various properties of a file.
212          * Only the owner or users with CAP_FOWNER
213          * capability may do these things.
214          */
215         if (mask & (XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID)) {
216                 /*
217                  * CAP_FOWNER overrides the following restrictions:
218                  *
219                  * The user ID of the calling process must be equal
220                  * to the file owner ID, except in cases where the
221                  * CAP_FSETID capability is applicable.
222                  */
223                 if (!file_owner && !capable(CAP_FOWNER)) {
224                         code = XFS_ERROR(EPERM);
225                         goto error_return;
226                 }
227
228                 /*
229                  * CAP_FSETID overrides the following restrictions:
230                  *
231                  * The effective user ID of the calling process shall match
232                  * the file owner when setting the set-user-ID and
233                  * set-group-ID bits on that file.
234                  *
235                  * The effective group ID or one of the supplementary group
236                  * IDs of the calling process shall match the group owner of
237                  * the file when setting the set-group-ID bit on that file
238                  */
239                 if (mask & XFS_AT_MODE) {
240                         mode_t m = 0;
241
242                         if ((vap->va_mode & S_ISUID) && !file_owner)
243                                 m |= S_ISUID;
244                         if ((vap->va_mode & S_ISGID) &&
245                             !in_group_p((gid_t)ip->i_d.di_gid))
246                                 m |= S_ISGID;
247 #if 0
248                         /* Linux allows this, Irix doesn't. */
249                         if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
250                                 m |= S_ISVTX;
251 #endif
252                         if (m && !capable(CAP_FSETID))
253                                 vap->va_mode &= ~m;
254                 }
255         }
256
257         /*
258          * Change file ownership.  Must be the owner or privileged.
259          * If the system was configured with the "restricted_chown"
260          * option, the owner is not permitted to give away the file,
261          * and can change the group id only to a group of which he
262          * or she is a member.
263          */
264         if (mask & (XFS_AT_UID|XFS_AT_GID)) {
265                 /*
266                  * These IDs could have changed since we last looked at them.
267                  * But, we're assured that if the ownership did change
268                  * while we didn't have the inode locked, inode's dquot(s)
269                  * would have changed also.
270                  */
271                 iuid = ip->i_d.di_uid;
272                 igid = ip->i_d.di_gid;
273                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
274                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
275
276                 /*
277                  * CAP_CHOWN overrides the following restrictions:
278                  *
279                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
280                  * shall override the restriction that a process cannot
281                  * change the user ID of a file it owns and the restriction
282                  * that the group ID supplied to the chown() function
283                  * shall be equal to either the group ID or one of the
284                  * supplementary group IDs of the calling process.
285                  */
286                 if (restricted_chown &&
287                     (iuid != uid || (igid != gid &&
288                                      !in_group_p((gid_t)gid))) &&
289                     !capable(CAP_CHOWN)) {
290                         code = XFS_ERROR(EPERM);
291                         goto error_return;
292                 }
293                 /*
294                  * Do a quota reservation only if uid/gid is actually
295                  * going to change.
296                  */
297                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
298                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
299                         ASSERT(tp);
300                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
301                                                 capable(CAP_FOWNER) ?
302                                                 XFS_QMOPT_FORCE_RES : 0);
303                         if (code)       /* out of quota */
304                                 goto error_return;
305                 }
306         }
307
308         /*
309          * Truncate file.  Must have write permission and not be a directory.
310          */
311         if (mask & XFS_AT_SIZE) {
312                 /* Short circuit the truncate case for zero length files */
313                 if ((vap->va_size == 0) &&
314                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
315                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
316                         lock_flags &= ~XFS_ILOCK_EXCL;
317                         if (mask & XFS_AT_CTIME)
318                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
319                         code = 0;
320                         goto error_return;
321                 }
322
323                 if (S_ISDIR(ip->i_d.di_mode)) {
324                         code = XFS_ERROR(EISDIR);
325                         goto error_return;
326                 } else if (!S_ISREG(ip->i_d.di_mode)) {
327                         code = XFS_ERROR(EINVAL);
328                         goto error_return;
329                 }
330                 /*
331                  * Make sure that the dquots are attached to the inode.
332                  */
333                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
334                         goto error_return;
335         }
336
337         /*
338          * Change file access or modified times.
339          */
340         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
341                 if (!file_owner) {
342                         if ((flags & ATTR_UTIME) &&
343                             !capable(CAP_FOWNER)) {
344                                 code = XFS_ERROR(EPERM);
345                                 goto error_return;
346                         }
347                 }
348         }
349
350         /*
351          * Now we can make the changes.  Before we join the inode
352          * to the transaction, if XFS_AT_SIZE is set then take care of
353          * the part of the truncation that must be done without the
354          * inode lock.  This needs to be done before joining the inode
355          * to the transaction, because the inode cannot be unlocked
356          * once it is a part of the transaction.
357          */
358         if (mask & XFS_AT_SIZE) {
359                 code = 0;
360                 if ((vap->va_size > ip->i_size) &&
361                     (flags & ATTR_NOSIZETOK) == 0) {
362                         /*
363                          * Do the first part of growing a file: zero any data
364                          * in the last block that is beyond the old EOF.  We
365                          * need to do this before the inode is joined to the
366                          * transaction to modify the i_size.
367                          */
368                         code = xfs_zero_eof(ip, vap->va_size, ip->i_size);
369                 }
370                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
371
372                 /*
373                  * We are going to log the inode size change in this
374                  * transaction so any previous writes that are beyond the on
375                  * disk EOF and the new EOF that have not been written out need
376                  * to be written here. If we do not write the data out, we
377                  * expose ourselves to the null files problem.
378                  *
379                  * Only flush from the on disk size to the smaller of the in
380                  * memory file size or the new size as that's the range we
381                  * really care about here and prevents waiting for other data
382                  * not within the range we care about here.
383                  */
384                 if (!code &&
385                     (ip->i_size != ip->i_d.di_size) &&
386                     (vap->va_size > ip->i_d.di_size)) {
387                         code = xfs_flush_pages(ip,
388                                         ip->i_d.di_size, vap->va_size,
389                                         XFS_B_ASYNC, FI_NONE);
390                 }
391
392                 /* wait for all I/O to complete */
393                 vn_iowait(ip);
394
395                 if (!code)
396                         code = xfs_itruncate_data(ip, vap->va_size);
397                 if (code) {
398                         ASSERT(tp == NULL);
399                         lock_flags &= ~XFS_ILOCK_EXCL;
400                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
401                         goto error_return;
402                 }
403                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
404                 if ((code = xfs_trans_reserve(tp, 0,
405                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
406                                              XFS_TRANS_PERM_LOG_RES,
407                                              XFS_ITRUNCATE_LOG_COUNT))) {
408                         xfs_trans_cancel(tp, 0);
409                         if (need_iolock)
410                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
411                         return code;
412                 }
413                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
414                 xfs_ilock(ip, XFS_ILOCK_EXCL);
415         }
416
417         if (tp) {
418                 xfs_trans_ijoin(tp, ip, lock_flags);
419                 xfs_trans_ihold(tp, ip);
420         }
421
422         /*
423          * Truncate file.  Must have write permission and not be a directory.
424          */
425         if (mask & XFS_AT_SIZE) {
426                 /*
427                  * Only change the c/mtime if we are changing the size
428                  * or we are explicitly asked to change it. This handles
429                  * the semantic difference between truncate() and ftruncate()
430                  * as implemented in the VFS.
431                  */
432                 if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
433                         timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
434
435                 if (vap->va_size > ip->i_size) {
436                         ip->i_d.di_size = vap->va_size;
437                         ip->i_size = vap->va_size;
438                         if (!(flags & ATTR_DMI))
439                                 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
440                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
441                 } else if ((vap->va_size <= ip->i_size) ||
442                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
443                         /*
444                          * signal a sync transaction unless
445                          * we're truncating an already unlinked
446                          * file on a wsync filesystem
447                          */
448                         code = xfs_itruncate_finish(&tp, ip,
449                                             (xfs_fsize_t)vap->va_size,
450                                             XFS_DATA_FORK,
451                                             ((ip->i_d.di_nlink != 0 ||
452                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
453                                              ? 1 : 0));
454                         if (code)
455                                 goto abort_return;
456                         /*
457                          * Truncated "down", so we're removing references
458                          * to old data here - if we now delay flushing for
459                          * a long time, we expose ourselves unduly to the
460                          * notorious NULL files problem.  So, we mark this
461                          * vnode and flush it when the file is closed, and
462                          * do not wait the usual (long) time for writeout.
463                          */
464                         xfs_iflags_set(ip, XFS_ITRUNCATED);
465                 }
466         }
467
468         /*
469          * Change file access modes.
470          */
471         if (mask & XFS_AT_MODE) {
472                 ip->i_d.di_mode &= S_IFMT;
473                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
474
475                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
476                 timeflags |= XFS_ICHGTIME_CHG;
477         }
478
479         /*
480          * Change file ownership.  Must be the owner or privileged.
481          * If the system was configured with the "restricted_chown"
482          * option, the owner is not permitted to give away the file,
483          * and can change the group id only to a group of which he
484          * or she is a member.
485          */
486         if (mask & (XFS_AT_UID|XFS_AT_GID)) {
487                 /*
488                  * CAP_FSETID overrides the following restrictions:
489                  *
490                  * The set-user-ID and set-group-ID bits of a file will be
491                  * cleared upon successful return from chown()
492                  */
493                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
494                     !capable(CAP_FSETID)) {
495                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
496                 }
497
498                 /*
499                  * Change the ownerships and register quota modifications
500                  * in the transaction.
501                  */
502                 if (iuid != uid) {
503                         if (XFS_IS_UQUOTA_ON(mp)) {
504                                 ASSERT(mask & XFS_AT_UID);
505                                 ASSERT(udqp);
506                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
507                                                         &ip->i_udquot, udqp);
508                         }
509                         ip->i_d.di_uid = uid;
510                 }
511                 if (igid != gid) {
512                         if (XFS_IS_GQUOTA_ON(mp)) {
513                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
514                                 ASSERT(mask & XFS_AT_GID);
515                                 ASSERT(gdqp);
516                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
517                                                         &ip->i_gdquot, gdqp);
518                         }
519                         ip->i_d.di_gid = gid;
520                 }
521
522                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
523                 timeflags |= XFS_ICHGTIME_CHG;
524         }
525
526
527         /*
528          * Change file access or modified times.
529          */
530         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
531                 if (mask & XFS_AT_ATIME) {
532                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
533                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
534                         ip->i_update_core = 1;
535                         timeflags &= ~XFS_ICHGTIME_ACC;
536                 }
537                 if (mask & XFS_AT_MTIME) {
538                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
539                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
540                         timeflags &= ~XFS_ICHGTIME_MOD;
541                         timeflags |= XFS_ICHGTIME_CHG;
542                 }
543                 if (tp && (flags & ATTR_UTIME))
544                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
545         }
546
547         /*
548          * Change file inode change time only if XFS_AT_CTIME set
549          * AND we have been called by a DMI function.
550          */
551
552         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
553                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
554                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
555                 ip->i_update_core = 1;
556                 timeflags &= ~XFS_ICHGTIME_CHG;
557         }
558
559         /*
560          * Send out timestamp changes that need to be set to the
561          * current time.  Not done when called by a DMI function.
562          */
563         if (timeflags && !(flags & ATTR_DMI))
564                 xfs_ichgtime(ip, timeflags);
565
566         XFS_STATS_INC(xs_ig_attrchg);
567
568         /*
569          * If this is a synchronous mount, make sure that the
570          * transaction goes to disk before returning to the user.
571          * This is slightly sub-optimal in that truncates require
572          * two sync transactions instead of one for wsync filesystems.
573          * One for the truncate and one for the timestamps since we
574          * don't want to change the timestamps unless we're sure the
575          * truncate worked.  Truncates are less than 1% of the laddis
576          * mix so this probably isn't worth the trouble to optimize.
577          */
578         code = 0;
579         if (tp) {
580                 if (mp->m_flags & XFS_MOUNT_WSYNC)
581                         xfs_trans_set_sync(tp);
582
583                 code = xfs_trans_commit(tp, commit_flags);
584         }
585
586         xfs_iunlock(ip, lock_flags);
587
588         /*
589          * Release any dquot(s) the inode had kept before chown.
590          */
591         XFS_QM_DQRELE(mp, olddquot1);
592         XFS_QM_DQRELE(mp, olddquot2);
593         XFS_QM_DQRELE(mp, udqp);
594         XFS_QM_DQRELE(mp, gdqp);
595
596         if (code) {
597                 return code;
598         }
599
600         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
601             !(flags & ATTR_DMI)) {
602                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
603                                         NULL, DM_RIGHT_NULL, NULL, NULL,
604                                         0, 0, AT_DELAY_FLAG(flags));
605         }
606         return 0;
607
608  abort_return:
609         commit_flags |= XFS_TRANS_ABORT;
610         /* FALLTHROUGH */
611  error_return:
612         XFS_QM_DQRELE(mp, udqp);
613         XFS_QM_DQRELE(mp, gdqp);
614         if (tp) {
615                 xfs_trans_cancel(tp, commit_flags);
616         }
617         if (lock_flags != 0) {
618                 xfs_iunlock(ip, lock_flags);
619         }
620         return code;
621 }
622
623 /*
624  * The maximum pathlen is 1024 bytes. Since the minimum file system
625  * blocksize is 512 bytes, we can get a max of 2 extents back from
626  * bmapi.
627  */
628 #define SYMLINK_MAPS 2
629
630 STATIC int
631 xfs_readlink_bmap(
632         xfs_inode_t     *ip,
633         char            *link)
634 {
635         xfs_mount_t     *mp = ip->i_mount;
636         int             pathlen = ip->i_d.di_size;
637         int             nmaps = SYMLINK_MAPS;
638         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
639         xfs_daddr_t     d;
640         int             byte_cnt;
641         int             n;
642         xfs_buf_t       *bp;
643         int             error = 0;
644
645         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
646                         mval, &nmaps, NULL, NULL);
647         if (error)
648                 goto out;
649
650         for (n = 0; n < nmaps; n++) {
651                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
652                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
653
654                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
655                 error = XFS_BUF_GETERROR(bp);
656                 if (error) {
657                         xfs_ioerror_alert("xfs_readlink",
658                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
659                         xfs_buf_relse(bp);
660                         goto out;
661                 }
662                 if (pathlen < byte_cnt)
663                         byte_cnt = pathlen;
664                 pathlen -= byte_cnt;
665
666                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
667                 xfs_buf_relse(bp);
668         }
669
670         link[ip->i_d.di_size] = '\0';
671         error = 0;
672
673  out:
674         return error;
675 }
676
677 int
678 xfs_readlink(
679         xfs_inode_t     *ip,
680         char            *link)
681 {
682         xfs_mount_t     *mp = ip->i_mount;
683         int             pathlen;
684         int             error = 0;
685
686         xfs_itrace_entry(ip);
687
688         if (XFS_FORCED_SHUTDOWN(mp))
689                 return XFS_ERROR(EIO);
690
691         xfs_ilock(ip, XFS_ILOCK_SHARED);
692
693         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
694         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
695
696         pathlen = ip->i_d.di_size;
697         if (!pathlen)
698                 goto out;
699
700         if (ip->i_df.if_flags & XFS_IFINLINE) {
701                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
702                 link[pathlen] = '\0';
703         } else {
704                 error = xfs_readlink_bmap(ip, link);
705         }
706
707  out:
708         xfs_iunlock(ip, XFS_ILOCK_SHARED);
709         return error;
710 }
711
712 /*
713  * xfs_fsync
714  *
715  * This is called to sync the inode and its data out to disk.  We need to hold
716  * the I/O lock while flushing the data, and the inode lock while flushing the
717  * inode.  The inode lock CANNOT be held while flushing the data, so acquire
718  * after we're done with that.
719  */
720 int
721 xfs_fsync(
722         xfs_inode_t     *ip)
723 {
724         xfs_trans_t     *tp;
725         int             error;
726         int             log_flushed = 0, changed = 1;
727
728         xfs_itrace_entry(ip);
729
730         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
731                 return XFS_ERROR(EIO);
732
733         /* capture size updates in I/O completion before writing the inode. */
734         error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
735         if (error)
736                 return XFS_ERROR(error);
737
738         /*
739          * We always need to make sure that the required inode state is safe on
740          * disk.  The vnode might be clean but we still might need to force the
741          * log because of committed transactions that haven't hit the disk yet.
742          * Likewise, there could be unflushed non-transactional changes to the
743          * inode core that have to go to disk and this requires us to issue
744          * a synchronous transaction to capture these changes correctly.
745          *
746          * This code relies on the assumption that if the update_* fields
747          * of the inode are clear and the inode is unpinned then it is clean
748          * and no action is required.
749          */
750         xfs_ilock(ip, XFS_ILOCK_SHARED);
751
752         if (!(ip->i_update_size || ip->i_update_core)) {
753                 /*
754                  * Timestamps/size haven't changed since last inode flush or
755                  * inode transaction commit.  That means either nothing got
756                  * written or a transaction committed which caught the updates.
757                  * If the latter happened and the transaction hasn't hit the
758                  * disk yet, the inode will be still be pinned.  If it is,
759                  * force the log.
760                  */
761
762                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
763
764                 if (xfs_ipincount(ip)) {
765                         error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
766                                       XFS_LOG_FORCE | XFS_LOG_SYNC,
767                                       &log_flushed);
768                 } else {
769                         /*
770                          * If the inode is not pinned and nothing has changed
771                          * we don't need to flush the cache.
772                          */
773                         changed = 0;
774                 }
775         } else  {
776                 /*
777                  * Kick off a transaction to log the inode core to get the
778                  * updates.  The sync transaction will also force the log.
779                  */
780                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
781                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
782                 error = xfs_trans_reserve(tp, 0,
783                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
784                 if (error) {
785                         xfs_trans_cancel(tp, 0);
786                         return error;
787                 }
788                 xfs_ilock(ip, XFS_ILOCK_EXCL);
789
790                 /*
791                  * Note - it's possible that we might have pushed ourselves out
792                  * of the way during trans_reserve which would flush the inode.
793                  * But there's no guarantee that the inode buffer has actually
794                  * gone out yet (it's delwri).  Plus the buffer could be pinned
795                  * anyway if it's part of an inode in another recent
796                  * transaction.  So we play it safe and fire off the
797                  * transaction anyway.
798                  */
799                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
800                 xfs_trans_ihold(tp, ip);
801                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
802                 xfs_trans_set_sync(tp);
803                 error = _xfs_trans_commit(tp, 0, &log_flushed);
804
805                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
806         }
807
808         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
809                 /*
810                  * If the log write didn't issue an ordered tag we need
811                  * to flush the disk cache for the data device now.
812                  */
813                 if (!log_flushed)
814                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
815
816                 /*
817                  * If this inode is on the RT dev we need to flush that
818                  * cache as well.
819                  */
820                 if (XFS_IS_REALTIME_INODE(ip))
821                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
822         }
823
824         return error;
825 }
826
827 /*
828  * This is called by xfs_inactive to free any blocks beyond eof
829  * when the link count isn't zero and by xfs_dm_punch_hole() when
830  * punching a hole to EOF.
831  */
832 int
833 xfs_free_eofblocks(
834         xfs_mount_t     *mp,
835         xfs_inode_t     *ip,
836         int             flags)
837 {
838         xfs_trans_t     *tp;
839         int             error;
840         xfs_fileoff_t   end_fsb;
841         xfs_fileoff_t   last_fsb;
842         xfs_filblks_t   map_len;
843         int             nimaps;
844         xfs_bmbt_irec_t imap;
845         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
846
847         /*
848          * Figure out if there are any blocks beyond the end
849          * of the file.  If not, then there is nothing to do.
850          */
851         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
852         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
853         map_len = last_fsb - end_fsb;
854         if (map_len <= 0)
855                 return 0;
856
857         nimaps = 1;
858         xfs_ilock(ip, XFS_ILOCK_SHARED);
859         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
860                           NULL, 0, &imap, &nimaps, NULL, NULL);
861         xfs_iunlock(ip, XFS_ILOCK_SHARED);
862
863         if (!error && (nimaps != 0) &&
864             (imap.br_startblock != HOLESTARTBLOCK ||
865              ip->i_delayed_blks)) {
866                 /*
867                  * Attach the dquots to the inode up front.
868                  */
869                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
870                         return error;
871
872                 /*
873                  * There are blocks after the end of file.
874                  * Free them up now by truncating the file to
875                  * its current size.
876                  */
877                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
878
879                 /*
880                  * Do the xfs_itruncate_start() call before
881                  * reserving any log space because
882                  * itruncate_start will call into the buffer
883                  * cache and we can't
884                  * do that within a transaction.
885                  */
886                 if (use_iolock)
887                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
888                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
889                                     ip->i_size);
890                 if (error) {
891                         xfs_trans_cancel(tp, 0);
892                         if (use_iolock)
893                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
894                         return error;
895                 }
896
897                 error = xfs_trans_reserve(tp, 0,
898                                           XFS_ITRUNCATE_LOG_RES(mp),
899                                           0, XFS_TRANS_PERM_LOG_RES,
900                                           XFS_ITRUNCATE_LOG_COUNT);
901                 if (error) {
902                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
903                         xfs_trans_cancel(tp, 0);
904                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
905                         return error;
906                 }
907
908                 xfs_ilock(ip, XFS_ILOCK_EXCL);
909                 xfs_trans_ijoin(tp, ip,
910                                 XFS_IOLOCK_EXCL |
911                                 XFS_ILOCK_EXCL);
912                 xfs_trans_ihold(tp, ip);
913
914                 error = xfs_itruncate_finish(&tp, ip,
915                                              ip->i_size,
916                                              XFS_DATA_FORK,
917                                              0);
918                 /*
919                  * If we get an error at this point we
920                  * simply don't bother truncating the file.
921                  */
922                 if (error) {
923                         xfs_trans_cancel(tp,
924                                          (XFS_TRANS_RELEASE_LOG_RES |
925                                           XFS_TRANS_ABORT));
926                 } else {
927                         error = xfs_trans_commit(tp,
928                                                 XFS_TRANS_RELEASE_LOG_RES);
929                 }
930                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
931                                             : XFS_ILOCK_EXCL));
932         }
933         return error;
934 }
935
936 /*
937  * Free a symlink that has blocks associated with it.
938  */
939 STATIC int
940 xfs_inactive_symlink_rmt(
941         xfs_inode_t     *ip,
942         xfs_trans_t     **tpp)
943 {
944         xfs_buf_t       *bp;
945         int             committed;
946         int             done;
947         int             error;
948         xfs_fsblock_t   first_block;
949         xfs_bmap_free_t free_list;
950         int             i;
951         xfs_mount_t     *mp;
952         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
953         int             nmaps;
954         xfs_trans_t     *ntp;
955         int             size;
956         xfs_trans_t     *tp;
957
958         tp = *tpp;
959         mp = ip->i_mount;
960         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
961         /*
962          * We're freeing a symlink that has some
963          * blocks allocated to it.  Free the
964          * blocks here.  We know that we've got
965          * either 1 or 2 extents and that we can
966          * free them all in one bunmapi call.
967          */
968         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
969         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
970                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
971                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
972                 xfs_trans_cancel(tp, 0);
973                 *tpp = NULL;
974                 return error;
975         }
976         /*
977          * Lock the inode, fix the size, and join it to the transaction.
978          * Hold it so in the normal path, we still have it locked for
979          * the second transaction.  In the error paths we need it
980          * held so the cancel won't rele it, see below.
981          */
982         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
983         size = (int)ip->i_d.di_size;
984         ip->i_d.di_size = 0;
985         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
986         xfs_trans_ihold(tp, ip);
987         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
988         /*
989          * Find the block(s) so we can inval and unmap them.
990          */
991         done = 0;
992         XFS_BMAP_INIT(&free_list, &first_block);
993         nmaps = ARRAY_SIZE(mval);
994         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
995                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
996                         &free_list, NULL)))
997                 goto error0;
998         /*
999          * Invalidate the block(s).
1000          */
1001         for (i = 0; i < nmaps; i++) {
1002                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1003                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1004                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1005                 xfs_trans_binval(tp, bp);
1006         }
1007         /*
1008          * Unmap the dead block(s) to the free_list.
1009          */
1010         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1011                         &first_block, &free_list, NULL, &done)))
1012                 goto error1;
1013         ASSERT(done);
1014         /*
1015          * Commit the first transaction.  This logs the EFI and the inode.
1016          */
1017         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1018                 goto error1;
1019         /*
1020          * The transaction must have been committed, since there were
1021          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1022          * The new tp has the extent freeing and EFDs.
1023          */
1024         ASSERT(committed);
1025         /*
1026          * The first xact was committed, so add the inode to the new one.
1027          * Mark it dirty so it will be logged and moved forward in the log as
1028          * part of every commit.
1029          */
1030         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1031         xfs_trans_ihold(tp, ip);
1032         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1033         /*
1034          * Get a new, empty transaction to return to our caller.
1035          */
1036         ntp = xfs_trans_dup(tp);
1037         /*
1038          * Commit the transaction containing extent freeing and EFDs.
1039          * If we get an error on the commit here or on the reserve below,
1040          * we need to unlock the inode since the new transaction doesn't
1041          * have the inode attached.
1042          */
1043         error = xfs_trans_commit(tp, 0);
1044         tp = ntp;
1045         if (error) {
1046                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1047                 goto error0;
1048         }
1049         /*
1050          * Remove the memory for extent descriptions (just bookkeeping).
1051          */
1052         if (ip->i_df.if_bytes)
1053                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1054         ASSERT(ip->i_df.if_bytes == 0);
1055         /*
1056          * Put an itruncate log reservation in the new transaction
1057          * for our caller.
1058          */
1059         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1060                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1061                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1062                 goto error0;
1063         }
1064         /*
1065          * Return with the inode locked but not joined to the transaction.
1066          */
1067         *tpp = tp;
1068         return 0;
1069
1070  error1:
1071         xfs_bmap_cancel(&free_list);
1072  error0:
1073         /*
1074          * Have to come here with the inode locked and either
1075          * (held and in the transaction) or (not in the transaction).
1076          * If the inode isn't held then cancel would iput it, but
1077          * that's wrong since this is inactive and the vnode ref
1078          * count is 0 already.
1079          * Cancel won't do anything to the inode if held, but it still
1080          * needs to be locked until the cancel is done, if it was
1081          * joined to the transaction.
1082          */
1083         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1084         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1085         *tpp = NULL;
1086         return error;
1087
1088 }
1089
1090 STATIC int
1091 xfs_inactive_symlink_local(
1092         xfs_inode_t     *ip,
1093         xfs_trans_t     **tpp)
1094 {
1095         int             error;
1096
1097         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1098         /*
1099          * We're freeing a symlink which fit into
1100          * the inode.  Just free the memory used
1101          * to hold the old symlink.
1102          */
1103         error = xfs_trans_reserve(*tpp, 0,
1104                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1105                                   0, XFS_TRANS_PERM_LOG_RES,
1106                                   XFS_ITRUNCATE_LOG_COUNT);
1107
1108         if (error) {
1109                 xfs_trans_cancel(*tpp, 0);
1110                 *tpp = NULL;
1111                 return error;
1112         }
1113         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1114
1115         /*
1116          * Zero length symlinks _can_ exist.
1117          */
1118         if (ip->i_df.if_bytes > 0) {
1119                 xfs_idata_realloc(ip,
1120                                   -(ip->i_df.if_bytes),
1121                                   XFS_DATA_FORK);
1122                 ASSERT(ip->i_df.if_bytes == 0);
1123         }
1124         return 0;
1125 }
1126
1127 STATIC int
1128 xfs_inactive_attrs(
1129         xfs_inode_t     *ip,
1130         xfs_trans_t     **tpp)
1131 {
1132         xfs_trans_t     *tp;
1133         int             error;
1134         xfs_mount_t     *mp;
1135
1136         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1137         tp = *tpp;
1138         mp = ip->i_mount;
1139         ASSERT(ip->i_d.di_forkoff != 0);
1140         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1141         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1142         if (error)
1143                 goto error_unlock;
1144
1145         error = xfs_attr_inactive(ip);
1146         if (error)
1147                 goto error_unlock;
1148
1149         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1150         error = xfs_trans_reserve(tp, 0,
1151                                   XFS_IFREE_LOG_RES(mp),
1152                                   0, XFS_TRANS_PERM_LOG_RES,
1153                                   XFS_INACTIVE_LOG_COUNT);
1154         if (error)
1155                 goto error_cancel;
1156
1157         xfs_ilock(ip, XFS_ILOCK_EXCL);
1158         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1159         xfs_trans_ihold(tp, ip);
1160         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1161
1162         ASSERT(ip->i_d.di_anextents == 0);
1163
1164         *tpp = tp;
1165         return 0;
1166
1167 error_cancel:
1168         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1169         xfs_trans_cancel(tp, 0);
1170 error_unlock:
1171         *tpp = NULL;
1172         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1173         return error;
1174 }
1175
1176 int
1177 xfs_release(
1178         xfs_inode_t     *ip)
1179 {
1180         bhv_vnode_t     *vp = XFS_ITOV(ip);
1181         xfs_mount_t     *mp = ip->i_mount;
1182         int             error;
1183
1184         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1185                 return 0;
1186
1187         /* If this is a read-only mount, don't do this (would generate I/O) */
1188         if (mp->m_flags & XFS_MOUNT_RDONLY)
1189                 return 0;
1190
1191         if (!XFS_FORCED_SHUTDOWN(mp)) {
1192                 int truncated;
1193
1194                 /*
1195                  * If we are using filestreams, and we have an unlinked
1196                  * file that we are processing the last close on, then nothing
1197                  * will be able to reopen and write to this file. Purge this
1198                  * inode from the filestreams cache so that it doesn't delay
1199                  * teardown of the inode.
1200                  */
1201                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1202                         xfs_filestream_deassociate(ip);
1203
1204                 /*
1205                  * If we previously truncated this file and removed old data
1206                  * in the process, we want to initiate "early" writeout on
1207                  * the last close.  This is an attempt to combat the notorious
1208                  * NULL files problem which is particularly noticable from a
1209                  * truncate down, buffered (re-)write (delalloc), followed by
1210                  * a crash.  What we are effectively doing here is
1211                  * significantly reducing the time window where we'd otherwise
1212                  * be exposed to that problem.
1213                  */
1214                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1215                 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1216                         xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1217         }
1218
1219         if (ip->i_d.di_nlink != 0) {
1220                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1221                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1222                        ip->i_delayed_blks > 0)) &&
1223                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1224                     (!(ip->i_d.di_flags &
1225                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1226                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1227                         if (error)
1228                                 return error;
1229                 }
1230         }
1231
1232         return 0;
1233 }
1234
1235 /*
1236  * xfs_inactive
1237  *
1238  * This is called when the vnode reference count for the vnode
1239  * goes to zero.  If the file has been unlinked, then it must
1240  * now be truncated.  Also, we clear all of the read-ahead state
1241  * kept for the inode here since the file is now closed.
1242  */
1243 int
1244 xfs_inactive(
1245         xfs_inode_t     *ip)
1246 {
1247         bhv_vnode_t     *vp = XFS_ITOV(ip);
1248         xfs_bmap_free_t free_list;
1249         xfs_fsblock_t   first_block;
1250         int             committed;
1251         xfs_trans_t     *tp;
1252         xfs_mount_t     *mp;
1253         int             error;
1254         int             truncate;
1255
1256         xfs_itrace_entry(ip);
1257
1258         /*
1259          * If the inode is already free, then there can be nothing
1260          * to clean up here.
1261          */
1262         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1263                 ASSERT(ip->i_df.if_real_bytes == 0);
1264                 ASSERT(ip->i_df.if_broot_bytes == 0);
1265                 return VN_INACTIVE_CACHE;
1266         }
1267
1268         /*
1269          * Only do a truncate if it's a regular file with
1270          * some actual space in it.  It's OK to look at the
1271          * inode's fields without the lock because we're the
1272          * only one with a reference to the inode.
1273          */
1274         truncate = ((ip->i_d.di_nlink == 0) &&
1275             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1276              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1277             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1278
1279         mp = ip->i_mount;
1280
1281         if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
1282                 XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
1283
1284         error = 0;
1285
1286         /* If this is a read-only mount, don't do this (would generate I/O) */
1287         if (mp->m_flags & XFS_MOUNT_RDONLY)
1288                 goto out;
1289
1290         if (ip->i_d.di_nlink != 0) {
1291                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1292                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1293                        ip->i_delayed_blks > 0)) &&
1294                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1295                      (!(ip->i_d.di_flags &
1296                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1297                       (ip->i_delayed_blks != 0)))) {
1298                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1299                         if (error)
1300                                 return VN_INACTIVE_CACHE;
1301                 }
1302                 goto out;
1303         }
1304
1305         ASSERT(ip->i_d.di_nlink == 0);
1306
1307         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1308                 return VN_INACTIVE_CACHE;
1309
1310         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1311         if (truncate) {
1312                 /*
1313                  * Do the xfs_itruncate_start() call before
1314                  * reserving any log space because itruncate_start
1315                  * will call into the buffer cache and we can't
1316                  * do that within a transaction.
1317                  */
1318                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1319
1320                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1321                 if (error) {
1322                         xfs_trans_cancel(tp, 0);
1323                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1324                         return VN_INACTIVE_CACHE;
1325                 }
1326
1327                 error = xfs_trans_reserve(tp, 0,
1328                                           XFS_ITRUNCATE_LOG_RES(mp),
1329                                           0, XFS_TRANS_PERM_LOG_RES,
1330                                           XFS_ITRUNCATE_LOG_COUNT);
1331                 if (error) {
1332                         /* Don't call itruncate_cleanup */
1333                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1334                         xfs_trans_cancel(tp, 0);
1335                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1336                         return VN_INACTIVE_CACHE;
1337                 }
1338
1339                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1340                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1341                 xfs_trans_ihold(tp, ip);
1342
1343                 /*
1344                  * normally, we have to run xfs_itruncate_finish sync.
1345                  * But if filesystem is wsync and we're in the inactive
1346                  * path, then we know that nlink == 0, and that the
1347                  * xaction that made nlink == 0 is permanently committed
1348                  * since xfs_remove runs as a synchronous transaction.
1349                  */
1350                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1351                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1352
1353                 if (error) {
1354                         xfs_trans_cancel(tp,
1355                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1356                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1357                         return VN_INACTIVE_CACHE;
1358                 }
1359         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1360
1361                 /*
1362                  * If we get an error while cleaning up a
1363                  * symlink we bail out.
1364                  */
1365                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1366                         xfs_inactive_symlink_rmt(ip, &tp) :
1367                         xfs_inactive_symlink_local(ip, &tp);
1368
1369                 if (error) {
1370                         ASSERT(tp == NULL);
1371                         return VN_INACTIVE_CACHE;
1372                 }
1373
1374                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1375                 xfs_trans_ihold(tp, ip);
1376         } else {
1377                 error = xfs_trans_reserve(tp, 0,
1378                                           XFS_IFREE_LOG_RES(mp),
1379                                           0, XFS_TRANS_PERM_LOG_RES,
1380                                           XFS_INACTIVE_LOG_COUNT);
1381                 if (error) {
1382                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1383                         xfs_trans_cancel(tp, 0);
1384                         return VN_INACTIVE_CACHE;
1385                 }
1386
1387                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1388                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1389                 xfs_trans_ihold(tp, ip);
1390         }
1391
1392         /*
1393          * If there are attributes associated with the file
1394          * then blow them away now.  The code calls a routine
1395          * that recursively deconstructs the attribute fork.
1396          * We need to just commit the current transaction
1397          * because we can't use it for xfs_attr_inactive().
1398          */
1399         if (ip->i_d.di_anextents > 0) {
1400                 error = xfs_inactive_attrs(ip, &tp);
1401                 /*
1402                  * If we got an error, the transaction is already
1403                  * cancelled, and the inode is unlocked. Just get out.
1404                  */
1405                  if (error)
1406                          return VN_INACTIVE_CACHE;
1407         } else if (ip->i_afp) {
1408                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1409         }
1410
1411         /*
1412          * Free the inode.
1413          */
1414         XFS_BMAP_INIT(&free_list, &first_block);
1415         error = xfs_ifree(tp, ip, &free_list);
1416         if (error) {
1417                 /*
1418                  * If we fail to free the inode, shut down.  The cancel
1419                  * might do that, we need to make sure.  Otherwise the
1420                  * inode might be lost for a long time or forever.
1421                  */
1422                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1423                         cmn_err(CE_NOTE,
1424                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1425                                 error, mp->m_fsname);
1426                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1427                 }
1428                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1429         } else {
1430                 /*
1431                  * Credit the quota account(s). The inode is gone.
1432                  */
1433                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1434
1435                 /*
1436                  * Just ignore errors at this point.  There is nothing we can
1437                  * do except to try to keep going. Make sure it's not a silent
1438                  * error.
1439                  */
1440                 error = xfs_bmap_finish(&tp,  &free_list, &committed);
1441                 if (error)
1442                         xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1443                                 "xfs_bmap_finish() returned error %d", error);
1444                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1445                 if (error)
1446                         xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1447                                 "xfs_trans_commit() returned error %d", error);
1448         }
1449         /*
1450          * Release the dquots held by inode, if any.
1451          */
1452         XFS_QM_DQDETACH(mp, ip);
1453
1454         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1455
1456  out:
1457         return VN_INACTIVE_CACHE;
1458 }
1459
1460 /*
1461  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
1462  * is allowed, otherwise it has to be an exact match. If a CI match is found,
1463  * ci_name->name will point to a the actual name (caller must free) or
1464  * will be set to NULL if an exact match is found.
1465  */
1466 int
1467 xfs_lookup(
1468         xfs_inode_t             *dp,
1469         struct xfs_name         *name,
1470         xfs_inode_t             **ipp,
1471         struct xfs_name         *ci_name)
1472 {
1473         xfs_ino_t               inum;
1474         int                     error;
1475         uint                    lock_mode;
1476
1477         xfs_itrace_entry(dp);
1478
1479         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1480                 return XFS_ERROR(EIO);
1481
1482         lock_mode = xfs_ilock_map_shared(dp);
1483         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
1484         xfs_iunlock_map_shared(dp, lock_mode);
1485
1486         if (error)
1487                 goto out;
1488
1489         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
1490         if (error)
1491                 goto out_free_name;
1492
1493         xfs_itrace_ref(*ipp);
1494         return 0;
1495
1496 out_free_name:
1497         if (ci_name)
1498                 kmem_free(ci_name->name);
1499 out:
1500         *ipp = NULL;
1501         return error;
1502 }
1503
1504 int
1505 xfs_create(
1506         xfs_inode_t             *dp,
1507         struct xfs_name         *name,
1508         mode_t                  mode,
1509         xfs_dev_t               rdev,
1510         xfs_inode_t             **ipp,
1511         cred_t                  *credp)
1512 {
1513         xfs_mount_t             *mp = dp->i_mount;
1514         xfs_inode_t             *ip;
1515         xfs_trans_t             *tp;
1516         int                     error;
1517         xfs_bmap_free_t         free_list;
1518         xfs_fsblock_t           first_block;
1519         boolean_t               unlock_dp_on_error = B_FALSE;
1520         int                     dm_event_sent = 0;
1521         uint                    cancel_flags;
1522         int                     committed;
1523         xfs_prid_t              prid;
1524         struct xfs_dquot        *udqp, *gdqp;
1525         uint                    resblks;
1526
1527         ASSERT(!*ipp);
1528         xfs_itrace_entry(dp);
1529
1530         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1531                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1532                                 dp, DM_RIGHT_NULL, NULL,
1533                                 DM_RIGHT_NULL, name->name, NULL,
1534                                 mode, 0, 0);
1535
1536                 if (error)
1537                         return error;
1538                 dm_event_sent = 1;
1539         }
1540
1541         if (XFS_FORCED_SHUTDOWN(mp))
1542                 return XFS_ERROR(EIO);
1543
1544         /* Return through std_return after this point. */
1545
1546         udqp = gdqp = NULL;
1547         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1548                 prid = dp->i_d.di_projid;
1549         else
1550                 prid = (xfs_prid_t)dfltprid;
1551
1552         /*
1553          * Make sure that we have allocated dquot(s) on disk.
1554          */
1555         error = XFS_QM_DQVOPALLOC(mp, dp,
1556                         current_fsuid(credp), current_fsgid(credp), prid,
1557                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1558         if (error)
1559                 goto std_return;
1560
1561         ip = NULL;
1562
1563         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1564         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1565         resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1566         /*
1567          * Initially assume that the file does not exist and
1568          * reserve the resources for that case.  If that is not
1569          * the case we'll drop the one we have and get a more
1570          * appropriate transaction later.
1571          */
1572         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1573                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1574         if (error == ENOSPC) {
1575                 resblks = 0;
1576                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1577                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1578         }
1579         if (error) {
1580                 cancel_flags = 0;
1581                 goto error_return;
1582         }
1583
1584         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1585         unlock_dp_on_error = B_TRUE;
1586
1587         XFS_BMAP_INIT(&free_list, &first_block);
1588
1589         ASSERT(ip == NULL);
1590
1591         /*
1592          * Reserve disk quota and the inode.
1593          */
1594         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1595         if (error)
1596                 goto error_return;
1597
1598         error = xfs_dir_canenter(tp, dp, name, resblks);
1599         if (error)
1600                 goto error_return;
1601         error = xfs_dir_ialloc(&tp, dp, mode, 1,
1602                         rdev, credp, prid, resblks > 0,
1603                         &ip, &committed);
1604         if (error) {
1605                 if (error == ENOSPC)
1606                         goto error_return;
1607                 goto abort_return;
1608         }
1609         xfs_itrace_ref(ip);
1610
1611         /*
1612          * At this point, we've gotten a newly allocated inode.
1613          * It is locked (and joined to the transaction).
1614          */
1615
1616         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1617
1618         /*
1619          * Now we join the directory inode to the transaction.  We do not do it
1620          * earlier because xfs_dir_ialloc might commit the previous transaction
1621          * (and release all the locks).  An error from here on will result in
1622          * the transaction cancel unlocking dp so don't do it explicitly in the
1623          * error path.
1624          */
1625         IHOLD(dp);
1626         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1627         unlock_dp_on_error = B_FALSE;
1628
1629         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1630                                         &first_block, &free_list, resblks ?
1631                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1632         if (error) {
1633                 ASSERT(error != ENOSPC);
1634                 goto abort_return;
1635         }
1636         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1637         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1638
1639         /*
1640          * If this is a synchronous mount, make sure that the
1641          * create transaction goes to disk before returning to
1642          * the user.
1643          */
1644         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1645                 xfs_trans_set_sync(tp);
1646         }
1647
1648         dp->i_gen++;
1649
1650         /*
1651          * Attach the dquot(s) to the inodes and modify them incore.
1652          * These ids of the inode couldn't have changed since the new
1653          * inode has been locked ever since it was created.
1654          */
1655         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
1656
1657         /*
1658          * xfs_trans_commit normally decrements the vnode ref count
1659          * when it unlocks the inode. Since we want to return the
1660          * vnode to the caller, we bump the vnode ref count now.
1661          */
1662         IHOLD(ip);
1663
1664         error = xfs_bmap_finish(&tp, &free_list, &committed);
1665         if (error) {
1666                 xfs_bmap_cancel(&free_list);
1667                 goto abort_rele;
1668         }
1669
1670         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1671         if (error) {
1672                 IRELE(ip);
1673                 tp = NULL;
1674                 goto error_return;
1675         }
1676
1677         XFS_QM_DQRELE(mp, udqp);
1678         XFS_QM_DQRELE(mp, gdqp);
1679
1680         *ipp = ip;
1681
1682         /* Fallthrough to std_return with error = 0  */
1683
1684 std_return:
1685         if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
1686             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1687                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
1688                         dp, DM_RIGHT_NULL,
1689                         *ipp ? ip : NULL,
1690                         DM_RIGHT_NULL, name->name, NULL,
1691                         mode, error, 0);
1692         }
1693         return error;
1694
1695  abort_return:
1696         cancel_flags |= XFS_TRANS_ABORT;
1697         /* FALLTHROUGH */
1698
1699  error_return:
1700         if (tp != NULL)
1701                 xfs_trans_cancel(tp, cancel_flags);
1702
1703         XFS_QM_DQRELE(mp, udqp);
1704         XFS_QM_DQRELE(mp, gdqp);
1705
1706         if (unlock_dp_on_error)
1707                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1708
1709         goto std_return;
1710
1711  abort_rele:
1712         /*
1713          * Wait until after the current transaction is aborted to
1714          * release the inode.  This prevents recursive transactions
1715          * and deadlocks from xfs_inactive.
1716          */
1717         cancel_flags |= XFS_TRANS_ABORT;
1718         xfs_trans_cancel(tp, cancel_flags);
1719         IRELE(ip);
1720
1721         XFS_QM_DQRELE(mp, udqp);
1722         XFS_QM_DQRELE(mp, gdqp);
1723
1724         goto std_return;
1725 }
1726
#ifdef DEBUG
/*
 * Debug-only statistics for the deadlock-avoidance path in
 * xfs_lock_dir_and_entry().
 */
int xfs_rm_locks;               /* calls to xfs_lock_dir_and_entry() */
int xfs_rm_attempts;            /* failed no-wait lock attempts on the entry */
int xfs_rm_lock_delays;         /* times we slept before retrying */
#endif
1737
1738 /*
1739  * The following routine will lock the inodes associated with the
1740  * directory and the named entry in the directory. The locks are
1741  * acquired in increasing inode number.
1742  *
1743  * If the entry is "..", then only the directory is locked. The
1744  * vnode ref count will still include that from the .. entry in
1745  * this case.
1746  *
1747  * There is a deadlock we need to worry about. If the locked directory is
1748  * in the AIL, it might be blocking up the log. The next inode we lock
1749  * could be already locked by another thread waiting for log space (e.g
1750  * a permanent log reservation with a long running transaction (see
1751  * xfs_itruncate_finish)). To solve this, we must check if the directory
1752  * is in the ail and use lock_nowait. If we can't lock, we need to
1753  * drop the inode lock on the directory and try again. xfs_iunlock will
1754  * potentially push the tail if we were holding up the log.
1755  */
1756 STATIC int
1757 xfs_lock_dir_and_entry(
1758         xfs_inode_t     *dp,
1759         xfs_inode_t     *ip)    /* inode of entry 'name' */
1760 {
1761         int             attempts;
1762         xfs_ino_t       e_inum;
1763         xfs_inode_t     *ips[2];
1764         xfs_log_item_t  *lp;
1765
1766 #ifdef DEBUG
1767         xfs_rm_locks++;
1768 #endif
1769         attempts = 0;
1770
1771 again:
1772         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1773
1774         e_inum = ip->i_ino;
1775
1776         xfs_itrace_ref(ip);
1777
1778         /*
1779          * We want to lock in increasing inum. Since we've already
1780          * acquired the lock on the directory, we may need to release
1781          * if if the inum of the entry turns out to be less.
1782          */
1783         if (e_inum > dp->i_ino) {
1784                 /*
1785                  * We are already in the right order, so just
1786                  * lock on the inode of the entry.
1787                  * We need to use nowait if dp is in the AIL.
1788                  */
1789
1790                 lp = (xfs_log_item_t *)dp->i_itemp;
1791                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1792                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1793                                 attempts++;
1794 #ifdef DEBUG
1795                                 xfs_rm_attempts++;
1796 #endif
1797
1798                                 /*
1799                                  * Unlock dp and try again.
1800                                  * xfs_iunlock will try to push the tail
1801                                  * if the inode is in the AIL.
1802                                  */
1803
1804                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1805
1806                                 if ((attempts % 5) == 0) {
1807                                         delay(1); /* Don't just spin the CPU */
1808 #ifdef DEBUG
1809                                         xfs_rm_lock_delays++;
1810 #endif
1811                                 }
1812                                 goto again;
1813                         }
1814                 } else {
1815                         xfs_ilock(ip, XFS_ILOCK_EXCL);
1816                 }
1817         } else if (e_inum < dp->i_ino) {
1818                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1819
1820                 ips[0] = ip;
1821                 ips[1] = dp;
1822                 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
1823         }
1824         /* else  e_inum == dp->i_ino */
1825         /*     This can happen if we're asked to lock /x/..
1826          *     the entry is "..", which is also the parent directory.
1827          */
1828
1829         return 0;
1830 }
1831
#ifdef DEBUG
/*
 * Debug-only statistics maintained by xfs_lock_inodes().
 */
int xfs_locked_n;               /* calls that needed no retry */
int xfs_small_retries;          /* calls that retried fewer than 5 times */
int xfs_middle_retries;         /* calls that retried 5 to 99 times */
int xfs_lots_retries;           /* calls that retried 100 times or more */
int xfs_lock_delays;            /* times we slept before retrying */
#endif
1839
1840 /*
1841  * Bump the subclass so xfs_lock_inodes() acquires each lock with
1842  * a different value
1843  */
1844 static inline int
1845 xfs_lock_inumorder(int lock_mode, int subclass)
1846 {
1847         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1848                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
1849         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
1850                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
1851
1852         return lock_mode;
1853 }
1854
1855 /*
1856  * The following routine will lock n inodes in exclusive mode.
1857  * We assume the caller calls us with the inodes in i_ino order.
1858  *
1859  * We need to detect deadlock where an inode that we lock
1860  * is in the AIL and we start waiting for another inode that is locked
1861  * by a thread in a long running transaction (such as truncate). This can
1862  * result in deadlock since the long running trans might need to wait
1863  * for the inode we just locked in order to push the tail and free space
1864  * in the log.
1865  */
1866 void
1867 xfs_lock_inodes(
1868         xfs_inode_t     **ips,
1869         int             inodes,
1870         uint            lock_mode)
1871 {
1872         int             attempts = 0, i, j, try_lock;
1873         xfs_log_item_t  *lp;
1874
1875         ASSERT(ips && (inodes >= 2)); /* we need at least two */
1876
1877         try_lock = 0;
1878         i = 0;
1879
1880 again:
1881         for (; i < inodes; i++) {
1882                 ASSERT(ips[i]);
1883
1884                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
1885                         continue;
1886
1887                 /*
1888                  * If try_lock is not set yet, make sure all locked inodes
1889                  * are not in the AIL.
1890                  * If any are, set try_lock to be used later.
1891                  */
1892
1893                 if (!try_lock) {
1894                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
1895                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
1896                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1897                                         try_lock++;
1898                                 }
1899                         }
1900                 }
1901
1902                 /*
1903                  * If any of the previous locks we have locked is in the AIL,
1904                  * we must TRY to get the second and subsequent locks. If
1905                  * we can't get any, we must release all we have
1906                  * and try again.
1907                  */
1908
1909                 if (try_lock) {
1910                         /* try_lock must be 0 if i is 0. */
1911                         /*
1912                          * try_lock means we have an inode locked
1913                          * that is in the AIL.
1914                          */
1915                         ASSERT(i != 0);
1916                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
1917                                 attempts++;
1918
1919                                 /*
1920                                  * Unlock all previous guys and try again.
1921                                  * xfs_iunlock will try to push the tail
1922                                  * if the inode is in the AIL.
1923                                  */
1924
1925                                 for(j = i - 1; j >= 0; j--) {
1926
1927                                         /*
1928                                          * Check to see if we've already
1929                                          * unlocked this one.
1930                                          * Not the first one going back,
1931                                          * and the inode ptr is the same.
1932                                          */
1933                                         if ((j != (i - 1)) && ips[j] ==
1934                                                                 ips[j+1])
1935                                                 continue;
1936
1937                                         xfs_iunlock(ips[j], lock_mode);
1938                                 }
1939
1940                                 if ((attempts % 5) == 0) {
1941                                         delay(1); /* Don't just spin the CPU */
1942 #ifdef DEBUG
1943                                         xfs_lock_delays++;
1944 #endif
1945                                 }
1946                                 i = 0;
1947                                 try_lock = 0;
1948                                 goto again;
1949                         }
1950                 } else {
1951                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
1952                 }
1953         }
1954
1955 #ifdef DEBUG
1956         if (attempts) {
1957                 if (attempts < 5) xfs_small_retries++;
1958                 else if (attempts < 100) xfs_middle_retries++;
1959                 else xfs_lots_retries++;
1960         } else {
1961                 xfs_locked_n++;
1962         }
1963 #endif
1964 }
1965
1966 int
1967 xfs_remove(
1968         xfs_inode_t             *dp,
1969         struct xfs_name         *name,
1970         xfs_inode_t             *ip)
1971 {
1972         xfs_mount_t             *mp = dp->i_mount;
1973         xfs_trans_t             *tp = NULL;
1974         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
1975         int                     error = 0;
1976         xfs_bmap_free_t         free_list;
1977         xfs_fsblock_t           first_block;
1978         int                     cancel_flags;
1979         int                     committed;
1980         int                     link_zero;
1981         uint                    resblks;
1982         uint                    log_count;
1983
1984         xfs_itrace_entry(dp);
1985         xfs_itrace_entry(ip);
1986
1987         if (XFS_FORCED_SHUTDOWN(mp))
1988                 return XFS_ERROR(EIO);
1989
1990         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
1991                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
1992                                         NULL, DM_RIGHT_NULL, name->name, NULL,
1993                                         ip->i_d.di_mode, 0, 0);
1994                 if (error)
1995                         return error;
1996         }
1997
1998         error = XFS_QM_DQATTACH(mp, dp, 0);
1999         if (error)
2000                 goto std_return;
2001
2002         error = XFS_QM_DQATTACH(mp, ip, 0);
2003         if (error)
2004                 goto std_return;
2005
2006         if (is_dir) {
2007                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2008                 log_count = XFS_DEFAULT_LOG_COUNT;
2009         } else {
2010                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2011                 log_count = XFS_REMOVE_LOG_COUNT;
2012         }
2013         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2014
2015         /*
2016          * We try to get the real space reservation first,
2017          * allowing for directory btree deletion(s) implying
2018          * possible bmap insert(s).  If we can't get the space
2019          * reservation then we use 0 instead, and avoid the bmap
2020          * btree insert(s) in the directory code by, if the bmap
2021          * insert tries to happen, instead trimming the LAST
2022          * block from the directory.
2023          */
2024         resblks = XFS_REMOVE_SPACE_RES(mp);
2025         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2026                                   XFS_TRANS_PERM_LOG_RES, log_count);
2027         if (error == ENOSPC) {
2028                 resblks = 0;
2029                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2030                                           XFS_TRANS_PERM_LOG_RES, log_count);
2031         }
2032         if (error) {
2033                 ASSERT(error != ENOSPC);
2034                 cancel_flags = 0;
2035                 goto out_trans_cancel;
2036         }
2037
2038         error = xfs_lock_dir_and_entry(dp, ip);
2039         if (error)
2040                 goto out_trans_cancel;
2041
2042         /*
2043          * At this point, we've gotten both the directory and the entry
2044          * inodes locked.
2045          */
2046         IHOLD(ip);
2047         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2048
2049         IHOLD(dp);
2050         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2051
2052         /*
2053          * If we're removing a directory perform some additional validation.
2054          */
2055         if (is_dir) {
2056                 ASSERT(ip->i_d.di_nlink >= 2);
2057                 if (ip->i_d.di_nlink != 2) {
2058                         error = XFS_ERROR(ENOTEMPTY);
2059                         goto out_trans_cancel;
2060                 }
2061                 if (!xfs_dir_isempty(ip)) {
2062                         error = XFS_ERROR(ENOTEMPTY);
2063                         goto out_trans_cancel;
2064                 }
2065         }
2066
2067         /*
2068          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2069          */
2070         XFS_BMAP_INIT(&free_list, &first_block);
2071         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2072                                         &first_block, &free_list, resblks);
2073         if (error) {
2074                 ASSERT(error != ENOENT);
2075                 goto out_bmap_cancel;
2076         }
2077         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2078
2079         /*
2080          * Bump the in memory generation count on the parent
2081          * directory so that other can know that it has changed.
2082          */
2083         dp->i_gen++;
2084         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2085
2086         if (is_dir) {
2087                 /*
2088                  * Drop the link from ip's "..".
2089                  */
2090                 error = xfs_droplink(tp, dp);
2091                 if (error)
2092                         goto out_bmap_cancel;
2093
2094                 /*
2095                  * Drop the link from dp to ip.
2096                  */
2097                 error = xfs_droplink(tp, ip);
2098                 if (error)
2099                         goto out_bmap_cancel;
2100         } else {
2101                 /*
2102                  * When removing a non-directory we need to log the parent
2103                  * inode here for the i_gen update.  For a directory this is
2104                  * done implicitly by the xfs_droplink call for the ".." entry.
2105                  */
2106                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2107         }
2108
2109         /*
2110          * Drop the "." link from ip to self.
2111          */
2112         error = xfs_droplink(tp, ip);
2113         if (error)
2114                 goto out_bmap_cancel;
2115
2116         /*
2117          * Determine if this is the last link while
2118          * we are in the transaction.
2119          */
2120         link_zero = (ip->i_d.di_nlink == 0);
2121
2122         /*
2123          * If this is a synchronous mount, make sure that the
2124          * remove transaction goes to disk before returning to
2125          * the user.
2126          */
2127         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2128                 xfs_trans_set_sync(tp);
2129
2130         error = xfs_bmap_finish(&tp, &free_list, &committed);
2131         if (error)
2132                 goto out_bmap_cancel;
2133
2134         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2135         if (error)
2136                 goto std_return;
2137
2138         /*
2139          * If we are using filestreams, kill the stream association.
2140          * If the file is still open it may get a new one but that
2141          * will get killed on last close in xfs_close() so we don't
2142          * have to worry about that.
2143          */
2144         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2145                 xfs_filestream_deassociate(ip);
2146
2147         xfs_itrace_exit(ip);
2148         xfs_itrace_exit(dp);
2149
2150  std_return:
2151         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2152                 XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
2153                                 NULL, DM_RIGHT_NULL, name->name, NULL,
2154                                 ip->i_d.di_mode, error, 0);
2155         }
2156
2157         return error;
2158
2159  out_bmap_cancel:
2160         xfs_bmap_cancel(&free_list);
2161         cancel_flags |= XFS_TRANS_ABORT;
2162  out_trans_cancel:
2163         xfs_trans_cancel(tp, cancel_flags);
2164         goto std_return;
2165 }
2166
2167 int
2168 xfs_link(
2169         xfs_inode_t             *tdp,
2170         xfs_inode_t             *sip,
2171         struct xfs_name         *target_name)
2172 {
2173         xfs_mount_t             *mp = tdp->i_mount;
2174         xfs_trans_t             *tp;
2175         xfs_inode_t             *ips[2];
2176         int                     error;
2177         xfs_bmap_free_t         free_list;
2178         xfs_fsblock_t           first_block;
2179         int                     cancel_flags;
2180         int                     committed;
2181         int                     resblks;
2182
2183         xfs_itrace_entry(tdp);
2184         xfs_itrace_entry(sip);
2185
2186         ASSERT(!S_ISDIR(sip->i_d.di_mode));
2187
2188         if (XFS_FORCED_SHUTDOWN(mp))
2189                 return XFS_ERROR(EIO);
2190
2191         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2192                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2193                                         tdp, DM_RIGHT_NULL,
2194                                         sip, DM_RIGHT_NULL,
2195                                         target_name->name, NULL, 0, 0, 0);
2196                 if (error)
2197                         return error;
2198         }
2199
2200         /* Return through std_return after this point. */
2201
2202         error = XFS_QM_DQATTACH(mp, sip, 0);
2203         if (!error && sip != tdp)
2204                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2205         if (error)
2206                 goto std_return;
2207
2208         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2209         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2210         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
2211         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2212                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2213         if (error == ENOSPC) {
2214                 resblks = 0;
2215                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2216                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2217         }
2218         if (error) {
2219                 cancel_flags = 0;
2220                 goto error_return;
2221         }
2222
2223         if (sip->i_ino < tdp->i_ino) {
2224                 ips[0] = sip;
2225                 ips[1] = tdp;
2226         } else {
2227                 ips[0] = tdp;
2228                 ips[1] = sip;
2229         }
2230
2231         xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
2232
2233         /*
2234          * Increment vnode ref counts since xfs_trans_commit &
2235          * xfs_trans_cancel will both unlock the inodes and
2236          * decrement the associated ref counts.
2237          */
2238         IHOLD(sip);
2239         IHOLD(tdp);
2240         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2241         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2242
2243         /*
2244          * If the source has too many links, we can't make any more to it.
2245          */
2246         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2247                 error = XFS_ERROR(EMLINK);
2248                 goto error_return;
2249         }
2250
2251         /*
2252          * If we are using project inheritance, we only allow hard link
2253          * creation in our tree when the project IDs are the same; else
2254          * the tree quota mechanism could be circumvented.
2255          */
2256         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2257                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2258                 error = XFS_ERROR(EXDEV);
2259                 goto error_return;
2260         }
2261
2262         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
2263         if (error)
2264                 goto error_return;
2265
2266         XFS_BMAP_INIT(&free_list, &first_block);
2267
2268         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2269                                         &first_block, &free_list, resblks);
2270         if (error)
2271                 goto abort_return;
2272         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2273         tdp->i_gen++;
2274         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2275
2276         error = xfs_bumplink(tp, sip);
2277         if (error)
2278                 goto abort_return;
2279
2280         /*
2281          * If this is a synchronous mount, make sure that the
2282          * link transaction goes to disk before returning to
2283          * the user.
2284          */
2285         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2286                 xfs_trans_set_sync(tp);
2287         }
2288
2289         error = xfs_bmap_finish (&tp, &free_list, &committed);
2290         if (error) {
2291                 xfs_bmap_cancel(&free_list);
2292                 goto abort_return;
2293         }
2294
2295         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2296         if (error)
2297                 goto std_return;
2298
2299         /* Fall through to std_return with error = 0. */
2300 std_return:
2301         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2302                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2303                                 tdp, DM_RIGHT_NULL,
2304                                 sip, DM_RIGHT_NULL,
2305                                 target_name->name, NULL, 0, error, 0);
2306         }
2307         return error;
2308
2309  abort_return:
2310         cancel_flags |= XFS_TRANS_ABORT;
2311         /* FALLTHROUGH */
2312
2313  error_return:
2314         xfs_trans_cancel(tp, cancel_flags);
2315         goto std_return;
2316 }
2317
2318
2319 int
2320 xfs_mkdir(
2321         xfs_inode_t             *dp,
2322         struct xfs_name         *dir_name,
2323         mode_t                  mode,
2324         xfs_inode_t             **ipp,
2325         cred_t                  *credp)
2326 {
2327         xfs_mount_t             *mp = dp->i_mount;
2328         xfs_inode_t             *cdp;   /* inode of created dir */
2329         xfs_trans_t             *tp;
2330         int                     cancel_flags;
2331         int                     error;
2332         int                     committed;
2333         xfs_bmap_free_t         free_list;
2334         xfs_fsblock_t           first_block;
2335         boolean_t               unlock_dp_on_error = B_FALSE;
2336         boolean_t               created = B_FALSE;
2337         int                     dm_event_sent = 0;
2338         xfs_prid_t              prid;
2339         struct xfs_dquot        *udqp, *gdqp;
2340         uint                    resblks;
2341
2342         if (XFS_FORCED_SHUTDOWN(mp))
2343                 return XFS_ERROR(EIO);
2344
2345         tp = NULL;
2346
2347         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2348                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2349                                         dp, DM_RIGHT_NULL, NULL,
2350                                         DM_RIGHT_NULL, dir_name->name, NULL,
2351                                         mode, 0, 0);
2352                 if (error)
2353                         return error;
2354                 dm_event_sent = 1;
2355         }
2356
2357         /* Return through std_return after this point. */
2358
2359         xfs_itrace_entry(dp);
2360
2361         mp = dp->i_mount;
2362         udqp = gdqp = NULL;
2363         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2364                 prid = dp->i_d.di_projid;
2365         else
2366                 prid = (xfs_prid_t)dfltprid;
2367
2368         /*
2369          * Make sure that we have allocated dquot(s) on disk.
2370          */
2371         error = XFS_QM_DQVOPALLOC(mp, dp,
2372                         current_fsuid(credp), current_fsgid(credp), prid,
2373                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2374         if (error)
2375                 goto std_return;
2376
2377         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2378         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2379         resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2380         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2381                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2382         if (error == ENOSPC) {
2383                 resblks = 0;
2384                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2385                                           XFS_TRANS_PERM_LOG_RES,
2386                                           XFS_MKDIR_LOG_COUNT);
2387         }
2388         if (error) {
2389                 cancel_flags = 0;
2390                 goto error_return;
2391         }
2392
2393         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2394         unlock_dp_on_error = B_TRUE;
2395
2396         /*
2397          * Check for directory link count overflow.
2398          */
2399         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2400                 error = XFS_ERROR(EMLINK);
2401                 goto error_return;
2402         }
2403
2404         /*
2405          * Reserve disk quota and the inode.
2406          */
2407         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2408         if (error)
2409                 goto error_return;
2410
2411         error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2412         if (error)
2413                 goto error_return;
2414         /*
2415          * create the directory inode.
2416          */
2417         error = xfs_dir_ialloc(&tp, dp, mode, 2,
2418                         0, credp, prid, resblks > 0,
2419                 &cdp, NULL);
2420         if (error) {
2421                 if (error == ENOSPC)
2422                         goto error_return;
2423                 goto abort_return;
2424         }
2425         xfs_itrace_ref(cdp);
2426
2427         /*
2428          * Now we add the directory inode to the transaction.
2429          * We waited until now since xfs_dir_ialloc might start
2430          * a new transaction.  Had we joined the transaction
2431          * earlier, the locks might have gotten released. An error
2432          * from here on will result in the transaction cancel
2433          * unlocking dp so don't do it explicitly in the error path.
2434          */
2435         IHOLD(dp);
2436         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2437         unlock_dp_on_error = B_FALSE;
2438
2439         XFS_BMAP_INIT(&free_list, &first_block);
2440
2441         error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2442                                         &first_block, &free_list, resblks ?
2443                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2444         if (error) {
2445                 ASSERT(error != ENOSPC);
2446                 goto error1;
2447         }
2448         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2449
2450         /*
2451          * Bump the in memory version number of the parent directory
2452          * so that other processes accessing it will recognize that
2453          * the directory has changed.
2454          */
2455         dp->i_gen++;
2456
2457         error = xfs_dir_init(tp, cdp, dp);
2458         if (error)
2459                 goto error2;
2460
2461         cdp->i_gen = 1;
2462         error = xfs_bumplink(tp, dp);
2463         if (error)
2464                 goto error2;
2465
2466         created = B_TRUE;
2467
2468         *ipp = cdp;
2469         IHOLD(cdp);
2470
2471         /*
2472          * Attach the dquots to the new inode and modify the icount incore.
2473          */
2474         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2475
2476         /*
2477          * If this is a synchronous mount, make sure that the
2478          * mkdir transaction goes to disk before returning to
2479          * the user.
2480          */
2481         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2482                 xfs_trans_set_sync(tp);
2483         }
2484
2485         error = xfs_bmap_finish(&tp, &free_list, &committed);
2486         if (error) {
2487                 IRELE(cdp);
2488                 goto error2;
2489         }
2490
2491         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2492         XFS_QM_DQRELE(mp, udqp);
2493         XFS_QM_DQRELE(mp, gdqp);
2494         if (error) {
2495                 IRELE(cdp);
2496         }
2497
2498         /* Fall through to std_return with error = 0 or errno from
2499          * xfs_trans_commit. */
2500
2501 std_return:
2502         if ((created || (error != 0 && dm_event_sent != 0)) &&
2503             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2504                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2505                                         dp, DM_RIGHT_NULL,
2506                                         created ? cdp : NULL,
2507                                         DM_RIGHT_NULL,
2508                                         dir_name->name, NULL,
2509                                         mode, error, 0);
2510         }
2511         return error;
2512
2513  error2:
2514  error1:
2515         xfs_bmap_cancel(&free_list);
2516  abort_return:
2517         cancel_flags |= XFS_TRANS_ABORT;
2518  error_return:
2519         xfs_trans_cancel(tp, cancel_flags);
2520         XFS_QM_DQRELE(mp, udqp);
2521         XFS_QM_DQRELE(mp, gdqp);
2522
2523         if (unlock_dp_on_error)
2524                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2525
2526         goto std_return;
2527 }
2528
2529 int
2530 xfs_symlink(
2531         xfs_inode_t             *dp,
2532         struct xfs_name         *link_name,
2533         const char              *target_path,
2534         mode_t                  mode,
2535         xfs_inode_t             **ipp,
2536         cred_t                  *credp)
2537 {
2538         xfs_mount_t             *mp = dp->i_mount;
2539         xfs_trans_t             *tp;
2540         xfs_inode_t             *ip;
2541         int                     error;
2542         int                     pathlen;
2543         xfs_bmap_free_t         free_list;
2544         xfs_fsblock_t           first_block;
2545         boolean_t               unlock_dp_on_error = B_FALSE;
2546         uint                    cancel_flags;
2547         int                     committed;
2548         xfs_fileoff_t           first_fsb;
2549         xfs_filblks_t           fs_blocks;
2550         int                     nmaps;
2551         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
2552         xfs_daddr_t             d;
2553         const char              *cur_chunk;
2554         int                     byte_cnt;
2555         int                     n;
2556         xfs_buf_t               *bp;
2557         xfs_prid_t              prid;
2558         struct xfs_dquot        *udqp, *gdqp;
2559         uint                    resblks;
2560
2561         *ipp = NULL;
2562         error = 0;
2563         ip = NULL;
2564         tp = NULL;
2565
2566         xfs_itrace_entry(dp);
2567
2568         if (XFS_FORCED_SHUTDOWN(mp))
2569                 return XFS_ERROR(EIO);
2570
2571         /*
2572          * Check component lengths of the target path name.
2573          */
2574         pathlen = strlen(target_path);
2575         if (pathlen >= MAXPATHLEN)      /* total string too long */
2576                 return XFS_ERROR(ENAMETOOLONG);
2577
2578         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2579                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2580                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2581                                         link_name->name, target_path, 0, 0, 0);
2582                 if (error)
2583                         return error;
2584         }
2585
2586         /* Return through std_return after this point. */
2587
2588         udqp = gdqp = NULL;
2589         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2590                 prid = dp->i_d.di_projid;
2591         else
2592                 prid = (xfs_prid_t)dfltprid;
2593
2594         /*
2595          * Make sure that we have allocated dquot(s) on disk.
2596          */
2597         error = XFS_QM_DQVOPALLOC(mp, dp,
2598                         current_fsuid(credp), current_fsgid(credp), prid,
2599                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2600         if (error)
2601                 goto std_return;
2602
2603         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
2604         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2605         /*
2606          * The symlink will fit into the inode data fork?
2607          * There can't be any attributes so we get the whole variable part.
2608          */
2609         if (pathlen <= XFS_LITINO(mp))
2610                 fs_blocks = 0;
2611         else
2612                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
2613         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
2614         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
2615                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
2616         if (error == ENOSPC && fs_blocks == 0) {
2617                 resblks = 0;
2618                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
2619                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
2620         }
2621         if (error) {
2622                 cancel_flags = 0;
2623                 goto error_return;
2624         }
2625
2626         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2627         unlock_dp_on_error = B_TRUE;
2628
2629         /*
2630          * Check whether the directory allows new symlinks or not.
2631          */
2632         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
2633                 error = XFS_ERROR(EPERM);
2634                 goto error_return;
2635         }
2636
2637         /*
2638          * Reserve disk quota : blocks and inode.
2639          */
2640         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2641         if (error)
2642                 goto error_return;
2643
2644         /*
2645          * Check for ability to enter directory entry, if no space reserved.
2646          */
2647         error = xfs_dir_canenter(tp, dp, link_name, resblks);
2648         if (error)
2649                 goto error_return;
2650         /*
2651          * Initialize the bmap freelist prior to calling either
2652          * bmapi or the directory create code.
2653          */
2654         XFS_BMAP_INIT(&free_list, &first_block);
2655
2656         /*
2657          * Allocate an inode for the symlink.
2658          */
2659         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
2660                                1, 0, credp, prid, resblks > 0, &ip, NULL);
2661         if (error) {
2662                 if (error == ENOSPC)
2663                         goto error_return;
2664                 goto error1;
2665         }
2666         xfs_itrace_ref(ip);
2667
2668         /*
2669          * An error after we've joined dp to the transaction will result in the
2670          * transaction cancel unlocking dp so don't do it explicitly in the
2671          * error path.
2672          */
2673         IHOLD(dp);
2674         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2675         unlock_dp_on_error = B_FALSE;
2676
2677         /*
2678          * Also attach the dquot(s) to it, if applicable.
2679          */
2680         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2681
2682         if (resblks)
2683                 resblks -= XFS_IALLOC_SPACE_RES(mp);
2684         /*
2685          * If the symlink will fit into the inode, write it inline.
2686          */
2687         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
2688                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
2689                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
2690                 ip->i_d.di_size = pathlen;
2691
2692                 /*
2693                  * The inode was initially created in extent format.
2694                  */
2695                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
2696                 ip->i_df.if_flags |= XFS_IFINLINE;
2697
2698                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
2699                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
2700
2701         } else {
2702                 first_fsb = 0;
2703                 nmaps = SYMLINK_MAPS;
2704
2705                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
2706                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2707                                   &first_block, resblks, mval, &nmaps,
2708                                   &free_list, NULL);
2709                 if (error) {
2710                         goto error1;
2711                 }
2712
2713                 if (resblks)
2714                         resblks -= fs_blocks;
2715                 ip->i_d.di_size = pathlen;
2716                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2717
2718                 cur_chunk = target_path;
2719                 for (n = 0; n < nmaps; n++) {
2720                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
2721                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
2722                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
2723                                                BTOBB(byte_cnt), 0);
2724                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
2725                         if (pathlen < byte_cnt) {
2726                                 byte_cnt = pathlen;
2727                         }
2728                         pathlen -= byte_cnt;
2729
2730                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
2731                         cur_chunk += byte_cnt;
2732
2733                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
2734                 }
2735         }
2736
2737         /*
2738          * Create the directory entry for the symlink.
2739          */
2740         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2741                                         &first_block, &free_list, resblks);
2742         if (error)
2743                 goto error1;
2744         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2745         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2746
2747         /*
2748          * Bump the in memory version number of the parent directory
2749          * so that other processes accessing it will recognize that
2750          * the directory has changed.
2751          */
2752         dp->i_gen++;
2753
2754         /*
2755          * If this is a synchronous mount, make sure that the
2756          * symlink transaction goes to disk before returning to
2757          * the user.
2758          */
2759         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2760                 xfs_trans_set_sync(tp);
2761         }
2762
2763         /*
2764          * xfs_trans_commit normally decrements the vnode ref count
2765          * when it unlocks the inode. Since we want to return the
2766          * vnode to the caller, we bump the vnode ref count now.
2767          */
2768         IHOLD(ip);
2769
2770         error = xfs_bmap_finish(&tp, &free_list, &committed);
2771         if (error) {
2772                 goto error2;
2773         }
2774         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2775         XFS_QM_DQRELE(mp, udqp);
2776         XFS_QM_DQRELE(mp, gdqp);
2777
2778         /* Fall through to std_return with error = 0 or errno from
2779          * xfs_trans_commit     */
2780 std_return:
2781         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
2782                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
2783                                         dp, DM_RIGHT_NULL,
2784                                         error ? NULL : ip,
2785                                         DM_RIGHT_NULL, link_name->name,
2786                                         target_path, 0, error, 0);
2787         }
2788
2789         if (!error)
2790                 *ipp = ip;
2791         return error;
2792
2793  error2:
2794         IRELE(ip);
2795  error1:
2796         xfs_bmap_cancel(&free_list);
2797         cancel_flags |= XFS_TRANS_ABORT;
2798  error_return:
2799         xfs_trans_cancel(tp, cancel_flags);
2800         XFS_QM_DQRELE(mp, udqp);
2801         XFS_QM_DQRELE(mp, gdqp);
2802
2803         if (unlock_dp_on_error)
2804                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2805
2806         goto std_return;
2807 }
2808
2809 int
2810 xfs_inode_flush(
2811         xfs_inode_t     *ip,
2812         int             flags)
2813 {
2814         xfs_mount_t     *mp = ip->i_mount;
2815         int             error = 0;
2816
2817         if (XFS_FORCED_SHUTDOWN(mp))
2818                 return XFS_ERROR(EIO);
2819
2820         /*
2821          * Bypass inodes which have already been cleaned by
2822          * the inode flush clustering code inside xfs_iflush
2823          */
2824         if (xfs_inode_clean(ip))
2825                 return 0;
2826
2827         /*
2828          * We make this non-blocking if the inode is contended,
2829          * return EAGAIN to indicate to the caller that they
2830          * did not succeed. This prevents the flush path from
2831          * blocking on inodes inside another operation right
2832          * now, they get caught later by xfs_sync.
2833          */
2834         if (flags & FLUSH_SYNC) {
2835                 xfs_ilock(ip, XFS_ILOCK_SHARED);
2836                 xfs_iflock(ip);
2837         } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
2838                 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
2839                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2840                         return EAGAIN;
2841                 }
2842         } else {
2843                 return EAGAIN;
2844         }
2845
2846         error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
2847                                                     : XFS_IFLUSH_ASYNC_NOBLOCK);
2848         xfs_iunlock(ip, XFS_ILOCK_SHARED);
2849
2850         return error;
2851 }
2852
2853
2854 int
2855 xfs_set_dmattrs(
2856         xfs_inode_t     *ip,
2857         u_int           evmask,
2858         u_int16_t       state)
2859 {
2860         xfs_mount_t     *mp = ip->i_mount;
2861         xfs_trans_t     *tp;
2862         int             error;
2863
2864         if (!capable(CAP_SYS_ADMIN))
2865                 return XFS_ERROR(EPERM);
2866
2867         if (XFS_FORCED_SHUTDOWN(mp))
2868                 return XFS_ERROR(EIO);
2869
2870         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
2871         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
2872         if (error) {
2873                 xfs_trans_cancel(tp, 0);
2874                 return error;
2875         }
2876         xfs_ilock(ip, XFS_ILOCK_EXCL);
2877         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2878
2879         ip->i_d.di_dmevmask = evmask;
2880         ip->i_d.di_dmstate  = state;
2881
2882         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2883         IHOLD(ip);
2884         error = xfs_trans_commit(tp, 0);
2885
2886         return error;
2887 }
2888
2889 int
2890 xfs_reclaim(
2891         xfs_inode_t     *ip)
2892 {
2893         bhv_vnode_t     *vp = XFS_ITOV(ip);
2894
2895         xfs_itrace_entry(ip);
2896
2897         ASSERT(!VN_MAPPED(vp));
2898
2899         /* bad inode, get out here ASAP */
2900         if (VN_BAD(vp)) {
2901                 xfs_ireclaim(ip);
2902                 return 0;
2903         }
2904
2905         vn_iowait(ip);
2906
2907         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2908
2909         /*
2910          * Make sure the atime in the XFS inode is correct before freeing the
2911          * Linux inode.
2912          */
2913         xfs_synchronize_atime(ip);
2914
2915         /*
2916          * If we have nothing to flush with this inode then complete the
2917          * teardown now, otherwise break the link between the xfs inode and the
2918          * linux inode and clean up the xfs inode later. This avoids flushing
2919          * the inode to disk during the delete operation itself.
2920          *
2921          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
2922          * first to ensure that xfs_iunpin() will never see an xfs inode
2923          * that has a linux inode being reclaimed. Synchronisation is provided
2924          * by the i_flags_lock.
2925          */
2926         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2927                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2928                 xfs_iflock(ip);
2929                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2930         } else {
2931                 xfs_mount_t     *mp = ip->i_mount;
2932
2933                 /* Protect sync and unpin from us */
2934                 XFS_MOUNT_ILOCK(mp);
2935                 spin_lock(&ip->i_flags_lock);
2936                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2937                 vn_to_inode(vp)->i_private = NULL;
2938                 ip->i_vnode = NULL;
2939                 spin_unlock(&ip->i_flags_lock);
2940                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
2941                 XFS_MOUNT_IUNLOCK(mp);
2942         }
2943         return 0;
2944 }
2945
2946 int
2947 xfs_finish_reclaim(
2948         xfs_inode_t     *ip,
2949         int             locked,
2950         int             sync_mode)
2951 {
2952         xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2953         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
2954
2955         if (vp && VN_BAD(vp))
2956                 goto reclaim;
2957
2958         /* The hash lock here protects a thread in xfs_iget_core from
2959          * racing with us on linking the inode back with a vnode.
2960          * Once we have the XFS_IRECLAIM flag set it will not touch
2961          * us.
2962          */
2963         write_lock(&pag->pag_ici_lock);
2964         spin_lock(&ip->i_flags_lock);
2965         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
2966             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
2967                 spin_unlock(&ip->i_flags_lock);
2968                 write_unlock(&pag->pag_ici_lock);
2969                 if (locked) {
2970                         xfs_ifunlock(ip);
2971                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2972                 }
2973                 return 1;
2974         }
2975         __xfs_iflags_set(ip, XFS_IRECLAIM);
2976         spin_unlock(&ip->i_flags_lock);
2977         write_unlock(&pag->pag_ici_lock);
2978         xfs_put_perag(ip->i_mount, pag);
2979
2980         /*
2981          * If the inode is still dirty, then flush it out.  If the inode
2982          * is not in the AIL, then it will be OK to flush it delwri as
2983          * long as xfs_iflush() does not keep any references to the inode.
2984          * We leave that decision up to xfs_iflush() since it has the
2985          * knowledge of whether it's OK to simply do a delwri flush of
2986          * the inode or whether we need to wait until the inode is
2987          * pulled from the AIL.
2988          * We get the flush lock regardless, though, just to make sure
2989          * we don't free it while it is being flushed.
2990          */
2991         if (!locked) {
2992                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2993                 xfs_iflock(ip);
2994         }
2995
2996         /*
2997          * In the case of a forced shutdown we rely on xfs_iflush() to
2998          * wait for the inode to be unpinned before returning an error.
2999          */
3000         if (xfs_iflush(ip, sync_mode) == 0) {
3001                 /* synchronize with xfs_iflush_done */
3002                 xfs_iflock(ip);
3003                 xfs_ifunlock(ip);
3004         }
3005
3006         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3007
3008  reclaim:
3009         xfs_ireclaim(ip);
3010         return 0;
3011 }
3012
3013 int
3014 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3015 {
3016         int             purged;
3017         xfs_inode_t     *ip, *n;
3018         int             done = 0;
3019
3020         while (!done) {
3021                 purged = 0;
3022                 XFS_MOUNT_ILOCK(mp);
3023                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3024                         if (noblock) {
3025                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3026                                         continue;
3027                                 if (xfs_ipincount(ip) ||
3028                                     !xfs_iflock_nowait(ip)) {
3029                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3030                                         continue;
3031                                 }
3032                         }
3033                         XFS_MOUNT_IUNLOCK(mp);
3034                         if (xfs_finish_reclaim(ip, noblock,
3035                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3036                                 delay(1);
3037                         purged = 1;
3038                         break;
3039                 }
3040
3041                 done = !purged;
3042         }
3043
3044         XFS_MOUNT_IUNLOCK(mp);
3045         return 0;
3046 }
3047
3048 /*
3049  * xfs_alloc_file_space()
3050  *      This routine allocates disk space for the given file.
3051  *
3052  *      If alloc_type == 0, this request is for an ALLOCSP type
3053  *      request which will change the file size.  In this case, no
3054  *      DMAPI event will be generated by the call.  A TRUNCATE event
3055  *      will be generated later by xfs_setattr.
3056  *
3057  *      If alloc_type != 0, this request is for a RESVSP type
3058  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3059  *      lower block boundary byte address is less than the file's
3060  *      length.
3061  *
3062  * RETURNS:
3063  *       0 on success
3064  *      errno on error
3065  *
3066  */
3067 STATIC int
3068 xfs_alloc_file_space(
3069         xfs_inode_t             *ip,
3070         xfs_off_t               offset,
3071         xfs_off_t               len,
3072         int                     alloc_type,
3073         int                     attr_flags)
3074 {
3075         xfs_mount_t             *mp = ip->i_mount;
3076         xfs_off_t               count;
3077         xfs_filblks_t           allocated_fsb;
3078         xfs_filblks_t           allocatesize_fsb;
3079         xfs_extlen_t            extsz, temp;
3080         xfs_fileoff_t           startoffset_fsb;
3081         xfs_fsblock_t           firstfsb;
3082         int                     nimaps;
3083         int                     bmapi_flag;
3084         int                     quota_flag;
3085         int                     rt;
3086         xfs_trans_t             *tp;
3087         xfs_bmbt_irec_t         imaps[1], *imapp;
3088         xfs_bmap_free_t         free_list;
3089         uint                    qblocks, resblks, resrtextents;
3090         int                     committed;
3091         int                     error;
3092
3093         xfs_itrace_entry(ip);
3094
3095         if (XFS_FORCED_SHUTDOWN(mp))
3096                 return XFS_ERROR(EIO);
3097
3098         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3099                 return error;
3100
3101         if (len <= 0)
3102                 return XFS_ERROR(EINVAL);
3103
3104         rt = XFS_IS_REALTIME_INODE(ip);
3105         extsz = xfs_get_extsz_hint(ip);
3106
3107         count = len;
3108         imapp = &imaps[0];
3109         nimaps = 1;
3110         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3111         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3112         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3113
3114         /*      Generate a DMAPI event if needed.       */
3115         if (alloc_type != 0 && offset < ip->i_size &&
3116                         (attr_flags&ATTR_DMI) == 0  &&
3117                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3118                 xfs_off_t           end_dmi_offset;
3119
3120                 end_dmi_offset = offset+len;
3121                 if (end_dmi_offset > ip->i_size)
3122                         end_dmi_offset = ip->i_size;
3123                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
3124                                       end_dmi_offset - offset, 0, NULL);
3125                 if (error)
3126                         return error;
3127         }
3128
3129         /*
3130          * Allocate file space until done or until there is an error
3131          */
3132 retry:
3133         while (allocatesize_fsb && !error) {
3134                 xfs_fileoff_t   s, e;
3135
3136                 /*
3137                  * Determine space reservations for data/realtime.
3138                  */
3139                 if (unlikely(extsz)) {
3140                         s = startoffset_fsb;
3141                         do_div(s, extsz);
3142                         s *= extsz;
3143                         e = startoffset_fsb + allocatesize_fsb;
3144                         if ((temp = do_mod(startoffset_fsb, extsz)))
3145                                 e += temp;
3146                         if ((temp = do_mod(e, extsz)))
3147                                 e += extsz - temp;
3148                 } else {
3149                         s = 0;
3150                         e = allocatesize_fsb;
3151                 }
3152
3153                 if (unlikely(rt)) {
3154                         resrtextents = qblocks = (uint)(e - s);
3155                         resrtextents /= mp->m_sb.sb_rextsize;
3156                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3157                         quota_flag = XFS_QMOPT_RES_RTBLKS;
3158                 } else {
3159                         resrtextents = 0;
3160                         resblks = qblocks = \
3161                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3162                         quota_flag = XFS_QMOPT_RES_REGBLKS;
3163                 }
3164
3165                 /*
3166                  * Allocate and setup the transaction.
3167                  */
3168                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3169                 error = xfs_trans_reserve(tp, resblks,
3170                                           XFS_WRITE_LOG_RES(mp), resrtextents,
3171                                           XFS_TRANS_PERM_LOG_RES,
3172                                           XFS_WRITE_LOG_COUNT);
3173                 /*
3174                  * Check for running out of space
3175                  */
3176                 if (error) {
3177                         /*
3178                          * Free the transaction structure.
3179                          */
3180                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3181                         xfs_trans_cancel(tp, 0);
3182                         break;
3183                 }
3184                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3185                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3186                                                       qblocks, 0, quota_flag);
3187                 if (error)
3188                         goto error1;
3189
3190                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3191                 xfs_trans_ihold(tp, ip);
3192
3193                 /*
3194                  * Issue the xfs_bmapi() call to allocate the blocks
3195                  */
3196                 XFS_BMAP_INIT(&free_list, &firstfsb);
3197                 error = xfs_bmapi(tp, ip, startoffset_fsb,
3198                                   allocatesize_fsb, bmapi_flag,
3199                                   &firstfsb, 0, imapp, &nimaps,
3200                                   &free_list, NULL);
3201                 if (error) {
3202                         goto error0;
3203                 }
3204
3205                 /*
3206                  * Complete the transaction
3207                  */
3208                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3209                 if (error) {
3210                         goto error0;
3211                 }
3212
3213                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3214                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3215                 if (error) {
3216                         break;
3217                 }
3218
3219                 allocated_fsb = imapp->br_blockcount;
3220
3221                 if (nimaps == 0) {
3222                         error = XFS_ERROR(ENOSPC);
3223                         break;
3224                 }
3225
3226                 startoffset_fsb += allocated_fsb;
3227                 allocatesize_fsb -= allocated_fsb;
3228         }
3229 dmapi_enospc_check:
3230         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
3231             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3232                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3233                                 ip, DM_RIGHT_NULL,
3234                                 ip, DM_RIGHT_NULL,
3235                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3236                 if (error == 0)
3237                         goto retry;     /* Maybe DMAPI app. has made space */
3238                 /* else fall through with error from XFS_SEND_NAMESP */
3239         }
3240
3241         return error;
3242
3243 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
3244         xfs_bmap_cancel(&free_list);
3245         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
3246
3247 error1: /* Just cancel transaction */
3248         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3249         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3250         goto dmapi_enospc_check;
3251 }
3252
3253 /*
3254  * Zero file bytes between startoff and endoff inclusive.
3255  * The iolock is held exclusive and no blocks are buffered.
3256  */
3257 STATIC int
3258 xfs_zero_remaining_bytes(
3259         xfs_inode_t             *ip,
3260         xfs_off_t               startoff,
3261         xfs_off_t               endoff)
3262 {
3263         xfs_bmbt_irec_t         imap;
3264         xfs_fileoff_t           offset_fsb;
3265         xfs_off_t               lastoffset;
3266         xfs_off_t               offset;
3267         xfs_buf_t               *bp;
3268         xfs_mount_t             *mp = ip->i_mount;
3269         int                     nimap;
3270         int                     error = 0;
3271
3272         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3273                                 XFS_IS_REALTIME_INODE(ip) ?
3274                                 mp->m_rtdev_targp : mp->m_ddev_targp);
3275
3276         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3277                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
3278                 nimap = 1;
3279                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
3280                         NULL, 0, &imap, &nimap, NULL, NULL);
3281                 if (error || nimap < 1)
3282                         break;
3283                 ASSERT(imap.br_blockcount >= 1);
3284                 ASSERT(imap.br_startoff == offset_fsb);
3285                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
3286                 if (lastoffset > endoff)
3287                         lastoffset = endoff;
3288                 if (imap.br_startblock == HOLESTARTBLOCK)
3289                         continue;
3290                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3291                 if (imap.br_state == XFS_EXT_UNWRITTEN)
3292                         continue;
3293                 XFS_BUF_UNDONE(bp);
3294                 XFS_BUF_UNWRITE(bp);
3295                 XFS_BUF_READ(bp);
3296                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
3297                 xfsbdstrat(mp, bp);
3298                 error = xfs_iowait(bp);
3299                 if (error) {
3300                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
3301                                           mp, bp, XFS_BUF_ADDR(bp));
3302                         break;
3303                 }
3304                 memset(XFS_BUF_PTR(bp) +
3305                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
3306                       0, lastoffset - offset + 1);
3307                 XFS_BUF_UNDONE(bp);
3308                 XFS_BUF_UNREAD(bp);
3309                 XFS_BUF_WRITE(bp);
3310                 xfsbdstrat(mp, bp);
3311                 error = xfs_iowait(bp);
3312                 if (error) {
3313                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
3314                                           mp, bp, XFS_BUF_ADDR(bp));
3315                         break;
3316                 }
3317         }
3318         xfs_buf_free(bp);
3319         return error;
3320 }
3321
3322 /*
3323  * xfs_free_file_space()
3324  *      This routine frees disk space for the given file.
3325  *
3326  *      This routine is only called by xfs_change_file_space
3327  *      for an UNRESVSP type call.
3328  *
3329  * RETURNS:
3330  *       0 on success
3331  *      errno on error
3332  *
3333  */
3334 STATIC int
3335 xfs_free_file_space(
3336         xfs_inode_t             *ip,
3337         xfs_off_t               offset,
3338         xfs_off_t               len,
3339         int                     attr_flags)
3340 {
3341         bhv_vnode_t             *vp;
3342         int                     committed;
3343         int                     done;
3344         xfs_off_t               end_dmi_offset;
3345         xfs_fileoff_t           endoffset_fsb;
3346         int                     error;
3347         xfs_fsblock_t           firstfsb;
3348         xfs_bmap_free_t         free_list;
3349         xfs_bmbt_irec_t         imap;
3350         xfs_off_t               ioffset;
3351         xfs_extlen_t            mod=0;
3352         xfs_mount_t             *mp;
3353         int                     nimap;
3354         uint                    resblks;
3355         uint                    rounding;
3356         int                     rt;
3357         xfs_fileoff_t           startoffset_fsb;
3358         xfs_trans_t             *tp;
3359         int                     need_iolock = 1;
3360
3361         vp = XFS_ITOV(ip);
3362         mp = ip->i_mount;
3363
3364         xfs_itrace_entry(ip);
3365
3366         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3367                 return error;
3368
3369         error = 0;
3370         if (len <= 0)   /* if nothing being freed */
3371                 return error;
3372         rt = XFS_IS_REALTIME_INODE(ip);
3373         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
3374         end_dmi_offset = offset + len;
3375         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
3376
3377         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
3378             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3379                 if (end_dmi_offset > ip->i_size)
3380                         end_dmi_offset = ip->i_size;
3381                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
3382                                 offset, end_dmi_offset - offset,
3383                                 AT_DELAY_FLAG(attr_flags), NULL);
3384                 if (error)
3385                         return error;
3386         }
3387
3388         if (attr_flags & ATTR_NOLOCK)
3389                 need_iolock = 0;
3390         if (need_iolock) {
3391                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3392                 vn_iowait(ip);  /* wait for the completion of any pending DIOs */
3393         }
3394
3395         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
3396         ioffset = offset & ~(rounding - 1);
3397
3398         if (VN_CACHED(vp) != 0) {
3399                 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
3400                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
3401                 if (error)
3402                         goto out_unlock_iolock;
3403         }
3404
3405         /*
3406          * Need to zero the stuff we're not freeing, on disk.
3407          * If its a realtime file & can't use unwritten extents then we
3408          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
3409          * will take care of it for us.
3410          */
3411         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
3412                 nimap = 1;
3413                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
3414                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3415                 if (error)
3416                         goto out_unlock_iolock;
3417                 ASSERT(nimap == 0 || nimap == 1);
3418                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3419                         xfs_daddr_t     block;
3420
3421                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3422                         block = imap.br_startblock;
3423                         mod = do_div(block, mp->m_sb.sb_rextsize);
3424                         if (mod)
3425                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
3426                 }
3427                 nimap = 1;
3428                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
3429                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3430                 if (error)
3431                         goto out_unlock_iolock;
3432                 ASSERT(nimap == 0 || nimap == 1);
3433                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3434                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3435                         mod++;
3436                         if (mod && (mod != mp->m_sb.sb_rextsize))
3437                                 endoffset_fsb -= mod;
3438                 }
3439         }
3440         if ((done = (endoffset_fsb <= startoffset_fsb)))
3441                 /*
3442                  * One contiguous piece to clear
3443                  */
3444                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
3445         else {
3446                 /*
3447                  * Some full blocks, possibly two pieces to clear
3448                  */
3449                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
3450                         error = xfs_zero_remaining_bytes(ip, offset,
3451                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
3452                 if (!error &&
3453                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
3454                         error = xfs_zero_remaining_bytes(ip,
3455                                 XFS_FSB_TO_B(mp, endoffset_fsb),
3456                                 offset + len - 1);
3457         }
3458
3459         /*
3460          * free file space until done or until there is an error
3461          */
3462         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3463         while (!error && !done) {
3464
3465                 /*
3466                  * allocate and setup the transaction. Allow this
3467                  * transaction to dip into the reserve blocks to ensure
3468                  * the freeing of the space succeeds at ENOSPC.
3469                  */
3470                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3471                 tp->t_flags |= XFS_TRANS_RESERVE;
3472                 error = xfs_trans_reserve(tp,
3473                                           resblks,
3474                                           XFS_WRITE_LOG_RES(mp),
3475                                           0,
3476                                           XFS_TRANS_PERM_LOG_RES,
3477                                           XFS_WRITE_LOG_COUNT);
3478
3479                 /*
3480                  * check for running out of space
3481                  */
3482                 if (error) {
3483                         /*
3484                          * Free the transaction structure.
3485                          */
3486                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3487                         xfs_trans_cancel(tp, 0);
3488                         break;
3489                 }
3490                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3491                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
3492                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
3493                                 XFS_QMOPT_RES_REGBLKS);
3494                 if (error)
3495                         goto error1;
3496
3497                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3498                 xfs_trans_ihold(tp, ip);
3499
3500                 /*
3501                  * issue the bunmapi() call to free the blocks
3502                  */
3503                 XFS_BMAP_INIT(&free_list, &firstfsb);
3504                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
3505                                   endoffset_fsb - startoffset_fsb,
3506                                   0, 2, &firstfsb, &free_list, NULL, &done);
3507                 if (error) {
3508                         goto error0;
3509                 }
3510
3511                 /*
3512                  * complete the transaction
3513                  */
3514                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3515                 if (error) {
3516                         goto error0;
3517                 }
3518
3519                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3520                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3521         }
3522
3523  out_unlock_iolock:
3524         if (need_iolock)
3525                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3526         return error;
3527
3528  error0:
3529         xfs_bmap_cancel(&free_list);
3530  error1:
3531         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3532         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
3533                     XFS_ILOCK_EXCL);
3534         return error;
3535 }
3536
3537 /*
3538  * xfs_change_file_space()
3539  *      This routine allocates or frees disk space for the given file.
3540  *      The user specified parameters are checked for alignment and size
3541  *      limitations.
3542  *
3543  * RETURNS:
3544  *       0 on success
3545  *      errno on error
3546  *
3547  */
3548 int
3549 xfs_change_file_space(
3550         xfs_inode_t     *ip,
3551         int             cmd,
3552         xfs_flock64_t   *bf,
3553         xfs_off_t       offset,
3554         cred_t          *credp,
3555         int             attr_flags)
3556 {
3557         xfs_mount_t     *mp = ip->i_mount;
3558         int             clrprealloc;
3559         int             error;
3560         xfs_fsize_t     fsize;
3561         int             setprealloc;
3562         xfs_off_t       startoffset;
3563         xfs_off_t       llen;
3564         xfs_trans_t     *tp;
3565         bhv_vattr_t     va;
3566
3567         xfs_itrace_entry(ip);
3568
3569         if (!S_ISREG(ip->i_d.di_mode))
3570                 return XFS_ERROR(EINVAL);
3571
3572         switch (bf->l_whence) {
3573         case 0: /*SEEK_SET*/
3574                 break;
3575         case 1: /*SEEK_CUR*/
3576                 bf->l_start += offset;
3577                 break;
3578         case 2: /*SEEK_END*/
3579                 bf->l_start += ip->i_size;
3580                 break;
3581         default:
3582                 return XFS_ERROR(EINVAL);
3583         }
3584
3585         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
3586
3587         if (   (bf->l_start < 0)
3588             || (bf->l_start > XFS_MAXIOFFSET(mp))
3589             || (bf->l_start + llen < 0)
3590             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
3591                 return XFS_ERROR(EINVAL);
3592
3593         bf->l_whence = 0;
3594
3595         startoffset = bf->l_start;
3596         fsize = ip->i_size;
3597
3598         /*
3599          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
3600          * file space.
3601          * These calls do NOT zero the data space allocated to the file,
3602          * nor do they change the file size.
3603          *
3604          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
3605          * space.
3606          * These calls cause the new file data to be zeroed and the file
3607          * size to be changed.
3608          */
3609         setprealloc = clrprealloc = 0;
3610
3611         switch (cmd) {
3612         case XFS_IOC_RESVSP:
3613         case XFS_IOC_RESVSP64:
3614                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
3615                                                                 1, attr_flags);
3616                 if (error)
3617                         return error;
3618                 setprealloc = 1;
3619                 break;
3620
3621         case XFS_IOC_UNRESVSP:
3622         case XFS_IOC_UNRESVSP64:
3623                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
3624                                                                 attr_flags)))
3625                         return error;
3626                 break;
3627
3628         case XFS_IOC_ALLOCSP:
3629         case XFS_IOC_ALLOCSP64:
3630         case XFS_IOC_FREESP:
3631         case XFS_IOC_FREESP64:
3632                 if (startoffset > fsize) {
3633                         error = xfs_alloc_file_space(ip, fsize,
3634                                         startoffset - fsize, 0, attr_flags);
3635                         if (error)
3636                                 break;
3637                 }
3638
3639                 va.va_mask = XFS_AT_SIZE;
3640                 va.va_size = startoffset;
3641
3642                 error = xfs_setattr(ip, &va, attr_flags, credp);
3643
3644                 if (error)
3645                         return error;
3646
3647                 clrprealloc = 1;
3648                 break;
3649
3650         default:
3651                 ASSERT(0);
3652                 return XFS_ERROR(EINVAL);
3653         }
3654
3655         /*
3656          * update the inode timestamp, mode, and prealloc flag bits
3657          */
3658         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
3659
3660         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
3661                                       0, 0, 0))) {
3662                 /* ASSERT(0); */
3663                 xfs_trans_cancel(tp, 0);
3664                 return error;
3665         }
3666
3667         xfs_ilock(ip, XFS_ILOCK_EXCL);
3668
3669         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3670         xfs_trans_ihold(tp, ip);
3671
3672         if ((attr_flags & ATTR_DMI) == 0) {
3673                 ip->i_d.di_mode &= ~S_ISUID;
3674
3675                 /*
3676                  * Note that we don't have to worry about mandatory
3677                  * file locking being disabled here because we only
3678                  * clear the S_ISGID bit if the Group execute bit is
3679                  * on, but if it was on then mandatory locking wouldn't
3680                  * have been enabled.
3681                  */
3682                 if (ip->i_d.di_mode & S_IXGRP)
3683                         ip->i_d.di_mode &= ~S_ISGID;
3684
3685                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3686         }
3687         if (setprealloc)
3688                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
3689         else if (clrprealloc)
3690                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
3691
3692         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3693         xfs_trans_set_sync(tp);
3694
3695         error = xfs_trans_commit(tp, 0);
3696
3697         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3698
3699         return error;
3700 }