Skip to content

Commit

Permalink
ocfs2: Remove delete inode vote
Browse files Browse the repository at this point in the history
Ocfs2 currently does cluster-wide node messaging to check the open state of
an inode during delete. This patch removes that mechanism in favor of an
inode cluster lock which is taken at shared read when an inode is first read
and dropped in clear_inode(). This allows a deleting node to test the
liveness of an inode by attempting to take an exclusive lock.

Signed-off-by: Tiger Yang <[email protected]>
Signed-off-by: Mark Fasheh <[email protected]>
  • Loading branch information
Tiger Yang authored and Mark Fasheh committed Apr 26, 2007
1 parent a9f5f70 commit 5000863
Show file tree
Hide file tree
Showing 10 changed files with 205 additions and 38 deletions.
5 changes: 4 additions & 1 deletion fs/ocfs2/cluster/tcp_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
* New in version 8:
* - Replace delete inode votes with a cluster lock
*
* New in version 7:
* - DLM join domain includes the live nodemap
*
Expand All @@ -57,7 +60,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
#define O2NET_PROTOCOL_VERSION 7ULL
#define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
Expand Down
119 changes: 117 additions & 2 deletions fs/ocfs2/dlmglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0,
};

/*
 * Lock resource operations used for OCFS2_LOCK_TYPE_OPEN lock resources:
 * only the get_osb callback is supplied and no flags are set.
 */
static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.get_osb = ocfs2_get_inode_osb,
.flags = 0,
};

static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
lockres->l_type == OCFS2_LOCK_TYPE_RW;
lockres->l_type == OCFS2_LOCK_TYPE_RW ||
lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}

static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
Expand Down Expand Up @@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
break;
case OCFS2_LOCK_TYPE_OPEN:
ops = &ocfs2_inode_open_lops;
break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
Expand Down Expand Up @@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
goto bail;
}

ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}

bail:
mlog_exit(ret);
return ret;
Expand Down Expand Up @@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
mlog_exit_void();
}

/*
 * ocfs2_open_lock() - take the inode's open lock in PR (shared read) mode.
 *
 * When ocfs2_mount_local() reports a local mount no cluster lock is
 * requested and 0 is returned. Otherwise the result of
 * ocfs2_cluster_lock() is returned; negative results are logged.
 */
int ocfs2_open_lock(struct inode *inode)
{
	int status = 0;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb;

	/* Must run before the first dereference of inode below. */
	BUG_ON(!inode);

	osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu take PRMODE open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	status = ocfs2_cluster_lock(osb, lockres, LKM_PRMODE, 0, 0);
	if (status < 0)
		mlog_errno(status);

out:
	mlog_exit(status);
	return status;
}

/*
 * ocfs2_try_open_lock() - non-blocking attempt to take the inode's open lock.
 *
 * @write selects the requested level: LKM_EXMODE when non-zero, LKM_PRMODE
 * otherwise. When ocfs2_mount_local() reports a local mount nothing is
 * requested and 0 is returned. Otherwise the result of
 * ocfs2_cluster_lock() is returned unmodified and unlogged, so callers
 * can treat -EAGAIN as "still in use" without noise in the log.
 */
int ocfs2_try_open_lock(struct inode *inode, int write)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb;

	/* Must run before the first dereference of inode below. */
	BUG_ON(!inode);

	osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	mlog(0, "inode %llu try to take %s open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	/*
	 * The file system may already be holding a PRMODE/EXMODE open
	 * lock. Since we pass LKM_NOQUEUE, the request won't block
	 * waiting on other nodes, and -EAGAIN will indicate to the
	 * caller that this inode is still in use.
	 */
	status = ocfs2_cluster_lock(osb, lockres, level, LKM_NOQUEUE, 0);

out:
	mlog_exit(status);
	return status;
}

/*
 * ocfs2_open_unlock() - release whatever open lock levels this inode holds.
 *
 * Nothing is released when ocfs2_mount_local() reports a local mount.
 * Otherwise a PR unlock is issued if the lock resource has read-only
 * holders, followed by an EX unlock if it has exclusive holders.
 */
void ocfs2_open_unlock(struct inode *inode)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_lock_res *open_res = &OCFS2_I(inode)->ip_open_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	if (!ocfs2_mount_local(osb)) {
		/* Shared holders first, then exclusive. */
		if (open_res->l_ro_holders)
			ocfs2_cluster_unlock(osb, open_res, LKM_PRMODE);

		if (open_res->l_ex_holders)
			ocfs2_cluster_unlock(osb, open_res, LKM_EXMODE);
	}

	mlog_exit_void();
}

int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
Expand Down Expand Up @@ -2455,12 +2563,19 @@ int ocfs2_drop_inode_locks(struct inode *inode)
* ocfs2_clear_inode has done it for us. */

err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres);
&OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);

status = err;

err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_data_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
status = err;

err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
if (err < 0)
Expand Down
3 changes: 3 additions & 0 deletions fs/ocfs2/dlmglue.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
Expand Down
93 changes: 67 additions & 26 deletions fs/ocfs2/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)fe->i_blkno);

OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);

inode->i_nlink = le16_to_cpu(fe->i_links_count);
Expand Down Expand Up @@ -347,6 +346,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,

ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);

ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
OCFS2_LOCK_TYPE_OPEN, 0, inode);
}

ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
Expand Down Expand Up @@ -421,7 +423,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);

/*
Expand All @@ -438,7 +440,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_LOCK_TYPE_META,
generation, inode);

ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
OCFS2_LOCK_TYPE_OPEN,
0, inode);

if (can_lock) {
status = ocfs2_open_lock(inode);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
return status;
}
status = ocfs2_meta_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
Expand All @@ -447,6 +459,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}

if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
status = ocfs2_try_open_lock(inode, 0);
if (status) {
make_bad_inode(inode);
return status;
}
}

status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (status < 0) {
Expand Down Expand Up @@ -678,10 +698,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;

/* We've already voted on this so it should be readonly - no
* spinlock needed. */
orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
di = (struct ocfs2_dinode *) di_bh->b_data;
orphaned_slot = le16_to_cpu(di->i_orphaned_slot);

status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status)
Expand Down Expand Up @@ -787,6 +807,35 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
return ret;
}

/*
 * Ask the cluster whether this inode may be deleted.
 *
 * Returns 0 immediately for a new inode or when this node is the only
 * one in the mounted map. Otherwise returns the result of an exclusive
 * try-lock on the inode open lock; negative results other than -EAGAIN
 * are logged.
 */
static int ocfs2_request_delete(struct inode *inode)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int ret;

	/* No other node can hold the inode in either of these cases. */
	if (ocfs2_inode_is_new(inode) ||
	    ocfs2_node_map_is_only(osb, &osb->mounted_map, osb->node_num))
		return 0;

	/*
	 * This is how ocfs2 determines whether an inode is still live
	 * within the cluster. Every node takes a shared read lock on
	 * the inode open lock in ocfs2_read_locked_inode(). When we
	 * get to ->delete_inode(), each node tries to convert its
	 * lock to an exclusive. Trylocks are serialized by the inode
	 * meta data lock. If the upconvert succeeds, we know the inode
	 * is no longer live and can be deleted.
	 *
	 * Though we call this with the meta data lock held, the
	 * trylock keeps us from ABBA deadlock.
	 */
	ret = ocfs2_try_open_lock(inode, 1);
	if (ret < 0 && ret != -EAGAIN)
		mlog_errno(ret);

	return ret;
}

/* Query the cluster to determine whether we should wipe an inode from
* disk or not.
*
Expand Down Expand Up @@ -839,11 +888,11 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}

status = ocfs2_request_delete_vote(inode);
/* -EBUSY means that other nodes are still using the
status = ocfs2_request_delete(inode);
/* -EAGAIN means that other nodes are still using the
* inode. We're done here though, so avoid doing anything on
* disk and let them worry about deleting it. */
if (status == -EBUSY) {
if (status == -EAGAIN) {
status = 0;
mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno);
Expand All @@ -854,21 +903,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}

spin_lock(&oi->ip_lock);
if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
/* Nobody knew which slot this inode was orphaned
* into. This may happen during node death and
* recovery knows how to clean it up so we can safely
* ignore this inode for now on. */
mlog(0, "Nobody knew where inode %llu was orphaned!\n",
(unsigned long long)oi->ip_blkno);
} else {
*wipe = 1;

mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
(unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
}
spin_unlock(&oi->ip_lock);
*wipe = 1;
mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
(unsigned long long)oi->ip_blkno,
le16_to_cpu(di->i_orphaned_slot));

bail:
return status;
Expand Down Expand Up @@ -1001,11 +1039,16 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);

/* The delete_inode vote has been replaced by the open lock, which we
 * took when the inode was read; now drop the PR and EX open locks. */
ocfs2_open_unlock(inode);

/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);

/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
Expand All @@ -1030,6 +1073,7 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);

ocfs2_metadata_cache_purge(inode);

Expand Down Expand Up @@ -1086,9 +1130,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);

/* Testing ip_orphaned_slot here wouldn't work because we may
* not have gotten a delete_inode vote from any other nodes
* yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode);
else
Expand Down
5 changes: 3 additions & 2 deletions fs/ocfs2/inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct ocfs2_inode_info
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
struct ocfs2_lock_res ip_open_lockres;

/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
Expand Down Expand Up @@ -119,8 +120,8 @@ void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_NOWAIT 0x1
#define OCFS2_FI_FLAG_DELETE 0x2
#define OCFS2_FI_FLAG_SYSFILE 0x4
#define OCFS2_FI_FLAG_NOLOCK 0x8
#define OCFS2_FI_FLAG_SYSFILE 0x4
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
Expand Down
Loading

0 comments on commit 5000863

Please sign in to comment.