Skip to content

Commit

Permalink
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel…
Browse files Browse the repository at this point in the history
…/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "A few groups of patches here.  Alex has been hard at work improving
  the RBD code, layout groundwork for understanding the new formats and
  doing layering.  Most of the infrastructure is now in place for the
  final bits that will come with the next window.

  There are a few changes to the data layout.  Jim Schutt's patch fixes
  some non-ideal CRUSH behavior, and a set of patches from me updates
  the client to speak a newer version of the protocol and implement an
  improved hashing strategy across storage nodes (when the server side
  supports it too).

  A pair of patches from Sam Lang fix the atomicity of open+create
  operations.  Several patches from Yan, Zheng fix various mds/client
  issues that turned up during multi-mds torture tests.

  A final set of patches expose file layouts via virtual xattrs, and
  allow the policies to be set on directories via xattrs as well
  (avoiding the awkward ioctl interface and providing a consistent
  interface for both kernel mount and ceph-fuse users)."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (143 commits)
  libceph: add support for HASHPSPOOL pool flag
  libceph: update osd request/reply encoding
  libceph: calculate placement based on the internal data types
  ceph: update support for PGID64, PGPOOL3, OSDENC protocol features
  ceph: update "ceph_features.h"
  libceph: decode into cpu-native ceph_pg type
  libceph: rename ceph_pg -> ceph_pg_v1
  rbd: pass length, not op for osd completions
  rbd: move rbd_osd_trivial_callback()
  libceph: use a do..while loop in con_work()
  libceph: use a flag to indicate a fault has occurred
  libceph: separate non-locked fault handling
  libceph: encapsulate connection backoff
  libceph: eliminate sparse warnings
  ceph: eliminate sparse warnings in fs code
  rbd: eliminate sparse warnings
  libceph: define connection flag helpers
  rbd: normalize dout() calls
  rbd: barriers are hard
  rbd: ignore zero-length requests
  ...
  • Loading branch information
torvalds committed Mar 1, 2013
2 parents de1a226 + 83ca14f commit 1cf0209
Show file tree
Hide file tree
Showing 32 changed files with 2,402 additions and 1,528 deletions.
1,852 changes: 1,161 additions & 691 deletions drivers/block/rbd.c

Large diffs are not rendered by default.

38 changes: 9 additions & 29 deletions fs/ceph/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
struct ceph_osd_reply_head *replyhead;
int rc, bytes;
int rc = req->r_result;
int bytes = le32_to_cpu(msg->hdr.data_len);
int i;

/* parse reply */
replyhead = msg->front.iov_base;
WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
rc = le32_to_cpu(replyhead->result);
bytes = le32_to_cpu(msg->hdr.data_len);

dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);

/* unlock all pages, zeroing any data we didn't read */
Expand Down Expand Up @@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, 0,
ci->i_truncate_seq, ci->i_truncate_size,
NULL, false, 1, 0);
NULL, false, 0);
if (IS_ERR(req))
return PTR_ERR(req);

Expand Down Expand Up @@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
&ci->i_layout, snapc,
page_off, len,
ci->i_truncate_seq, ci->i_truncate_size,
&inode->i_mtime,
&page, 1, 0, 0, true);
&inode->i_mtime, &page, 1);
if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page);
SetPageError(page);
Expand Down Expand Up @@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_msg *msg)
{
struct inode *inode = req->r_inode;
struct ceph_osd_reply_head *replyhead;
struct ceph_osd_op *op;
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned wrote;
struct page *page;
int i;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
__s32 rc = -EIO;
u64 bytes = 0;
int rc = req->r_result;
u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);

/* parse reply */
replyhead = msg->front.iov_base;
WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
op = (void *)(replyhead + 1);
rc = le32_to_cpu(replyhead->result);
bytes = le64_to_cpu(op->extent.length);

if (rc >= 0) {
/*
* Assume we wrote the pages we originally sent. The
Expand Down Expand Up @@ -741,8 +725,6 @@ static int ceph_writepages_start(struct address_space *mapping,
struct page *page;
int want;
u64 offset, len;
struct ceph_osd_request_head *reqhead;
struct ceph_osd_op *op;
long writeback_stat;

next = 0;
Expand Down Expand Up @@ -838,7 +820,7 @@ static int ceph_writepages_start(struct address_space *mapping,
snapc, do_sync,
ci->i_truncate_seq,
ci->i_truncate_size,
&inode->i_mtime, true, 1, 0);
&inode->i_mtime, true, 0);

if (IS_ERR(req)) {
rc = PTR_ERR(req);
Expand Down Expand Up @@ -906,10 +888,8 @@ static int ceph_writepages_start(struct address_space *mapping,

/* revise final length, page count */
req->r_num_pages = locked_pages;
reqhead = req->r_request->front.iov_base;
op = (void *)(reqhead + 1);
op->extent.length = cpu_to_le64(len);
op->payload_len = cpu_to_le32(len);
req->r_request_ops[0].extent.length = cpu_to_le64(len);
req->r_request_ops[0].payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);

rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
Expand Down
32 changes: 25 additions & 7 deletions fs/ceph/caps.c
Original file line number Diff line number Diff line change
Expand Up @@ -611,8 +611,16 @@ int ceph_add_cap(struct inode *inode,

if (flags & CEPH_CAP_FLAG_AUTH)
ci->i_auth_cap = cap;
else if (ci->i_auth_cap == cap)
else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
dout(" moving %p to cap_dirty_migrating\n", inode);
list_move(&ci->i_dirty_item,
&mdsc->cap_dirty_migrating);
}
spin_unlock(&mdsc->cap_dirty_lock);
}

dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
Expand Down Expand Up @@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
int file_wanted, used;
int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
Expand Down Expand Up @@ -1563,9 +1571,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,

/* NOTE: no side-effects allowed, until we take s_mutex */

cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;

revoking = cap->implemented & ~cap->issued;
dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued),
ceph_cap_string(cap_used),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));

Expand Down Expand Up @@ -1593,7 +1606,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
}

/* completed revocation? going down and there are no caps? */
if (revoking && (revoking & used) == 0) {
if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
Expand Down Expand Up @@ -1670,8 +1683,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
sent++;

/* __send_cap drops i_ceph_lock */
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
retain, flushing, NULL);
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
want, retain, flushing, NULL);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}

Expand Down Expand Up @@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
dout("mds wanted %s -> %s\n",
ceph_cap_string(le32_to_cpu(grant->wanted)),
ceph_cap_string(wanted));
grant->wanted = cpu_to_le32(wanted);
/* imported cap may not have correct mds_wanted */
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
check_caps = 1;
}

cap->seq = seq;
Expand Down Expand Up @@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);

if (op == CEPH_CAP_OP_IMPORT)
ceph_add_cap_releases(mdsc, session);

/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
Expand Down
8 changes: 7 additions & 1 deletion fs/ceph/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
if (err)
goto out_err;

err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
Expand All @@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = finish_no_open(file, dn);
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
*opened |= FILE_CREATED;
}
err = finish_open(file, dentry, ceph_open, opened);
}

Expand Down Expand Up @@ -535,7 +541,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
ci->i_snap_realm->cached_context,
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
&mtime, false, 2, page_align);
&mtime, false, page_align);
if (IS_ERR(req))
return PTR_ERR(req);

Expand Down
6 changes: 2 additions & 4 deletions fs/ceph/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
&ceph_sb_to_client(inode->i_sb)->client->osdc;
u64 len = 1, olen;
u64 tmp;
struct ceph_object_layout ol;
struct ceph_pg pgid;
int r;

Expand All @@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EFAULT;

down_read(&osdc->map_sem);
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
if (r < 0)
Expand All @@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)

snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
osdc->osdmap);

pgid = ol.ol_pgid;
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
Expand Down
33 changes: 31 additions & 2 deletions fs/ceph/mds_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,30 @@ static int parse_reply_info_filelock(void **p, void *end,
return -EIO;
}

/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
int features)
{
if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
if (*p == end) {
info->has_create_ino = false;
} else {
info->has_create_ino = true;
info->ino = ceph_decode_64(p);
}
}

if (unlikely(*p != end))
goto bad;
return 0;

bad:
return -EIO;
}

/*
* parse extra results
*/
Expand All @@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
else
else if (info->head->op == CEPH_MDS_OP_READDIR)
return parse_reply_info_dir(p, end, info, features);
else if (info->head->op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
return -EIO;
}

/*
Expand Down Expand Up @@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP) &&
rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
Expand Down
6 changes: 6 additions & 0 deletions fs/ceph/mds_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in *dir_in;
u8 dir_complete, dir_end;
};

/* for create results */
struct {
bool has_create_ino;
u64 ino;
};
};

/* encoded blob describing snapshot contexts for certain
Expand Down
12 changes: 8 additions & 4 deletions fs/ceph/mdsmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
return ERR_PTR(-ENOMEM);

ceph_decode_16_safe(p, end, version, bad);
if (version > 3) {
pr_warning("got mdsmap version %d > 3, failing", version);
goto bad;
}

ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
m->m_epoch = ceph_decode_32(p);
Expand Down Expand Up @@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
m->m_num_data_pg_pools = n;
m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
goto badmem;
ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
m->m_data_pg_pools[i] = ceph_decode_32(p);
m->m_cas_pg_pool = ceph_decode_32(p);
m->m_data_pg_pools[i] = ceph_decode_64(p);
m->m_cas_pg_pool = ceph_decode_64(p);

/* ok, we don't care about the rest. */
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
Expand Down
4 changes: 4 additions & 0 deletions fs/ceph/strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
case CEPH_MDS_STATE_BOOT: return "up:boot";
case CEPH_MDS_STATE_STANDBY: return "up:standby";
case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
case CEPH_MDS_STATE_CREATING: return "up:creating";
case CEPH_MDS_STATE_STARTING: return "up:starting";
/* up and in */
Expand Down Expand Up @@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUP: return "lookup";
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
case CEPH_MDS_OP_READDIR: return "readdir";
case CEPH_MDS_OP_MKNOD: return "mknod";
case CEPH_MDS_OP_LINK: return "link";
Expand Down
7 changes: 6 additions & 1 deletion fs/ceph/super.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/*
* express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
*
* NOTE: for the time being, we make bsize == frsize to humor
* not-yet-ancient versions of glibc that are broken.
* Someday, we will probably want to report a real block
* size... whatever that may mean for a network file system!
*/
buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);

buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
buf->f_frsize = PAGE_CACHE_SIZE;

/* leave fsid little-endian, regardless of host endianness */
fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
Expand Down
Loading

0 comments on commit 1cf0209

Please sign in to comment.