aboutsummaryrefslogtreecommitdiffstats
path: root/net/ceph
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-06-08 12:49:18 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-06-08 12:49:18 -0700
commit95288a9b3beee8dd69d73b7691e36f2f231b7903 (patch)
treeb7bb598a516e5d7fa7dba040ac34bb0608b6388e /net/ceph
parentca687877e05ad1bf5b4cefd9cdd091044626deac (diff)
parentdc1dad8e1a612650b1e786e992cb0c6e101e226a (diff)
downloadlinux-95288a9b3beee8dd69d73b7691e36f2f231b7903.tar.gz
Merge tag 'ceph-for-5.8-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "The highlights are: - OSD/MDS latency and caps cache metrics infrastructure for the filesytem (Xiubo Li). Currently available through debugfs and will be periodically sent to the MDS in the future. - support for replica reads (balanced and localized reads) for rbd and the filesystem (myself). The default remains to always read from primary, users can opt-in with the new crush_location and read_from_replica options. Note that reading from replica is safe for general use only since Octopus. - support for RADOS allocation hint flags (myself). Currently used by rbd to propagate the compressible/incompressible hint given with the new compression_hint map option and ready for passing on more advanced hints, e.g. based on fadvise() from the filesystem. - support for efficient cross-quota-realm renames (Luis Henriques) - assorted cap handling improvements and cleanups, particularly untangling some of the locking (Jeff Layton)" * tag 'ceph-for-5.8-rc1' of git://github.com/ceph/ceph-client: (29 commits) rbd: compression_hint option libceph: support for alloc hint flags libceph: read_from_replica option libceph: support for balanced and localized reads libceph: crush_location infrastructure libceph: decode CRUSH device/bucket types and names libceph: add non-asserting rbtree insertion helper ceph: skip checking caps when session reconnecting and releasing reqs ceph: make sure mdsc->mutex is nested in s->s_mutex to fix dead lock ceph: don't return -ESTALE if there's still an open file libceph, rbd: replace zero-length array with flexible-array ceph: allow rename operation under different quota realms ceph: normalize 'delta' parameter usage in check_quota_exceeded ceph: ceph_kick_flushing_caps needs the s_mutex ceph: request expedited service on session's last cap flush ceph: convert mdsc->cap_dirty to a per-session list ceph: reset i_requested_max_size if file write is not wanted ceph: throw a warning if we destroy session with mutex still locked ceph: fix potential race in ceph_check_caps ceph: document what protects i_dirty_item and i_flushing_item ...
Diffstat (limited to 'net/ceph')
-rw-r--r--net/ceph/ceph_common.c75
-rw-r--r--net/ceph/crush/crush.c3
-rw-r--r--net/ceph/debugfs.c6
-rw-r--r--net/ceph/osd_client.c103
-rw-r--r--net/ceph/osdmap.c363
5 files changed, 490 insertions, 60 deletions
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 66f22e8aa52968..afe0e8184c236f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
}
+ ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
+ if (ret)
+ return ret;
+
/* any matching mon ip implies a match */
for (i = 0; i < opt1->num_mon; i++) {
if (ceph_monmap_contains(client->monc.monmap,
@@ -259,6 +263,8 @@ enum {
Opt_secret,
Opt_key,
Opt_ip,
+ Opt_crush_location,
+ Opt_read_from_replica,
/* string args above */
Opt_share,
Opt_crc,
@@ -268,11 +274,25 @@ enum {
Opt_abort_on_full,
};
+enum {
+ Opt_read_from_replica_no,
+ Opt_read_from_replica_balance,
+ Opt_read_from_replica_localize,
+};
+
+static const struct constant_table ceph_param_read_from_replica[] = {
+ {"no", Opt_read_from_replica_no},
+ {"balance", Opt_read_from_replica_balance},
+ {"localize", Opt_read_from_replica_localize},
+ {}
+};
+
static const struct fs_parameter_spec ceph_parameters[] = {
fsparam_flag ("abort_on_full", Opt_abort_on_full),
fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures),
fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages),
fsparam_flag_no ("crc", Opt_crc),
+ fsparam_string ("crush_location", Opt_crush_location),
fsparam_string ("fsid", Opt_fsid),
fsparam_string ("ip", Opt_ip),
fsparam_string ("key", Opt_key),
@@ -283,6 +303,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout),
__fsparam (fs_param_is_s32, "osdtimeout", Opt_osdtimeout,
fs_param_deprecated, NULL),
+ fsparam_enum ("read_from_replica", Opt_read_from_replica,
+ ceph_param_read_from_replica),
fsparam_string ("secret", Opt_secret),
fsparam_flag_no ("share", Opt_share),
fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay),
@@ -297,6 +319,7 @@ struct ceph_options *ceph_alloc_options(void)
if (!opt)
return NULL;
+ opt->crush_locs = RB_ROOT;
opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
GFP_KERNEL);
if (!opt->mon_addr) {
@@ -319,6 +342,7 @@ void ceph_destroy_options(struct ceph_options *opt)
if (!opt)
return;
+ ceph_clear_crush_locs(&opt->crush_locs);
kfree(opt->name);
if (opt->key) {
ceph_crypto_key_destroy(opt->key);
@@ -453,6 +477,34 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
if (!opt->key)
return -ENOMEM;
return get_secret(opt->key, param->string, &log);
+ case Opt_crush_location:
+ ceph_clear_crush_locs(&opt->crush_locs);
+ err = ceph_parse_crush_location(param->string,
+ &opt->crush_locs);
+ if (err) {
+ error_plog(&log, "Failed to parse CRUSH location: %d",
+ err);
+ return err;
+ }
+ break;
+ case Opt_read_from_replica:
+ switch (result.uint_32) {
+ case Opt_read_from_replica_no:
+ opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS);
+ break;
+ case Opt_read_from_replica_balance:
+ opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS;
+ opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS;
+ break;
+ case Opt_read_from_replica_localize:
+ opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+ opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS;
+ break;
+ default:
+ BUG();
+ }
+ break;
case Opt_osdtimeout:
warn_plog(&log, "Ignoring osdtimeout");
@@ -535,6 +587,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
{
struct ceph_options *opt = client->options;
size_t pos = m->count;
+ struct rb_node *n;
if (opt->name) {
seq_puts(m, "name=");
@@ -544,6 +597,28 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
if (opt->key)
seq_puts(m, "secret=<hidden>,");
+ if (!RB_EMPTY_ROOT(&opt->crush_locs)) {
+ seq_puts(m, "crush_location=");
+ for (n = rb_first(&opt->crush_locs); ; ) {
+ struct crush_loc_node *loc =
+ rb_entry(n, struct crush_loc_node, cl_node);
+
+ seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name,
+ loc->cl_loc.cl_name);
+ n = rb_next(n);
+ if (!n)
+ break;
+
+ seq_putc(m, '|');
+ }
+ seq_putc(m, ',');
+ }
+ if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) {
+ seq_puts(m, "read_from_replica=balance,");
+ } else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) {
+ seq_puts(m, "read_from_replica=localize,");
+ }
+
if (opt->flags & CEPH_OPT_FSID)
seq_printf(m, "fsid=%pU,", &opt->fsid);
if (opt->flags & CEPH_OPT_NOSHARE)
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 3d70244bc1b63f..254ded0b05f6a1 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -2,7 +2,6 @@
#ifdef __KERNEL__
# include <linux/slab.h>
# include <linux/crush/crush.h>
-void clear_choose_args(struct crush_map *c);
#else
# include "crush_compat.h"
# include "crush.h"
@@ -130,6 +129,8 @@ void crush_destroy(struct crush_map *map)
#ifndef __KERNEL__
kfree(map->choose_tries);
#else
+ clear_crush_names(&map->type_names);
+ clear_crush_names(&map->names);
clear_choose_args(map);
#endif
kfree(map);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 1344f232ecc5c5..409d505ff3203f 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -81,11 +81,13 @@ static int osdmap_show(struct seq_file *s, void *p)
u32 state = map->osd_state[i];
char sb[64];
- seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
+ seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n",
i, ceph_pr_addr(addr),
((map->osd_weight[i]*100) >> 16),
ceph_osdmap_state_str(sb, sizeof(sb), state),
- ((ceph_get_primary_affinity(map, i)*100) >> 16));
+ ((ceph_get_primary_affinity(map, i)*100) >> 16),
+ ceph_get_crush_locality(map, i,
+ &client->options->crush_locs));
}
for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
struct ceph_pg_mapping *pg =
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1d4973f8cd7a69..4fea3c33af2a73 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -932,10 +932,14 @@ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
op->watch.gen = 0;
}
+/*
+ * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_*
+ */
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
u64 expected_object_size,
- u64 expected_write_size)
+ u64 expected_write_size,
+ u32 flags)
{
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
CEPH_OSD_OP_SETALLOCHINT,
@@ -943,6 +947,7 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
op->alloc_hint.expected_object_size = expected_object_size;
op->alloc_hint.expected_write_size = expected_write_size;
+ op->alloc_hint.flags = flags;
/*
* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
@@ -1018,6 +1023,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
cpu_to_le64(src->alloc_hint.expected_object_size);
dst->alloc_hint.expected_write_size =
cpu_to_le64(src->alloc_hint.expected_write_size);
+ dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags);
break;
case CEPH_OSD_OP_SETXATTR:
case CEPH_OSD_OP_CMPXATTR:
@@ -1497,6 +1503,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
(osdc->osdmap->epoch < osdc->epoch_barrier);
}
+static int pick_random_replica(const struct ceph_osds *acting)
+{
+ int i = prandom_u32() % acting->size;
+
+ dout("%s picked osd%d, primary osd%d\n", __func__,
+ acting->osds[i], acting->primary);
+ return i;
+}
+
+/*
+ * Picks the closest replica based on client's location given by
+ * crush_location option. Prefers the primary if the locality is
+ * the same.
+ */
+static int pick_closest_replica(struct ceph_osd_client *osdc,
+ const struct ceph_osds *acting)
+{
+ struct ceph_options *opt = osdc->client->options;
+ int best_i, best_locality;
+ int i = 0, locality;
+
+ do {
+ locality = ceph_get_crush_locality(osdc->osdmap,
+ acting->osds[i],
+ &opt->crush_locs);
+ if (i == 0 ||
+ (locality >= 0 && best_locality < 0) ||
+ (locality >= 0 && best_locality >= 0 &&
+ locality < best_locality)) {
+ best_i = i;
+ best_locality = locality;
+ }
+ } while (++i < acting->size);
+
+ dout("%s picked osd%d with locality %d, primary osd%d\n", __func__,
+ acting->osds[best_i], best_locality, acting->primary);
+ return best_i;
+}
+
enum calc_target_result {
CALC_TARGET_NO_ACTION = 0,
CALC_TARGET_NEED_RESEND,
@@ -1510,6 +1555,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_pg_pool_info *pi;
struct ceph_pg pgid, last_pgid;
struct ceph_osds up, acting;
+ bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+ bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
bool force_resend = false;
bool unpaused = false;
bool legacy_change = false;
@@ -1540,9 +1587,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
ceph_oid_copy(&t->target_oid, &t->base_oid);
ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
- if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+ if (is_read && pi->read_tier >= 0)
t->target_oloc.pool = pi->read_tier;
- if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+ if (is_write && pi->write_tier >= 0)
t->target_oloc.pool = pi->write_tier;
pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
@@ -1581,7 +1628,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
unpaused = true;
}
legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
- ceph_osds_changed(&t->acting, &acting, any_change);
+ ceph_osds_changed(&t->acting, &acting,
+ t->used_replica || any_change);
if (t->pg_num)
split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
@@ -1597,7 +1645,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
t->sort_bitwise = sort_bitwise;
t->recovery_deletes = recovery_deletes;
- t->osd = acting.primary;
+ if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ !is_write && pi->type == CEPH_POOL_TYPE_REP &&
+ acting.size > 1) {
+ int pos;
+
+ WARN_ON(!is_read || acting.osds[0] != acting.primary);
+ if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
+ pos = pick_random_replica(&acting);
+ } else {
+ pos = pick_closest_replica(osdc, &acting);
+ }
+ t->osd = acting.osds[pos];
+ t->used_replica = pos > 0;
+ } else {
+ t->osd = acting.primary;
+ t->used_replica = false;
+ }
}
if (unpaused || legacy_change || force_resend || split)
@@ -2366,13 +2431,17 @@ promote:
static void account_request(struct ceph_osd_request *req)
{
+ struct ceph_osd_client *osdc = req->r_osdc;
+
WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
req->r_flags |= CEPH_OSD_FLAG_ONDISK;
- atomic_inc(&req->r_osdc->num_requests);
+ req->r_flags |= osdc->client->options->osd_req_flags;
+ atomic_inc(&osdc->num_requests);
req->r_start_stamp = jiffies;
+ req->r_start_latency = ktime_get();
}
static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -2389,6 +2458,8 @@ static void finish_request(struct ceph_osd_request *req)
WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+ req->r_end_latency = ktime_get();
+
if (req->r_osd)
unlink_request(req->r_osd, req);
atomic_dec(&osdc->num_requests);
@@ -3657,6 +3728,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
goto out_unlock_osdc;
}
+ if (m.result == -EAGAIN) {
+ dout("req %p tid %llu EAGAIN\n", req, req->r_tid);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ /*
+ * The object is missing on the replica or not (yet)
+ * readable. Clear pgid to force a resend to the primary
+ * via legacy_change.
+ */
+ req->r_t.pgid.pool = 0;
+ req->r_t.pgid.seed = 0;
+ WARN_ON(!req->r_t.used_replica);
+ req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS);
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
+
if (m.num_ops != req->r_num_ops) {
pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
req->r_num_ops, req->r_tid);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 2a6e63a8edbe42..96c25f5e064aaf 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -138,6 +138,79 @@ bad:
return -EINVAL;
}
+struct crush_name_node {
+ struct rb_node cn_node;
+ int cn_id;
+ char cn_name[];
+};
+
+static struct crush_name_node *alloc_crush_name(size_t name_len)
+{
+ struct crush_name_node *cn;
+
+ cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
+ if (!cn)
+ return NULL;
+
+ RB_CLEAR_NODE(&cn->cn_node);
+ return cn;
+}
+
+static void free_crush_name(struct crush_name_node *cn)
+{
+ WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
+
+ kfree(cn);
+}
+
+DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
+
+static int decode_crush_names(void **p, void *end, struct rb_root *root)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct crush_name_node *cn;
+ int id;
+ u32 name_len;
+
+ ceph_decode_32_safe(p, end, id, e_inval);
+ ceph_decode_32_safe(p, end, name_len, e_inval);
+ ceph_decode_need(p, end, name_len, e_inval);
+
+ cn = alloc_crush_name(name_len);
+ if (!cn)
+ return -ENOMEM;
+
+ cn->cn_id = id;
+ memcpy(cn->cn_name, *p, name_len);
+ cn->cn_name[name_len] = '\0';
+ *p += name_len;
+
+ if (!__insert_crush_name(root, cn)) {
+ free_crush_name(cn);
+ return -EEXIST;
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+void clear_crush_names(struct rb_root *root)
+{
+ while (!RB_EMPTY_ROOT(root)) {
+ struct crush_name_node *cn =
+ rb_entry(rb_first(root), struct crush_name_node, cn_node);
+
+ erase_crush_name(root, cn);
+ free_crush_name(cn);
+ }
+}
+
static struct crush_choose_arg_map *alloc_choose_arg_map(void)
{
struct crush_choose_arg_map *arg_map;
@@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
if (c == NULL)
return ERR_PTR(-ENOMEM);
+ c->type_names = RB_ROOT;
+ c->names = RB_ROOT;
c->choose_args = RB_ROOT;
/* set tunables to default values */
@@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
}
}
- ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
- ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+ err = decode_crush_names(p, end, &c->type_names);
+ if (err)
+ goto fail;
+
+ err = decode_crush_names(p, end, &c->names);
+ if (err)
+ goto fail;
+
ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
/* tunables */
@@ -636,48 +717,11 @@ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
/*
* rbtree of pg pool info
*/
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_pool_info *pi = NULL;
-
- while (*p) {
- parent = *p;
- pi = rb_entry(parent, struct ceph_pg_pool_info, node);
- if (new->id < pi->id)
- p = &(*p)->rb_left;
- else if (new->id > pi->id)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
- return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
-{
- struct ceph_pg_pool_info *pi;
- struct rb_node *n = root->rb_node;
-
- while (n) {
- pi = rb_entry(n, struct ceph_pg_pool_info, node);
- if (id < pi->id)
- n = n->rb_left;
- else if (id > pi->id)
- n = n->rb_right;
- else
- return pi;
- }
- return NULL;
-}
+DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
- return __lookup_pg_pool(&map->pg_pools, id);
+ return lookup_pg_pool(&map->pg_pools, id);
}
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
@@ -690,8 +734,7 @@ const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
if (WARN_ON_ONCE(id > (u64) INT_MAX))
return NULL;
- pi = __lookup_pg_pool(&map->pg_pools, (int) id);
-
+ pi = lookup_pg_pool(&map->pg_pools, id);
return pi ? pi->name : NULL;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
@@ -714,14 +757,14 @@ u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
{
struct ceph_pg_pool_info *pi;
- pi = __lookup_pg_pool(&map->pg_pools, id);
+ pi = lookup_pg_pool(&map->pg_pools, id);
return pi ? pi->flags : 0;
}
EXPORT_SYMBOL(ceph_pg_pool_flags);
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
- rb_erase(&pi->node, root);
+ erase_pg_pool(root, pi);
kfree(pi->name);
kfree(pi);
}
@@ -903,7 +946,7 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
ceph_decode_32_safe(p, end, len, bad);
dout(" pool %llu len %d\n", pool, len);
ceph_decode_need(p, end, len, bad);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
if (pi) {
char *name = kstrndup(*p, len, GFP_NOFS);
@@ -1154,18 +1197,18 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
ceph_decode_64_safe(p, end, pool, e_inval);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
if (!incremental || !pi) {
pi = kzalloc(sizeof(*pi), GFP_NOFS);
if (!pi)
return -ENOMEM;
+ RB_CLEAR_NODE(&pi->node);
pi->id = pool;
- ret = __insert_pg_pool(&map->pg_pools, pi);
- if (ret) {
+ if (!__insert_pg_pool(&map->pg_pools, pi)) {
kfree(pi);
- return ret;
+ return -EEXIST;
}
}
@@ -1829,7 +1872,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_pg_pool_info *pi;
ceph_decode_64_safe(p, end, pool, e_inval);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
if (pi)
__remove_pg_pool(&map->pg_pools, pi);
}
@@ -2672,3 +2715,221 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
return acting.primary;
}
EXPORT_SYMBOL(ceph_pg_to_acting_primary);
+
+static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
+ size_t name_len)
+{
+ struct crush_loc_node *loc;
+
+ loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
+ if (!loc)
+ return NULL;
+
+ RB_CLEAR_NODE(&loc->cl_node);
+ return loc;
+}
+
+static void free_crush_loc(struct crush_loc_node *loc)
+{
+ WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
+
+ kfree(loc);
+}
+
+static int crush_loc_compare(const struct crush_loc *loc1,
+ const struct crush_loc *loc2)
+{
+ return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
+ strcmp(loc1->cl_name, loc2->cl_name);
+}
+
+DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
+ RB_BYPTR, const struct crush_loc *, cl_node)
+
+/*
+ * Parses a set of <bucket type name>':'<bucket name> pairs separated
+ * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar".
+ *
+ * Note that @crush_location is modified by strsep().
+ */
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
+{
+ struct crush_loc_node *loc;
+ const char *type_name, *name, *colon;
+ size_t type_name_len, name_len;
+
+ dout("%s '%s'\n", __func__, crush_location);
+ while ((type_name = strsep(&crush_location, "|"))) {
+ colon = strchr(type_name, ':');
+ if (!colon)
+ return -EINVAL;
+
+ type_name_len = colon - type_name;
+ if (type_name_len == 0)
+ return -EINVAL;
+
+ name = colon + 1;
+ name_len = strlen(name);
+ if (name_len == 0)
+ return -EINVAL;
+
+ loc = alloc_crush_loc(type_name_len, name_len);
+ if (!loc)
+ return -ENOMEM;
+
+ loc->cl_loc.cl_type_name = loc->cl_data;
+ memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
+ loc->cl_loc.cl_type_name[type_name_len] = '\0';
+
+ loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
+ memcpy(loc->cl_loc.cl_name, name, name_len);
+ loc->cl_loc.cl_name[name_len] = '\0';
+
+ if (!__insert_crush_loc(locs, loc)) {
+ free_crush_loc(loc);
+ return -EEXIST;
+ }
+
+ dout("%s type_name '%s' name '%s'\n", __func__,
+ loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
+ }
+
+ return 0;
+}
+
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
+{
+ struct rb_node *n1 = rb_first(locs1);
+ struct rb_node *n2 = rb_first(locs2);
+ int ret;
+
+ for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
+ struct crush_loc_node *loc1 =
+ rb_entry(n1, struct crush_loc_node, cl_node);
+ struct crush_loc_node *loc2 =
+ rb_entry(n2, struct crush_loc_node, cl_node);
+
+ ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
+ if (ret)
+ return ret;
+ }
+
+ if (!n1 && n2)
+ return -1;
+ if (n1 && !n2)
+ return 1;
+ return 0;
+}
+
+void ceph_clear_crush_locs(struct rb_root *locs)
+{
+ while (!RB_EMPTY_ROOT(locs)) {
+ struct crush_loc_node *loc =
+ rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
+
+ erase_crush_loc(locs, loc);
+ free_crush_loc(loc);
+ }
+}
+
+/*
+ * [a-zA-Z0-9-_.]+
+ */
+static bool is_valid_crush_name(const char *name)
+{
+ do {
+ if (!('a' <= *name && *name <= 'z') &&
+ !('A' <= *name && *name <= 'Z') &&
+ !('0' <= *name && *name <= '9') &&
+ *name != '-' && *name != '_' && *name != '.')
+ return false;
+ } while (*++name != '\0');
+
+ return true;
+}
+
+/*
+ * Gets the parent of an item. Returns its id (<0 because the
+ * parent is always a bucket), type id (>0 for the same reason,
+ * via @parent_type_id) and location (via @parent_loc). If no
+ * parent, returns 0.
+ *
+ * Does a linear search, as there are no parent pointers of any
+ * kind. Note that the result is ambigous for items that occur
+ * multiple times in the map.
+ */
+static int get_immediate_parent(struct crush_map *c, int id,
+ u16 *parent_type_id,
+ struct crush_loc *parent_loc)
+{
+ struct crush_bucket *b;
+ struct crush_name_node *type_cn, *cn;
+ int i, j;
+
+ for (i = 0; i < c->max_buckets; i++) {
+ b = c->buckets[i];
+ if (!b)
+ continue;
+
+ /* ignore per-class shadow hierarchy */
+ cn = lookup_crush_name(&c->names, b->id);
+ if (!cn || !is_valid_crush_name(cn->cn_name))
+ continue;
+
+ for (j = 0; j < b->size; j++) {
+ if (b->items[j] != id)
+ continue;
+
+ *parent_type_id = b->type;
+ type_cn = lookup_crush_name(&c->type_names, b->type);
+ parent_loc->cl_type_name = type_cn->cn_name;
+ parent_loc->cl_name = cn->cn_name;
+ return b->id;
+ }
+ }
+
+ return 0; /* no parent */
+}
+
+/*
+ * Calculates the locality/distance from an item to a client
+ * location expressed in terms of CRUSH hierarchy as a set of
+ * (bucket type name, bucket name) pairs. Specifically, looks
+ * for the lowest-valued bucket type for which the location of
+ * @id matches one of the locations in @locs, so for standard
+ * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
+ * a matching host is closer than a matching rack and a matching
+ * data center is closer than a matching zone.
+ *
+ * Specifying multiple locations (a "multipath" location) such
+ * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs
+ * is a multimap. The locality will be:
+ *
+ * - 3 for OSDs in racks foo1 and foo2
+ * - 8 for OSDs in data center bar
+ * - -1 for all other OSDs
+ *
+ * The lowest possible bucket type is 1, so the best locality
+ * for an OSD is 1 (i.e. a matching host). Locality 0 would be
+ * the OSD itself.
+ */
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+ struct rb_root *locs)
+{
+ struct crush_loc loc;
+ u16 type_id;
+
+ /*
+ * Instead of repeated get_immediate_parent() calls,
+ * the location of @id could be obtained with a single
+ * depth-first traversal.
+ */
+ for (;;) {
+ id = get_immediate_parent(osdmap->crush, id, &type_id, &loc);
+ if (id >= 0)
+ return -1; /* not local */
+
+ if (lookup_crush_loc(locs, &loc))
+ return type_id;
+ }
+}