crush 核心函数 crush_do_rule

Posted Jason__Zhou

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了crush 核心函数 crush_do_rule相关的知识,希望对你有一定的参考价值。

crush 核心函数

crush_do_rule
位置:crush/mapper.c

重要过程

  • crush_do_rule: execute the steps of a CRUSH rule one by one
  • crush_choose_firstn: choose buckets or devices of specified type recursively
  • crush_bucket_choose: directly choose a child item of the input bucket

步骤操作标志

/*
 * Step op codes: the value stored in a rule step's op field, which
 * crush_do_rule() switches on.  Value 5 is not assigned in this enum.
 */
enum {
    CRUSH_RULE_NOOP = 0,
    CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
    CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
                                  /* arg2 = type */
    CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
    CRUSH_RULE_EMIT = 4,          /* no args */
    CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
    CRUSH_RULE_CHOOSELEAF_INDEP = 7,

    CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
    CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
    CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
    CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
    CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
    CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
};

最核心的函数

/**
 * crush_do_rule - calculate a mapping with the given input and rule
 * @map: the crush_map
 * @ruleno: the rule id
 * @x: hash input
 * @result: pointer to result vector
 * @result_max: maximum result size
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @scratch: scratch vector for private use; must be >= 3 * result_max
 */
int crush_do_rule(const struct crush_map *map,
          int ruleno, int x, int *result, int result_max,
          const __u32 *weight, int weight_max,
          int *scratch)

    int result_len;
    int *a = scratch;
    int *b = scratch + result_max;
    int *c = scratch + result_max*2;
    int recurse_to_leaf;
    int *w;
    int wsize = 0;
    int *o;
    int osize;
    int *tmp;
    struct crush_rule *rule;
    __u32 step;
    int i, j;
    int numrep;
    int out_size;
    /*
     * the original choose_total_tries value was off by one (it
     * counted "retries" and not "tries").  add one.
     */
    int choose_tries = map->choose_total_tries + 1;
    int choose_leaf_tries = 0;
    /*
     * the local tries values were counted as "retries", though,
     * and need no adjustment
     */
    int choose_local_retries = map->choose_local_tries;
    int choose_local_fallback_retries = map->choose_local_fallback_tries;

    int vary_r = map->chooseleaf_vary_r;
    int stable = map->chooseleaf_stable;

    if ((__u32)ruleno >= map->max_rules) 
        dprintk(" bad ruleno %d\\n", ruleno);
        return 0;
    

    rule = map->rules[ruleno];
    result_len = 0;
    w = a;
    o = b;

    for (step = 0; step < rule->len; step++) 
        int firstn = 0;
        struct crush_rule_step *curstep = &rule->steps[step];

        switch (curstep->op) 
        case CRUSH_RULE_TAKE:
            if ((curstep->arg1 >= 0 &&
                 curstep->arg1 < map->max_devices) ||
                (-1-curstep->arg1 >= 0 &&
                 -1-curstep->arg1 < map->max_buckets &&
                 map->buckets[-1-curstep->arg1])) 
                w[0] = curstep->arg1;
                wsize = 1;
             else 
                dprintk(" bad take value %d\\n", curstep->arg1);
            
            break;

        case CRUSH_RULE_SET_CHOOSE_TRIES:
            if (curstep->arg1 > 0)
                choose_tries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
            if (curstep->arg1 > 0)
                choose_leaf_tries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
            if (curstep->arg1 >= 0)
                choose_local_retries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
            if (curstep->arg1 >= 0)
                choose_local_fallback_retries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
            if (curstep->arg1 >= 0)
                vary_r = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
            if (curstep->arg1 >= 0)
                stable = curstep->arg1;
            break;

        case CRUSH_RULE_CHOOSELEAF_FIRSTN:
        case CRUSH_RULE_CHOOSE_FIRSTN:
            firstn = 1;
            /* fall through */
        case CRUSH_RULE_CHOOSELEAF_INDEP:
        case CRUSH_RULE_CHOOSE_INDEP:
            if (wsize == 0)
                break;

            recurse_to_leaf =
                curstep->op ==
                 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
                curstep->op ==
                CRUSH_RULE_CHOOSELEAF_INDEP;

            /* reset output */
            osize = 0;

            for (i = 0; i < wsize; i++) 
                int bno;
                /*
                 * see CRUSH_N, CRUSH_N_MINUS macros.
                 * basically, numrep <= 0 means relative to
                 * the provided result_max
                 */
                numrep = curstep->arg1;
                if (numrep <= 0) 
                    numrep += result_max;
                    if (numrep <= 0)
                        continue;
                
                j = 0;
                /* make sure bucket id is valid */
                bno = -1 - w[i];
                if (bno < 0 || bno >= map->max_buckets) 
                    // w[i] is probably CRUSH_ITEM_NONE
                    dprintk("  bad w[i] %d\\n", w[i]);
                    continue;
                
                if (firstn) 
                    int recurse_tries;
                    if (choose_leaf_tries)
                        recurse_tries =
                            choose_leaf_tries;
                    else if (map->chooseleaf_descend_once)
                        recurse_tries = 1;
                    else
                        recurse_tries = choose_tries;
                    osize += crush_choose_firstn(
                        map,
                        map->buckets[bno],
                        weight, weight_max,
                        x, numrep,
                        curstep->arg2,
                        o+osize, j,
                        result_max-osize,
                        choose_tries,
                        recurse_tries,
                        choose_local_retries,
                        choose_local_fallback_retries,
                        recurse_to_leaf,
                        vary_r,
                        stable,
                        c+osize,
                        0);
                 else 
                    out_size = ((numrep < (result_max-osize)) ?
                            numrep : (result_max-osize));
                    crush_choose_indep(
                        map,
                        map->buckets[bno],
                        weight, weight_max,
                        x, out_size, numrep,
                        curstep->arg2,
                        o+osize, j,
                        choose_tries,
                        choose_leaf_tries ?
                           choose_leaf_tries : 1,
                        recurse_to_leaf,
                        c+osize,
                        0);
                    osize += out_size;
                
            

            if (recurse_to_leaf)
                /* copy final _leaf_ values to output set */
                memcpy(o, c, osize*sizeof(*o));

            /* swap o and w arrays */
            tmp = o;
            o = w;
            w = tmp;
            wsize = osize;
            break;


        case CRUSH_RULE_EMIT:
            for (i = 0; i < wsize && result_len < result_max; i++) 
                result[result_len] = w[i];
                result_len++;
            
            wsize = 0;
            break;

        default:
            dprintk(" unknown op %d at step %d\\n",
                curstep->op, step);
            break;
        
    
    return result_len;

crush_choose_firstn - choose numrep distinct items of given type

/**
 * crush_choose_firstn - choose numrep distinct items of given type
 * @map: the crush_map
 * @bucket: the bucket we are choose an item from
 * @x: crush input value
 * @numrep: the number of items to choose
 * @type: the type of item to choose
 * @out: pointer to output vector
 * @outpos: our position in that vector
 * @out_size: size of the out vector
 * @tries: number of attempts to make
 * @recurse_tries: number of attempts to have recursive chooseleaf make
 * @local_retries: localized retries
 * @local_fallback_retries: localized fallback retries
 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
 * @stable: stable mode starts rep=0 in the recursive call for all replicas
 * @vary_r: pass r to recursive calls
 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 * @parent_r: r value passed from the parent
 */
static int crush_choose_firstn(const struct crush_map *map,
                   struct crush_bucket *bucket,
                   const __u32 *weight, int weight_max,
                   int x, int numrep, int type,
                   int *out, int outpos,
                   int out_size,
                   unsigned int tries,
                   unsigned int recurse_tries,
                   unsigned int local_retries,
                   unsigned int local_fallback_retries,
                   int recurse_to_leaf,
                   unsigned int vary_r,
                   unsigned int stable,
                   int *out2,
                   int parent_r)

    int rep;
    unsigned int ftotal, flocal;
    int retry_descent, retry_bucket, skip_rep;
    struct crush_bucket *in = bucket;
    int r;
    int i;
    int item = 0;
    int itemtype;
    int collide, reject;
    int count = out_size;

    dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \\
recurse_tries %d local_retries %d local_fallback_retries %d \\
parent_r %d stable %d\\n",
        recurse_to_leaf ? "_LEAF" : "",
        bucket->id, x, outpos, numrep,
        tries, recurse_tries, local_retries, local_fallback_retries,
        parent_r, stable);

    for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) 
        /* keep trying until we get a non-out, non-colliding item */
        ftotal = 0;
        skip_rep = 0;
        do 
            retry_descent = 0;
            in = bucket;               /* initial bucket */

            /* choose through intervening buckets */
            flocal = 0;
            do 
                collide = 0;
                retry_bucket = 0;
                r = rep + parent_r;
                /* r' = r + f_total */
                r += ftotal;

                /* bucket choose */
                if (in->size == 0) 
                    reject = 1;
                    goto reject;
                
                if (local_fallback_retries > 0 &&
                    flocal >= (in->size>>1) &&
                    flocal > local_fallback_retries)
                    item = bucket_perm_choose(in, x, r);
                else
                    item = crush_bucket_choose(in, x, r);
                if (item >= map->max_devices) 
                    dprintk("   bad item %d\\n", item);
                    skip_rep = 1;
                    break;
                

                /* desired type? */
                if (item < 0)
                    itemtype = map->buckets[-1-item]->type;
                else
                    itemtype = 0;
                dprintk("  item %d type %d\\n", item, itemtype);

                /* keep going? */
                if (itemtype != type) 
                    if (item >= 0 ||
                        (-1-item) >= map->max_buckets) 
                        dprintk("   bad item type %d\\n", type);
                        skip_rep = 1;
                        break;
                    
                    in = map->buckets[-1-item];
                    retry_bucket = 1;
                    continue;
                

                /* collision? */
                for (i = 0; i < outpos; i++) 
                    if (out[i] == item) 
                        collide = 1;
                        break;
                    
                

                reject = 0;
                if (!collide && recurse_to_leaf) 
                    if (item < 0) 
                        int sub_r;
                        if (vary_r)
                            sub_r = r >> (vary_r-1);
                        else
                            sub_r = 0;
                        if (crush_choose_firstn(map,
                             map->buckets[-1-item],
                             weight, weight_max,
                             x, stable ? 1 : outpos+1, 0,
                             out2, outpos, count,
                             recurse_tries, 0,
                             local_retries,
                             local_fallback_retries,
                             0,
                             vary_r,
                             stable,
                             NULL,
                             sub_r) <= outpos)
                            /* didn't get leaf */
                            reject = 1;
                     else 
                        /* we already have a leaf! */
                        out2[outpos] = item;
                    
                

                if (!reject) 
                    /* out? */
                    if (itemtype == 0)
                        reject = is_out(map, weight,
                                weight_max,
                                item, x);
                    else
                        reject = 0;
                

reject:
                if (reject || collide) 
                    ftotal++;
                    flocal++;

                    if (collide && flocal <= local_retries)
                        /* retry locally a few times */
                        retry_bucket = 1;
                    else if (local_fallback_retries > 0 &&
                         flocal <= in->size + local_fallback_retries)
                        /* exhaustive bucket search */
                        retry_bucket = 1;
                    else if (ftotal < tries)
                        /* then retry descent */
                        retry_descent = 1;
                    else
                        /* else give up */
                        skip_rep = 1;
                    dprintk("  reject %d  collide %d  "
                        "ftotal %u  flocal %u\\n",
                        reject, collide, ftotal,
                        flocal);
                
             while (retry_bucket);
         while (retry_descent);

        if (skip_rep) 
            dprintk("skip rep\\n");
            continue;
        

        dprintk("CHOOSE got %d\\n", item);
        out[outpos] = item;
        outpos++;
        count--;
#ifndef __KERNEL__
        if (map->choose_tries && ftotal <= map->choose_total_tries)
            map->choose_tries[ftotal]++;
#endif
    

    dprintk("CHOOSE returns %d\\n", outpos);
    return outpos;

crush_bucket_choose

static int crush_bucket_choose(struct crush_bucket *in, int x, int r)

    dprintk(" crush_bucket_choose %d x=%d r=%d\\n", in->id, x, r);
    BUG_ON(in->size == 0);
    switch (in->alg) 
    case CRUSH_BUCKET_UNIFORM:
        return bucket_uniform_choose((struct crush_bucket_uniform *)in,
                      x, r);
    case CRUSH_BUCKET_LIST:
        return bucket_list_choose((struct crush_bucket_list *)in,
                      x, r);
    case CRUSH_BUCKET_TREE:
        return bucket_tree_choose((struct crush_bucket_tree *)in,
                      x, r);
    case CRUSH_BUCKET_STRAW:
        return bucket_straw_choose((struct crush_bucket_straw *)in,
                       x, r);
    case CRUSH_BUCKET_STRAW2:
        return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
                        x, r);
    default:
        dprintk("unknown bucket %d alg %d\\n", in->id, in->alg);
        return in->items[0];
    

bucket_straw_choose

static int bucket_straw_choose(struct crush_bucket_straw *bucket,
                   int x, int r)
                   
    __u32 i;
    int high = 0;
    __u64 high_draw = 0;
    __u64 draw;

    for (i = 0; i < bucket->h.size; i++) 
        draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
        draw &= 0xffff;
        draw *= bucket->straws[i];
        if (i == 0 || draw > high_draw) 
            high = i;
            high_draw = draw;
        
    
    return bucket->h.items[high];

以上是关于crush 核心函数 crush_do_rule的主要内容,如果未能解决你的问题,请参考以下文章

一致性hash与CRUSH算法总结

crash 和 crush 有何区别?

ceph Luminous crush device class(crush 设备分类)

CEPH CRUSH 算法源码分析 原文CEPH CRUSH algorithm source code analysis

Ceph Crush算法详解

723. Candy Crush