哈希表之开地址法解决冲突

Posted NK_test

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了哈希表之开地址法解决冲突相关的知识,希望对你有一定的参考价值。

在上一篇博文中,我们讲述了使用链地址法解决冲突的方法。这里我们介绍另一种方式:开地址法解决冲突。

基本思想:当关键码key的哈希地址H0 = hash(key)出现冲突时,以H0为基础,产生另一个哈希地址H1;如果H1仍然冲突,再以H0为基础,产生另一个哈希地址H2;……;直到找出一个不冲突的哈希地址Hi,将相应元素存入其中。根据增量序列的取值方式不同,相应的再散列方式也不同,主要有以下四种:


线性探测再散列

二次探测再散列

伪随机探测再散列

双散列法


(一)线性探测再散列


理解起来很简单:如果哈希函数映射到的位置已经有数据,就依次向后顺序查找(到达表尾后绕回表头),直到找到一个还没有数据的位置并将元素放入,或者发现表已经满了为止。注意:装填因子(表中元素个数/表长)不超过1是基本要求。

堆积现象

定义:用线性探测法处理冲突时,当表中 i、i+1、i+2 三个位置上都已有数据时,下一个记录的散列地址无论是 i、i+1、i+2 还是 i+3,都会被要求填入 i+3 的位置,即多个第一个散列地址不同的记录争夺同一个后继散列地址。

若散列函数不好、或装填因子a 过大,都会使堆积现象加剧。

我们将链地址法的代码稍加改动:用 status 保存每个位置的状态,取值有 EMPTY、DELETED、ACTIVE。删除时只做逻辑删除,即将状态置为 DELETED;插入新的 key 时,只要不是 ACTIVE 的位置都可以放入,如果该位置是 DELETED,需要先将原来元素占用的内存 free 掉,再插入。

common.h

#ifndef _COMMON_H_
#define _COMMON_H_

#include <unistd.h>
#include <sys/types.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


/*
 * Report the failure described by `m` through perror() and terminate the
 * process with EXIT_FAILURE.  The do { } while (0) wrapper makes the macro
 * expand to a single statement; every line but the last needs a trailing
 * backslash so the whole body belongs to the macro.
 */
#define ERR_EXIT(m) \
  do \
  { \
    perror(m); \
    exit(EXIT_FAILURE); \
  } \
  while (0)

#endif
hash.h

#ifndef _HASH_H_
#define _HASH_H_

/* Opaque hash-table handle; struct hash is defined in hash.c. */
typedef struct hash hash_t;
/* Hash callback: receives the bucket count and a pointer to the key and
 * returns the bucket index for that key (must be less than the count). */
typedef unsigned int (*hashfunc_t)(unsigned int, void *);

/* Create a table with `buckets` slots that hashes keys with `hash_func`. */
hash_t *hash_alloc(unsigned int buckets, hashfunc_t hash_func);
/* Free the keys, values and slot array owned by the table. */
void hash_free(hash_t *hash);
/* Return the stored value for `key` (compared over `key_size` bytes),
 * or NULL when no live entry matches. */
void *hash_lookup_entry(hash_t *hash, void *key, unsigned int key_size);
/* Insert private copies of `key` (`key_size` bytes) and `value`
 * (`value_size` bytes); a key that is already present is rejected. */
void hash_add_entry(hash_t *hash, void *key, unsigned int key_size,
                    void *value, unsigned int value_size);
/* Logically delete the entry for `key`; does nothing if it is absent. */
void hash_free_entry(hash_t *hash, void *key, unsigned int key_size);


#endif /* _HASH_H_ */

hash.c

#include "hash.h"
#include "common.h"
#include <assert.h>


/* State of one slot in the open-addressed table. */
typedef enum entry_status
{
    EMPTY,      /* value 0: a zero-filled slot array starts out all EMPTY */
    ACTIVE,     /* slot holds a live key/value pair */
    DELETED     /* slot was logically removed; its key/value memory is still allocated */
} entry_status_t;

/* One slot of the table: a state flag plus heap-allocated copies of key and value. */
typedef struct hash_node
{
    enum entry_status status;   /* EMPTY, ACTIVE or DELETED */
    void *key;                  /* malloc'ed copy of the key; NULL while the slot is EMPTY */
    void *value;                /* malloc'ed copy of the value; NULL while the slot is EMPTY */
} hash_node_t;


/* The table itself: a fixed-size array of slots probed linearly. */
struct hash
{
    unsigned int buckets;       /* number of slots in nodes[] */
    hashfunc_t hash_func;       /* maps a key to its home slot */
    hash_node_t *nodes;         /* slot array of length buckets */
};

/* Map `key` to its home slot through the table's hash function. */
unsigned int hash_get_bucket(hash_t *hash, void *key);
/* Find the ACTIVE slot holding `key`; NULL when there is none. */
hash_node_t *hash_get_node_by_key(hash_t *hash, void *key, unsigned int key_size);


/*
 * Create an empty linear-probing hash table.
 *
 * buckets   - number of slots; must be non-zero.
 * hash_func - callback that maps (bucket count, key) to a slot index.
 *
 * Returns the new table, or NULL if `buckets` is 0, `hash_func` is NULL or
 * memory could not be allocated.  Release the table with hash_free().
 */
hash_t *hash_alloc(unsigned int buckets, hashfunc_t hash_func)
{
    if (buckets == 0 || hash_func == NULL)
    {
        return NULL;
    }

    hash_t *hash = malloc(sizeof *hash);
    if (hash == NULL)
    {
        return NULL;
    }

    hash->buckets = buckets;
    hash->hash_func = hash_func;

    /* calloc zero-fills the slots (status EMPTY, key/value NULL) and
     * rejects a buckets * sizeof overflow instead of truncating it. */
    hash->nodes = calloc(buckets, sizeof *hash->nodes);
    if (hash->nodes == NULL)
    {
        free(hash);
        return NULL;
    }

    printf("The hash table has allocate.\n");
    return hash;
}

/*
 * Destroy a table created by hash_alloc(): free every stored key and value
 * (including those in logically deleted slots), the slot array and the
 * table object itself.  Passing NULL is a no-op.
 */
void hash_free(hash_t *hash)
{
    if (hash == NULL)
    {
        return;
    }

    for (unsigned int i = 0; i < hash->buckets; i++)
    {
        /* DELETED slots still own their key/value memory. */
        if (hash->nodes[i].status != EMPTY)
        {
            free(hash->nodes[i].key);
            free(hash->nodes[i].value);
        }
    }

    free(hash->nodes);
    free(hash);

    printf("The hash table has free.\n");
}

/*
 * Look up `key` (compared over `key_size` bytes) and return the value
 * stored with it, or NULL when no live entry matches.
 */
void *hash_lookup_entry(hash_t *hash, void *key, unsigned int key_size)
{
    hash_node_t *found = hash_get_node_by_key(hash, key, key_size);

    return (found != NULL) ? found->value : NULL;
}

/*
 * Insert a copy of `key` / `value` into the table using linear probing.
 *
 * key, key_size     - key bytes to copy and later compare against.
 * value, value_size - value bytes to copy.
 *
 * Nothing is inserted (and a message is written to stderr) when the key is
 * already present, when every slot is ACTIVE, or when memory runs out.
 */
void hash_add_entry(hash_t *hash, void *key, unsigned int key_size,
                    void *value, unsigned int value_size)
{
    if (hash_get_node_by_key(hash, key, key_size) != NULL)
    {
        fprintf(stderr, "duplicate hash key\n");
        return;
    }

    unsigned int bucket = hash_get_bucket(hash, key);
    unsigned int i = bucket;
    /* Walk forward from the home slot until a slot that is not live. */
    while (hash->nodes[i].status == ACTIVE)
    {
        i = (i + 1) % hash->buckets;
        if (i == bucket)
        {
            /* Came full circle: every slot is ACTIVE. */
            fprintf(stderr, "hash table is full\n");
            return;
        }
    }

    /* Allocate both copies before touching the slot so a failure leaves
     * the table unchanged.  Request at least one byte so a zero-length
     * key/value still yields a valid, non-NULL pointer. */
    void *key_copy = malloc(key_size ? key_size : 1);
    void *value_copy = malloc(value_size ? value_size : 1);
    if (key_copy == NULL || value_copy == NULL)
    {
        free(key_copy);
        free(value_copy);
        fprintf(stderr, "hash_add_entry: out of memory\n");
        return;
    }
    memcpy(key_copy, key, key_size);
    memcpy(value_copy, value, value_size);

    hash_node_t *node = &hash->nodes[i];
    /* A DELETED slot still owns its old buffers; an EMPTY one holds NULL. */
    free(node->key);
    free(node->value);
    node->key = key_copy;
    node->value = value_copy;
    node->status = ACTIVE;
}

/*
 * Logically remove the entry for `key`: the slot is only flagged DELETED,
 * its key/value memory stays allocated.  Does nothing if the key is absent.
 */
void hash_free_entry(hash_t *hash, void *key, unsigned int key_size)
{
    hash_node_t *victim = hash_get_node_by_key(hash, key, key_size);

    if (victim != NULL)
    {
        victim->status = DELETED;
    }
}

/*
 * Return the home slot of `key` as computed by the table's hash function.
 * Terminates the process if the table has no slots or the callback returns
 * an index outside [0, buckets).
 */
unsigned int hash_get_bucket(hash_t *hash, void *key)
{
    /* Check for an empty table first: hash callbacks typically reduce
     * modulo the bucket count and would divide by zero. */
    if (hash->buckets == 0)
    {
        fprintf(stderr, "bad bucket lookup\n");
        exit(EXIT_FAILURE);
    }

    unsigned int bucket = hash->hash_func(hash->buckets, key);
    if (bucket >= hash->buckets)
    {
        fprintf(stderr, "bad bucket lookup\n");
        exit(EXIT_FAILURE);
    }

    return bucket;
}

/*
 * Locate the live slot that stores `key`, probing linearly from the key's
 * home slot.  The search ends at the first EMPTY slot, at the first slot
 * whose stored key matches over `key_size` bytes, or after one full lap.
 * Returns the slot only if it is ACTIVE; otherwise NULL.
 */
hash_node_t *hash_get_node_by_key(hash_t *hash, void *key, unsigned int key_size)
{
    const unsigned int start = hash_get_bucket(hash, key);
    unsigned int pos = start;

    do
    {
        hash_node_t *node = &hash->nodes[pos];

        if (node->status == EMPTY)
        {
            /* An untouched slot ends the probe chain: key is not stored. */
            return NULL;
        }

        if (memcmp(key, node->key, key_size) == 0)
        {
            /* A matching key only counts while the slot is still live. */
            return (node->status == ACTIVE) ? node : NULL;
        }

        pos = (pos + 1) % hash->buckets;
    }
    while (pos != start);

    /* Probed every slot without finding the key or an EMPTY slot. */
    return NULL;
}
main.c(测试代码)

#include "hash.h"
#include "common.h"

/* Student record whose student number is a character array.
 * NOTE(review): presumably meant to be keyed through hash_str; main() in
 * this listing only uses stu2_t - confirm. */
typedef struct stu
{
    char sno[5];    /* student number as characters */
    char name[32];  /* student name */
    int age;
} stu_t;

/* Student record with an integer student number; main() stores these in
 * the table using the address of `sno` as the key. */
typedef struct stu2
{
    int sno;        /* student number, used as the hash key */
    char name[32];  /* student name */
    int age;
} stu2_t;


/*
 * Hash callback for NUL-terminated string keys: folds each character into
 * an accumulator (acc = ch + 4 * acc) and reduces the result modulo
 * `buckets`.
 */
unsigned int hash_str(unsigned int buckets, void *key)
{
    unsigned int acc = 0;

    for (const char *p = key; *p != '\0'; ++p)
    {
        acc = *p + 4 * acc;
    }

    return acc % buckets;
}

/*
 * Hash callback for int keys: reads the int that `key` points to and
 * reduces it modulo `buckets` (the int is converted to unsigned first).
 */
unsigned int hash_int(unsigned int buckets, void *key)
{
    const int student_no = *(int *)key;

    return student_no % buckets;
}

int main(void)
{

    stu2_t stu_arr[] =
    {
        { 1234, "AAAA", 20 },
        { 4568, "BBBB", 23 },
        { 6729, "AAAA", 19 }
    };

    hash_t *hash = hash_alloc(256, hash_int);

    int size = sizeof(stu_arr) / sizeof(stu_arr[0]);
    int i;
    for (i = 0; i < size; i++)
    {
        hash_add_entry(hash, &(stu_arr[i].sno), sizeof(stu_arr[i].sno),
                       &stu_arr[i], sizeof(stu_arr[i]));
    }

    int sno = 4568;
    stu2_t *s = (stu2_t *)hash_lookup_entry(hash, &sno, sizeof(sno));
    if (s)
    {
        printf("%d %s %d
", s->sno, s->name, s->age);
    }
    else
    {
        printf("not found
");
    }

    sno = 1234;
    hash_free_entry(hash, &sno, sizeof(sno));
    s = (stu2_t *)hash_lookup_entry(hash, &sno, sizeof(sno));
    if (s)
    {
        printf("%d %s %d
", s->sno, s->name, s->age);
    }
    else
    {
        printf("not found
");
    }

    hash_free(hash);

    return 0;
}
输出:

The hash table has allocate.
4568 BBBB 23
not found
The hash table has free.
(二)二次探测再散列

为改善“堆积”问题,减少为完成搜索所需的平均探查次数,可使用二次探测法。

可以证明:当表的长度 buckets 为质数并且表的装填因子不超过0.5的时候,新的表项一定可以插入,而且任意一个位置不会被探查两次。

具体代码实现与前面讲过的线性探测再散列差不多,只是探测的方法不同,使用的数据结构也略有差别。此外还实现了开裂处理:如果装填因子 a > 1/2,则将表的长度扩充一倍,再取比它大的最小质数作为新表长,建立新表,并将旧表内容拷贝过去。因此 hash_t 结构体需要再保存一个 size 成员;同样的原因,为了将旧表内容拷贝过去,hash_node_t 结构体需要再保存 *key 和 *value 的 size。

hash.c

#include "hash.h"
#include "common.h"
#include <assert.h>


/* State of one slot in the open-addressed table. */
typedef enum entry_status
{
    EMPTY,      /* value 0: a zero-filled slot array starts out all EMPTY */
    ACTIVE,     /* slot holds a live key/value pair */
    DELETED     /* slot was logically removed; its key/value memory is still allocated */
} entry_status_t;

/* One slot of the table: state flag plus heap copies of key and value and
 * their byte lengths. */
typedef struct hash_node
{
    enum entry_status status;
    void *key;
    unsigned int key_size; // needed when the entry is copied into a new table
    void *value;
    unsigned int value_size; // needed when the entry is copied into a new table
} hash_node_t;


/* The table itself: a slot array probed quadratically, plus an insert
 * counter that drives growth. */
struct hash
{
    unsigned int buckets;
    unsigned int size; // incremented on every insert; once it reaches buckets / 2 hash_add_entry rebuilds the table larger
    hashfunc_t hash_func;
    hash_node_t *nodes;
};

/* Prime search used to choose the bucket count when the table is rebuilt. */
unsigned int next_prime(unsigned int n);
/* Primality test used by next_prime. */
int is_prime(unsigned int n);

/* Map `key` to its home slot through the table's hash function. */
unsigned int hash_get_bucket(hash_t *hash, void *key);
/* Find the ACTIVE slot holding `key`; NULL when there is none. */
hash_node_t *hash_get_node_by_key(hash_t *hash, void *key, unsigned int key_size);


/*
 * Create an empty quadratic-probing hash table.
 *
 * buckets   - initial number of slots; must be non-zero.
 * hash_func - callback that maps (bucket count, key) to a slot index.
 *
 * Returns the new table with its insert counter at 0, or NULL if `buckets`
 * is 0, `hash_func` is NULL or memory could not be allocated.
 */
hash_t *hash_alloc(unsigned int buckets, hashfunc_t hash_func)
{
    if (buckets == 0 || hash_func == NULL)
    {
        return NULL;
    }

    hash_t *hash = malloc(sizeof *hash);
    if (hash == NULL)
    {
        return NULL;
    }

    hash->buckets = buckets;
    /* hash_add_entry increments and tests this counter, so it must start
     * from a defined value. */
    hash->size = 0;
    hash->hash_func = hash_func;

    /* calloc zero-fills the slots (status EMPTY, key/value NULL) and
     * rejects a buckets * sizeof overflow instead of truncating it. */
    hash->nodes = calloc(buckets, sizeof *hash->nodes);
    if (hash->nodes == NULL)
    {
        free(hash);
        return NULL;
    }

    printf("The hash table has allocate.\n");
    return hash;
}

/*
 * Destroy a table created by hash_alloc(): free every stored key and value
 * (including those in logically deleted slots), the slot array and the
 * table object itself.  Passing NULL is a no-op.
 */
void hash_free(hash_t *hash)
{
    if (hash == NULL)
    {
        return;
    }

    for (unsigned int i = 0; i < hash->buckets; i++)
    {
        /* DELETED slots still own their key/value memory. */
        if (hash->nodes[i].status != EMPTY)
        {
            free(hash->nodes[i].key);
            free(hash->nodes[i].value);
        }
    }

    free(hash->nodes);
    /* The handle was malloc'ed by hash_alloc and must be released too. */
    free(hash);

    printf("The hash table has free.\n");
}

/*
 * Look up `key` (compared over `key_size` bytes) and return the value
 * stored with it, or NULL when no live entry matches.
 */
void *hash_lookup_entry(hash_t *hash, void *key, unsigned int key_size)
{
    hash_node_t *found = hash_get_node_by_key(hash, key, key_size);

    return (found != NULL) ? found->value : NULL;
}

/*
 * Return the first slot on `key`'s probe sequence that is not ACTIVE, or
 * hash->buckets when the whole sequence is occupied.
 *
 * The sequence is home, home+1, home-1, home+4, home-4, ... (home +/- k*k),
 * all taken modulo the bucket count.  The arithmetic is done in
 * unsigned long long so neither k*k nor the backward step can wrap.
 */
static unsigned int hash_find_insert_slot(hash_t *hash, void *key)
{
    const unsigned long long buckets = hash->buckets;
    const unsigned long long home = hash_get_bucket(hash, key);
    unsigned long long slot = home;
    unsigned long long k = 1;
    int forward = 1;

    while (hash->nodes[slot].status == ACTIVE)
    {
        /* k*k mod buckets repeats once k exceeds buckets: nothing new
         * can be reached, so give up instead of looping forever. */
        if (k > buckets)
        {
            return hash->buckets;
        }

        const unsigned long long offset = (k * k) % buckets;
        if (forward)
        {
            slot = (home + offset) % buckets;
        }
        else
        {
            slot = (home + buckets - offset) % buckets;
            k++;
        }
        forward = !forward;
    }

    return (unsigned int)slot;
}

/*
 * Rebuild the table with next_prime(2 * buckets) slots.  Live entries are
 * moved (their key/value buffers change owner, nothing is copied);
 * logically deleted entries are freed.  Returns 1 on success and 0 when
 * the table was left untouched (size overflow or out of memory).
 */
static int hash_grow(hash_t *hash)
{
    const unsigned int old_buckets = hash->buckets;
    hash_node_t *old_nodes = hash->nodes;

    if (old_buckets > (unsigned int)-1 / 2)
    {
        return 0;               /* 2 * old_buckets would wrap */
    }

    const unsigned int new_buckets = next_prime(2 * old_buckets);
    if (new_buckets <= old_buckets)
    {
        return 0;               /* prime search wrapped around */
    }

    hash_node_t *new_nodes = calloc(new_buckets, sizeof *new_nodes);
    if (new_nodes == NULL)
    {
        return 0;
    }

    hash->buckets = new_buckets;
    hash->nodes = new_nodes;
    hash->size = 0;             /* recounted while moving live entries */

    for (unsigned int i = 0; i < old_buckets; i++)
    {
        hash_node_t *old = &old_nodes[i];

        if (old->status == ACTIVE)
        {
            unsigned int slot = hash_find_insert_slot(hash, old->key);
            if (slot < new_buckets)
            {
                new_nodes[slot] = *old;
                hash->size++;
                continue;
            }
            /* Not expected: the new table is prime-sized and less than
             * half full, so a free slot should always be reachable. */
            fprintf(stderr, "hash table rebuild lost an entry\n");
        }

        /* DELETED (or unplaceable) entries: release their buffers.
         * EMPTY slots hold NULL pointers, for which free() is a no-op. */
        free(old->key);
        free(old->value);
    }

    free(old_nodes);
    return 1;
}

/*
 * Insert a copy of `key` / `value` into the table using quadratic probing.
 *
 * key, key_size     - key bytes to copy; the size is stored with the entry.
 * value, value_size - value bytes to copy; the size is stored too.
 *
 * Nothing is inserted (and a message is written to stderr) when the key is
 * already present, when no free slot is reachable, or when memory runs out.
 * After a successful insert the table is rebuilt larger whenever the
 * insert counter has reached half the bucket count, keeping the load
 * factor low enough for quadratic probing to find a free slot.
 */
void hash_add_entry(hash_t *hash, void *key, unsigned int key_size,
                    void *value, unsigned int value_size)
{
    if (hash_get_node_by_key(hash, key, key_size) != NULL)
    {
        fprintf(stderr, "duplicate hash key\n");
        return;
    }

    const unsigned int slot = hash_find_insert_slot(hash, key);
    if (slot >= hash->buckets)
    {
        fprintf(stderr, "hash table is full\n");
        return;
    }

    /* Allocate both copies before touching the slot so a failure leaves
     * the table unchanged.  Request at least one byte so a zero-length
     * key/value still yields a valid, non-NULL pointer. */
    void *key_copy = malloc(key_size ? key_size : 1);
    void *value_copy = malloc(value_size ? value_size : 1);
    if (key_copy == NULL || value_copy == NULL)
    {
        free(key_copy);
        free(value_copy);
        fprintf(stderr, "hash_add_entry: out of memory\n");
        return;
    }
    memcpy(key_copy, key, key_size);
    memcpy(value_copy, value, value_size);

    hash_node_t *node = &hash->nodes[slot];
    /* A DELETED slot still owns its old buffers; an EMPTY one holds NULL. */
    free(node->key);
    free(node->value);
    node->key = key_copy;
    node->key_size = key_size;
    node->value = value_copy;
    node->value_size = value_size;
    node->status = ACTIVE;

    hash->size++;

    /* Grow until the counter is below half the bucket count again; stop
     * if growing is impossible so the insert itself still succeeds. */
    while (hash->size >= hash->buckets / 2)
    {
        if (!hash_grow(hash))
        {
            break;
        }
    }
}

/*
 * Logically remove the entry for `key`: the slot is only flagged DELETED,
 * its key/value memory stays allocated.  Does nothing if the key is absent.
 */
void hash_free_entry(hash_t *hash, void *key, unsigned int key_size)
{
    hash_node_t *victim = hash_get_node_by_key(hash, key, key_size);

    if (victim != NULL)
    {
        victim->status = DELETED;
    }
}

/*
 * Return the home slot of `key` as computed by the table's hash function.
 * Terminates the process if the table has no slots or the callback returns
 * an index outside [0, buckets).
 */
unsigned int hash_get_bucket(hash_t *hash, void *key)
{
    /* Check for an empty table first: hash callbacks typically reduce
     * modulo the bucket count and would divide by zero. */
    if (hash->buckets == 0)
    {
        fprintf(stderr, "bad bucket lookup\n");
        exit(EXIT_FAILURE);
    }

    unsigned int bucket = hash->hash_func(hash->buckets, key);
    if (bucket >= hash->buckets)
    {
        fprintf(stderr, "bad bucket lookup\n");
        exit(EXIT_FAILURE);
    }

    return bucket;
}

/*
 * Locate the live slot that stores `key`, following the quadratic probe
 * sequence home, home+1, home-1, home+4, home-4, ... (modulo the bucket
 * count).
 *
 * The search ends at the first EMPTY slot, at the first slot whose stored
 * key has the same length and bytes as `key`, or once the probe sequence
 * starts repeating.  Returns the slot only if it is ACTIVE; otherwise NULL
 * (key absent or logically deleted).
 */
hash_node_t *hash_get_node_by_key(hash_t *hash, void *key, unsigned int key_size)
{
    /* unsigned long long keeps k*k and the backward step from wrapping;
     * the original unsigned subtraction could index far outside nodes[]. */
    const unsigned long long buckets = hash->buckets;
    const unsigned long long home = hash_get_bucket(hash, key);
    unsigned long long pos = home;
    unsigned long long k = 1;
    int forward = 1;

    for (;;)
    {
        hash_node_t *node = &hash->nodes[pos];

        if (node->status == EMPTY)
        {
            /* An untouched slot ends the probe chain: key is not stored. */
            return NULL;
        }

        /* Compare lengths first so memcmp never reads past a shorter
         * stored key. */
        if (node->key_size == key_size
                && memcmp(key, node->key, key_size) == 0)
        {
            return (node->status == ACTIVE) ? node : NULL;
        }

        /* k*k mod buckets repeats once k exceeds buckets, so no new slot
         * can be reached: stop instead of looping forever. */
        if (k > buckets)
        {
            return NULL;
        }

        const unsigned long long offset = (k * k) % buckets;
        if (forward)
        {
            pos = (home + offset) % buckets;
        }
        else
        {
            pos = (home + buckets - offset) % buckets;
            k++;
        }
        forward = !forward;
    }
}

/*
 * Starting from `n` (bumped to n + 1 when n is even), step through odd
 * candidates and return the first one that is_prime() accepts.
 */
unsigned int next_prime(unsigned int n)
{
    unsigned int candidate = (n % 2 == 0) ? n + 1 : n;

    while (!is_prime(candidate))
    {
        candidate += 2;
    }

    return candidate;
}

/*
 * Return 1 if `n` is a prime number and 0 otherwise.
 *
 * 0 and 1 are not prime; 2 is the only even prime.  Odd candidates are
 * tested by trial division with odd divisors up to sqrt(n); the bound is
 * written as i <= n / i so it cannot overflow the way i * i can.
 */
int is_prime(unsigned int n)
{
    if (n < 2)
    {
        return 0;
    }
    if (n % 2 == 0)
    {
        return n == 2;
    }

    for (unsigned int i = 3; i <= n / i; i += 2)
    {
        if (n % i == 0)
        {
            return 0;
        }
    }

    return 1;
}
(三)伪随机探测再散列


(四)双散列法


下面是一定数据下各种方式的性能分析:


我们可以得出一般性结论:

处理冲突的方法最好采用链地址法,哈希函数使用除留余数法(其中哈希函数最好与关键码的特征关联性强一些)性能最佳。

以上是关于哈希表之开地址法解决冲突的主要内容,如果未能解决你的问题,请参考以下文章

哈希表之开散列表——key为字符串.c

哈希表之二哈希函数的构造

哈希表(散列表)冲突解决方法

哈希表原理及如何避免键值冲突法?

Java解决Hash(散列)冲突的四种方法--开放地址法(线性探测,二次探测,伪随机探测)链地址法再哈希建立公共溢出区

哈希表之拉链法