In this post, we’ll discover the power of hash tables in solving LeetCode’s Two sum problem.
Aim
- Implements hashtables to solve Two sum.
- Understand how the hashtables are implemented in Linux.
Hash struct introduction
source: 2022q1 第 1 週測驗題
The hash table
is implemented using a non-circular doubly-linked list structure, consisting of (define in include/linux/types.h):
hlist_head
: the head of a linked list and represents a bucket.hlist_node
: a node in the bucket list.
The map_t
- bits: size of
struct hlist_head
array. - ht: the head of the hash table.
The hash_key
- the struct stores keys and values and can be linked by
struct hlist_node
.
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, **pprev;
};
typedef struct {
int bits;
struct hlist_head *ht;
} map_t;
struct hash_key {
int key;
void *data;
struct hlist_node node;
};
Why use **pprev
not *pprev
in hlist_node
?
The hashtable is implemented through the use of two key structures: hlist_head
and hlist_node
. For purpose of saving memory, it doesn’t include pprev
in the head.
There are two scenarios for hlist_node
:
- The previous node is a struct
hlist_head
. - The previous node is a struct
hlist_node
.
Code Explorations: *pprev
vs. **pprev
That’s see what will happen with two different structure definitions:
*pprev
scenario
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, *pprev;
};
struct hlist_head *h;
struct hlist_node *n1;
struct hlist_node *n2;
// case 1
n2->pprev = (struct hlist_node *)h;
// case 2
n2->pprev = n1;
**pprev
scenario
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, **pprev;
};
struct hlist_head *h;
struct hlist_node *n1;
struct hlist_node *n2;
// case 1
n2->pprev = &h->first;
// case 2
n2->pprev = &n1->next;
The Magic of **pprev
What **pprev
accomplishes is direct access to the address of the previous node rather than its value, mitigating the need for intricate type casting. This design choice results in cleaner, more straightforward code.
Beyond simplicity, other advantages emerge, especially during node deletion in hlist. With **pprev
representing the address of the node, the code can seamlessly proceed without the necessity to check whether the value of *pprev is NULL or not.
Check include/linux/list.h
No list corruption checks in __hlist_del
:
static inline void __hlist_del(struct hlist_node *n)
{
struct hlist_node *next = n->next;
struct hlist_node **pprev = n->pprev;
WRITE_ONCE(*pprev, next);
if (next)
WRITE_ONCE(next->pprev, pprev);
}
In __list_del_entry
, line 3
is for list corruption checks.
static inline void __list_del_entry(struct list_head *entry)
{
if (!__list_del_entry_valid(entry))
return;
__list_del(entry->prev, entry->next);
}
Solve two sum using hashtable
map_init
Create and initialize an array of fixed size of struct hlist_head
.
#define MAP_HASH_SIZE(bits) (1 << bits)
map_t *map_init(int bits) {
map_t *map = malloc(sizeof(map_t));
if (!map)
return NULL;
map->bits = bits;
map->ht = malloc(sizeof(struct hlist_head) * MAP_HASH_SIZE(map->bits));
if (map->ht) {
for (int i = 0; i < MAP_HASH_SIZE(map->bits); i++)
(map->ht)[i].first = NULL;
} else {
free(map);
map = NULL;
}
return map;
}
hash function
#define GOLDEN_RATIO_32 0x61C88647
static inline unsigned int hash(unsigned int val, unsigned int bits) {
/* High bits are more random, so use them. */
return (val * GOLDEN_RATIO_32) >> (32 - bits);
}
find_key
Calculate the hash value for the given key
.
Due to the possibility of different key values producing the same hash value, it is necessary to access the hash table at (map->ht)[hash value]
and check for a matching key.
#define container_of(ptr, type, member) \
({ \
void *__mptr = (void *) (ptr); \
((type *) (__mptr - offsetof(type, member))); \
})
static struct hash_key *find_key(map_t *map, int key) {
struct hlist_head *head = &(map->ht)[hash(key, map->bits)];
for (struct hlist_node *p = head->first; p; p = p->next) {
struct hash_key *kn = container_of(p, struct hash_key, node);
if (kn->key == key)
return kn;
}
return NULL;
}
map_get
If a matching key exists, return its bucket. If it doesn’t, return NULL.
void *map_get(map_t *map, int key)
{
struct hash_key *kn = find_key(map, key);
return kn ? kn->data : NULL;
}
map_add
- Initially, check for the presence of a matching key. If found, return.
- Next, locate the head of the corresponding hash list based on the hash value.
n->next = first;
: Place the new noden
before the existingfirst
.if (first) first->pprev = &n->next;
: For cases where the first node exists (indicating at least one node shares the same hash value as the new one), establish a backward link from it ton
.h->first = n;
: Move to thefirst
.n->pprev = &h->first;
: Establish a backward link fromn
to the hash list head.
void map_add(map_t *map, int key, void *data)
{
struct hash_key *kn = find_key(map, key);
if (kn)
return;
kn = malloc(sizeof(struct hash_key));
kn->key = key, kn->data = data;
struct hlist_head *h = &map->ht[hash(key, map->bits)];
struct hlist_node *n = &kn->node, *first = h->first;
n->next = first;
if (first)
first->pprev = &n->next;
h->first = n;
n->pprev = &h->first;
}
map_deinit
Remove hash table.
void map_deinit(map_t *map)
{
if (!map)
return;
for (int i = 0; i < MAP_HASH_SIZE(map->bits); i++) {
struct hlist_head *head = &map->ht[i];
for (struct hlist_node *p = head->first; p;) {
struct hash_key *kn = container_of(p, struct hash_key, node);
struct hlist_node *n = p;
p = p->next;
if (!n->pprev) /* unhashed */
goto bail;
struct hlist_node *next = n->next, **pprev = n->pprev;
*pprev = next;
if (next)
next->pprev = pprev;
n->next = NULL, n->pprev = NULL;
bail:
free(kn->data);
free(kn);
}
}
free(map);
}
two_sum
- Create an hashtable.
- Iterate through the
nums
array to search for the valuetarget - nums[i]
in the hash table. - If the value is not found, insert the
nums[i]
into the hash table.
int *twoSum(int *nums, int numsSize, int target, int *returnSize)
{
map_t *map = map_init(10);
*returnSize = 0;
int *ret = malloc(sizeof(int) * 2);
if (!ret)
goto bail;
for (int i = 0; i < numsSize; i++) {
int *p = map_get(map, target - nums[i]);
if (p) { /* found */
ret[0] = i, ret[1] = *p;
*returnSize = 2;
break;
}
p = malloc(sizeof(int));
*p = i;
map_add(map, nums[i], p);
}
bail:
map_deinit(map);
return ret;
}
Hashtables
- Every bucket in the hashtable is a linked list which will hold all objects that are hashed to the same bucket.
- Hashtable contains all elements in different buckets.
- In case a collision does happen, the elements are chained.
- A good hash function should make sure you get O(1) elements into every bucket.
Hash collision
A hash function transforms the search key into an array index. The ideal case is such that no two search keys hashes to the same array index. However, this is not always the case and is impossible to guarantee for unseen given data.
Perfect hash function
A perfect hash function is defined as an one-to-one function such that each element maps to a unique value in a hastable. It can be created if all the keys are known ahead of time.
Hashtables in Linux
- The hashtable is an array of
struct hlist_head
pointers, where each one points to a different list, and each one of those lists holds all elements that are hashed to the same bucket. - Every element is essentially part of a hlist and the hashtable only holds the head of these lists.