
NSFL Hashes and Stuff

Directional Vector

I was bored the last few days, read a bit about hashes and hash tables and thought it would be fun to rewrite them.
Anyways here is my abomination that may contain code or logic from random websites. Please tell me that I'm retarded (pointing out the issue optional)

main.cpp (nothing interesting here)
C++:
#include <iostream>
#include "hashtable.h"

int main(int argc, char const *argv[])
{
    hst::HashTable<int> map = hst::HashTable<int>(150); // New map type int; size 150
    map.insert("integer", 99);                          // Insert integer
    map.insert("2", 21);
    map.insert("3", 19);
    map.remove("3");                              // Remove by key
    map.resize(200);                              // Resize map (size 200)
    std::cout << map.get("integer") << std::endl; // Get value by key
    std::cout << map.get("2") << std::endl;
    return 0;
}

hashtable.h. The actual interesting bit
C++:
#ifndef HASHTABLE_INCLUDE
#define HASHTABLE_INCLUDE
#include <iostream>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <vector>
#include "prime.h"

namespace hst
{
    template <class T>
    class HashTable
    {
    private:
        T *arr;                        // The content of the map
        std::vector<std::string> keys; // A list of used keys. Used for reallocating and returning every key.
        int r1;                        // Random parameter 1
        int r2;                        // Random parameter 2
        long long int p;               // Prime number larger than arr

        __int128 hash(const std::string &key) // Generate a hash based on the key
        {
            __int128 hash = 0;
            for (int i = 0; i < key.length(); i++)
            {
                hash = (hash << 5) - hash + key[i]; // Hash = hash*31 + key[i] (a lot of collisions don't @ me)
            }
            return hash;
        }

        int compressedHash(__int128 hash)
        {
            return ((r1 * hash + r2) % p) % sizeof(arr); // Compress hash into usable index (same as above)
        }

    public:
        HashTable(int size)
        {
            srand(time(NULL));                                   // Seed
            r1 = rand() % 5000 + 200;                            // Set first random parameter
            r2 = rand() % 5000 + 200;                            // Set second random parameter
            arr = new T[size];                                   // Set array to size
            p = prime::highest(sizeof(arr), sizeof(arr) + 1000); // Get a (more or less) appropriate prime number
        }

        void insert(std::string key, T content)
        {
            arr[compressedHash(hash(key))] = content; // Set content
            keys.push_back(key);                      // Add to used keys
        }

        T get(std::string key)
        {
            return arr[compressedHash(hash(key))]; // Kind of self explaining
        }

        void remove(std::string key)
        {
            arr[compressedHash(hash(key))] = NULL;                // This will give you a warning but fuck it
            std::vector<std::string>::iterator el = keys.begin(); // First element
            for (int i = 0; i < keys.size(); i++)                 //For every element
            {
                if (keys[i] == key)
                {
                    keys.erase(el);
                    return;
                }
                el++; // Next element
            }
        }

        void resize(int size)
        {
            T *contents = new T[keys.size()]; // Temporary array for every value by key
            for (int i = 0; i < keys.size(); i++)
            {
                contents[i] = get(keys[i]);
            }
            arr = new T[size]; // I'm pretty sure this is memory leakage
            for (int i = 0; i < keys.size(); i++)
            {
                arr[compressedHash(hash(keys[i]))] = contents[i]; // Rehash & store every stored element
            }
            delete[] contents; // Delete temp array
        }

        std::vector<std::string> getKeys()
        {
            return keys; // Yeah
        }

        /**
         * Get a pointer to a new array holding every inserted key.
         * Remember to delete.
        */
        std::string *getKeysArray()
        {
            std::string *k = new std::string[keys.size()];
            for (int i = 0; i < keys.size(); i++)
            {
                k[i] = keys[i];
            }

            return k;
        }
    };
}
#endif

prime.h. "Calculates" a high prime number that I'm using in the compressing function.
C++:
#ifndef PRIME_INCLUDE
#define PRIME_INCLUDE

namespace prime
{
    /**
     * I copied this from SO and only edited a few bits.
     * Don't ask why this works.
     * Returns the highest prime number (lower than max & higher than min)
    */
    long long int highest(long long int min, long long int max)
    {
        long long int p = 2; // Fallback so we never return an uninitialized value if no prime is in the range

        for (long long int i = min; i <= max; i++)
        {

            for (long long int j = 2; j <= i; j++)
            {
                if (!(i % j) && (i != j))
                {
                    break;
                }

                if (j == i)
                {
                    p = i;
                }
            }
        }
        return p;
    }
}

#endif

This is pretty shit but it werks most of the time. Collisions happen often tho, and trying to make an instance of hst::HashTable<std::string> crashes with a lot of errors that I don't have the patience to actually work out.
Also, the arr[compressedHash(hash(key))] = NULL; line is bullshit, but I have no memory-efficient idea to make this work with every type.
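
One type-agnostic way around the = NULL hack (just a sketch of the idea, not something from the code above) is to store an "occupied" flag next to each value instead of abusing a sentinel, e.g. with std::optional (C++17):
C++:
#include <optional>
#include <string>
#include <vector>

// Hypothetical slot layout: the emptiness lives in the flag, not in the value,
// so removal works for any T without needing a sentinel like NULL.
template <class T>
struct Slot
{
    std::optional<T> value; // std::nullopt == empty bucket
    std::string key;        // keeping the key around also lets you detect collisions
};

// Usage sketch:
//   std::vector<Slot<int>> table(150);
//   table[index].value = 42;                      // insert
//   table[index].value.reset();                   // remove, no sentinel needed
//   bool used = table[index].value.has_value();   // occupancy check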
 

Jahy

You're retarded. :korone-smug:

That aside, I think this is a really cool exercise. When I was pursuing my studies in university, learning about data structures was one of my favorite parts of the programming courses. Learning how to make them from scratch rather than blindly using the self-included libraries truly gives you a better and deeper understanding of both their function and purpose.

There are problems with this, of course, but I think it's overall a relatively decent attempt. Collisions are already an issue as far as hashes go, and doing your best to minimize them is the way to use this structure most effectively. If you want to continue with this code, I would advise that's what you focus on improving. Also, please either single space or equally space out your comments within each file; you are triggering my autism.

If I have both the time and motivation, I'll play around with it a bit and see if there's anything I can offer. I'm a bit rusty on C++ as I've focused more on C#, Java, and even Python, but it'll be a good refresher exercise for me all the same.
 

Directional Vector

If I have both the time and motivation, I'll play around with it a bit
Knock yourself out. I'll look into it if someone posts improvements and shit, but I don't plan to make my code usable (or to fix the collision issue).
To be honest I didn't expect anyone to actually read my code, so I just commented the bare minimum and hit auto format. I'll keep in mind not to format code after commenting.

I looked into Bloom Filters because they're so close to Hashtables and made a quick implementation myself within a spare hour
C++:
// excerpt from hashdatastruct.h
     /**
     * A basic implementation of a bloom filter.
     * Bloom filters are probabilistic data structures that can tell you whether an item is probably in the set.
     * BFs are prone to false positives but won't produce false negatives.
     * Collision probability can be reduced by a few things:
     * [*] Larger bloom filter
     * [*] Hashing functions
     * [*] Compression functions that actually work (unlike the currently implemented)
     * [*] Multiple compressed hashes for a key
    */
    template <class T>
    class BloomFilter
    {
    private:
        bool *arr; // Contents
        int r1; // Random hashing parameters
        int r2;
        int p; // Prime number larger than arr
        std::hash<T> key_hash; // Base hash of key

        void setLength(int length)
        {
            arr = new bool[length]{false};
        }

        /**
         * Bloom filters need multiple different hashes for the same key to work properly.
         * I made these calculations without any mathematical consideration. That's why they are so bad
        */
        int uniqueHash1(__int128 hash) { return ((hash * r1 << r2) % p) % sizeof(arr); }
        int uniqueHash2(__int128 hash) { return hash % p % sizeof(arr) + r1 - r2; }
        int uniqueHash3(__int128 hash) { return (hash << r2) / r1 % p % sizeof(arr) + r2 << r2 / r1; }

    public:
        // Constructor
        BloomFilter(int length)
        {
            setLength(length); // Set content array
            srand(time(NULL)); // Seed
            r1 = rand() % (length - 30) + 30; // Compression parameters
            r2 = rand() % 9 + 1;
            p = prime::highest(length, length + 500);
        }

        /**
         * Checks if a key is probably inserted into the filter
        */
        bool inFilter(T key)
        {
            __int128 hash = key_hash(key);
            if (arr[uniqueHash1(hash)] && arr[uniqueHash2(hash)] && arr[uniqueHash3(hash)])
                return true;
            return false;
        }

        /**
         * Inserts a key into the filter. Ground breaking I know
        */
        void insert(T key)
        {
            __int128 hash = key_hash(key);
            arr[uniqueHash1(hash)] = true;
            arr[uniqueHash2(hash)] = true;
            arr[uniqueHash3(hash)] = true;
        }

        void operator delete(void *p)
        {
            free(p);
        }
    };
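
Since the comment above mentions that collision probability drops with multiple hashes per key: a common way to derive k reasonably independent indices from a single key is double hashing (index_i = h1 + i*h2). A minimal sketch of that idea, not taken from the code above (all names made up):
C++:
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Toy Bloom filter using double hashing: two base hashes are combined as
// h1 + i*h2 to derive k different bit positions for one key.
class DoubleHashBloom
{
    std::vector<bool> bits;
    std::size_t k; // number of derived hash functions

    std::size_t index(const std::string &key, std::size_t i) const
    {
        std::size_t h1 = std::hash<std::string>{}(key);
        std::size_t h2 = h1 * 0x9e3779b97f4a7c15ULL | 1; // cheap derived second hash, forced odd
        return (h1 + i * h2) % bits.size();
    }

public:
    DoubleHashBloom(std::size_t m, std::size_t hashes) : bits(m, false), k(hashes) {}

    void insert(const std::string &key)
    {
        for (std::size_t i = 0; i < k; i++)
            bits[index(key, i)] = true;
    }

    bool probablyContains(const std::string &key) const
    {
        for (std::size_t i = 0; i < k; i++)
            if (!bits[index(key, i)]) return false; // definitely not inserted
        return true;                                // probably inserted (false positives possible)
    }
};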
There are a lot of collisions
C++:
#include <iostream>
#include <string>
#include <stdlib.h>
#include "dependencies/hashDataStruct.h"

std::string getRandomString(int length)
{
    std::string s;
    for (int i = 0; i < length; i++)
    {
        s += (char)(rand() % 95 + 32);
    }

    return s;
}

int main(int argc, char const *argv[])
{
    if (argc < 3) // [file] [number of strings to insert] [string length]
    {
        std::cout << "Not enough arguments" << std::endl;
        return 0;
    }
    srand(time(NULL));
    int strn = std::stoi(argv[1]);
    int strl = std::stoi(argv[2]);
    int collisions = 0;
    hst::BloomFilter<std::string> b = hst::BloomFilter<std::string>(100);

    std::string strings[strn];
    for (int i = 0; i < strn; i++)
    {
        std::string s = getRandomString(strl);
        std::cout << i << ") " << s << std::endl;
        strings[i] = s;
        b.insert(s);
    }

    for (int i = 0; i < 500; i++)
    {
        std::string s = getRandomString(strl);
        if (b.inFilter(s))
        {
            collisions++;
            int collidedString = -1;
            for (int i = 0; i < strn; i++)
            {
                if (s == strings[i])
                {
                    collidedString = i;
                    break;
                }
            }
            std::cout << "bloom collision "
                      << "(" << collidedString << "): " << s << std::endl;
        }
    }
    std::cout << "collisions: " << collisions << std::endl;
    return 0;
}
C++:
#ifndef HASHSTRUCTS_INCLUDE
#define HASHSTRUCTS_INCLUDE
#include <stdlib.h>
#include <vector>
#include "prime.h"

namespace hst
{
    template <class T>
    class HashTable
    {
    private:
        T *arr; // The content of the map
        std::vector<std::string> keys; // A list of used keys. Used for reallocating and returning every key.
        int r1; // Random parameter 1
        int r2; // Random parameter 2
        int p; // Prime number larger than arr

        __int128 hash(const std::string &key) // Generate a hash based on the key
        {
            __int128 hash = 0;
            for (int i = 0; i < key.length(); i++)
            {
                hash = (hash << 5) - hash + key[i]; // Hash = hash*31 + key[i] (a lot of collisions don't @ me)
            }
            return hash;
        }

        int compressedHash(__int128 hash)
        {
            return ((r1 * hash + r2) % p) % sizeof(arr); // Compress hash into usable index (same as above)
        }

    public:
        HashTable(int size)
        {
            srand(time(NULL)); // Seed
            r1 = rand() % 5000 + 200; // Set first random parameter
            r2 = rand() % 5000 + 200; // Set second random parameter
            arr = new T[size]; // Set array to size
            p = prime::highest(sizeof(arr), sizeof(arr) + 1000); // Get a (more or less) appropriate prime number
        }

        void insert(std::string key, T content)
        {
            arr[compressedHash(hash(key))] = content; // Set content
            keys.push_back(key); // Add to used keys
        }

        T get(std::string key)
        {
            return arr[compressedHash(hash(key))]; // Kind of self explaining
        }

        void remove(std::string key)
        {
            arr[compressedHash(hash(key))] = NULL; // This will give you a warning but fuck it
            std::vector<std::string>::iterator el = keys.begin(); // First element
            for (int i = 0; i < keys.size(); i++) //For every element
            {
                if (keys[i] == key)
                {
                    keys.erase(el);
                    return;
                }
                el++; // Next element
            }
        }

        void resize(int size)
        {
            T *contents = new T[keys.size()]; // Temporary array for every value by key
            for (int i = 0; i < keys.size(); i++)
            {
                contents[i] = get(keys[i]);
            }
            arr = new T[size]; // Resize array
            for (int i = 0; i < keys.size(); i++)
            {
                arr[compressedHash(hash(keys[i]))] = contents[i]; // Rehash & store every stored element
            }
            delete[] contents; // Delete temp array
        }

        std::vector<std::string> getKeys()
        {
            return keys; // Yeah
        }

        /**
         * Get a pointer to a new array holding every inserted key.
         * Remember to delete.
        */
        std::string *getKeysArray()
        {
            std::string *k = new std::string[keys.size()];
            for (int i = 0; i < keys.size(); i++)
            {
                k[i] = keys[i];
            }

            return k;
        }

        void operator delete(void *p)
        {
            free(p);
        }
    };

    /**
     * A basic implementation of a bloom filter.
     * Bloom filters are probabilistic data structures that can tell you whether an item is probably in the set.
     * BFs are prone to false positives but won't produce false negatives.
     * Collision probability can be reduced by a few things:
     * [*] Larger bloom filter
     * [*] Hashing functions
     * [*] Compression functions that actually work (unlike the currently implemented)
     * [*] Multiple compressed hashes for a key
    */
    template <class T>
    class BloomFilter
    {
    private:
        bool *arr; // Contents
        int r1; // Random hashing parameters
        int r2;
        int p; // Prime number larger than arr
        std::hash<T> key_hash; // Base hash of key

        void setLength(int length)
        {
            arr = new bool[length]{false};
        }

        /**
         * Bloom filters need multiple different hashes for the same key to work properly.
         * I made these calculations without any mathematical consideration. That's why they are so bad
        */
        int uniqueHash1(__int128 hash) { return ((hash * r1 << r2) % p) % sizeof(arr); }
        int uniqueHash2(__int128 hash) { return hash % p % sizeof(arr) + r1 - r2; }
        int uniqueHash3(__int128 hash) { return (hash << r2) / r1 % p % sizeof(arr) + r2 << r2 / r1; }

    public:
        // Constructor
        BloomFilter(int length)
        {
            setLength(length); // Set content array
            srand(time(NULL)); // Seed
            r1 = rand() % (length - 30) + 30; // Compression parameters
            r2 = rand() % 9 + 1;
            p = prime::highest(length, length + 500);
        }

        /**
         * Checks if a key is probably inserted into the filter
        */
        bool inFilter(T key)
        {
            __int128 hash = key_hash(key);
            if (arr[uniqueHash1(hash)] && arr[uniqueHash2(hash)] && arr[uniqueHash3(hash)])
                return true;
            return false;
        }

        /**
         * Inserts a key into the filter. Ground breaking I know
        */
        void insert(T key)
        {
            __int128 hash = key_hash(key);
            arr[uniqueHash1(hash)] = true;
            arr[uniqueHash2(hash)] = true;
            arr[uniqueHash3(hash)] = true;
        }

        void operator delete(void *p)
        {
            free(p);
        }
    };
}
#endif
 

Directional Vector

Kinda related but I guess I just need a place to rant about troons.
I was looking into brute forcing shinigami eyes extension's bloom filter which turned out to be more difficult than expected.
  • God I hate JavaScript so much
    • Ok it's actually TypeScript but my point still stands
  • I have only worked with browser extensions on a very basic level and I don't understand some magic that's going on
    • From the code it looks like the extension is reading local files? Am I retarded or shouldn't they not be able to do that?
  • The troon who made this pile of surprisingly popular shit didn't even bother to make the source readable.
The last point is the most retarded one obviously. How the fuck am I supposed to understand what you were thinking if you refuse to write a fucking comment or, god forbid, split your code into multiple fucking functions with descriptive names?
Whoever made this is apparently too retarded to use a debugger and is therefore printing everything to the fucking console. This wouldn't be so bad if the troon would invest half a fucking minute to remove unused code after debugging instead of commenting it out. You are already using source control, why won't you just delete unused lines, you fucking monkey?

Now you might be thinking something along the lines of "You fucking faggot look at your previous posts you did this shit yourself what are you complaining about?" or maybe just "You fucking faggot". However I didn't post my shit expecting someone else to work with it and I didn't publish my shit to fucking github.

I'm serious. Look at this:
JavaScript:
function getIdentifierFromElementImpl(element: HTMLAnchorElement, originalTarget: HTMLElement): string {
    if (!element) return null;

    const dataset = element.dataset;

    if (hostname == 'reddit.com') {
        const parent = element.parentElement;
        if (parent && parent.classList.contains('domain') && element.textContent.startsWith('self.')) return null;
    } else if (hostname == 'disqus.com') {
        if (element.classList && element.classList.contains('time-ago')) return null;
    } else if (hostname == 'facebook.com') {
        const parent = element.parentElement;
        const firstChild = <HTMLElement>element.firstChild;
        if (parent && (parent.tagName == 'H1' || parent.id == 'fb-timeline-cover-name')) {
            const id = getCurrentFacebookPageId();
            //console.log('Current fb page: ' + id)
            if (id)
                return 'facebook.com/' + id;
        }

        // comment timestamp
        if (firstChild && firstChild.tagName == 'ABBR' && element.lastChild == firstChild) return null;

        // post 'see more'
        if (element.classList.contains('see_more_link')) return null;

        // post 'continue reading'
        if (parent && parent.classList.contains('text_exposed_link')) return null;

        // React comment timestamp
        if (parent && parent.tagName == 'LI') return null;

        // React post timestamp
        if (element.getAttribute('role') == 'link' && parent && parent.tagName == 'SPAN' && firstChild && firstChild.tagName == 'SPAN' && firstChild.tabIndex == 0)
            return null;

        // React big profile picture (user or page)
        if (originalTarget instanceof SVGImageElement && isFacebookPictureLink(element) && !getMatchingAncestorByCss(element, '[role=article]')) {
            return getIdentifier(window.location.href);
        }

        // React cover picture
        if (originalTarget instanceof HTMLImageElement && isFacebookPictureLink(element) && element.getAttribute('aria-label') && !getMatchingAncestorByCss(element, '[role=article]')) {
            return getIdentifier(window.location.href);
        }

        if (dataset) {
            const hovercard = dataset.hovercard;
            if (hovercard) {
                const id = captureRegex(hovercard, /id=(\d+)/);
                if (id)
                    return 'facebook.com/' + id;
            }

            // post Comments link
            if (dataset.testid == 'UFI2CommentsCount/root') return null;

            // notification
            if (dataset.testid == 'notif_list_item_link') return null;

            // post Comments link
            if (dataset.commentPreludeRef) return null;

            // page left sidebar
            if (dataset.endpoint) return null;

            // profile tabs
            if (dataset.tabKey) return null;

            const gt = dataset.gt;
            if (gt) {
                const gtParsed = JSON.parse(gt);
                if (gtParsed.engagement && gtParsed.engagement.eng_tid) {
                    return 'facebook.com/' + gtParsed.engagement.eng_tid;
                }
            }

            // comment interaction buttons
            if (dataset.sigil) return null;

            let p = <HTMLElement>element;
            while (p) {
                const bt = p.dataset.bt;
                if (bt) {
                    const btParsed = JSON.parse(bt);
                    if (btParsed.id) return 'facebook.com/' + btParsed.id;
                }
                p = p.parentElement;
            }
        }
    } else if (hostname == 'twitter.com') {
        if (dataset && dataset.expandedUrl) return getIdentifier(dataset.expandedUrl);
        if (element.href.startsWith('https://t.co/')) {
            const title = element.title;
            if (title && (title.startsWith('http://') || title.startsWith('https://')))
                return getIdentifier(title);
            const content = element.textContent;
            if (!content.includes(' ') && content.includes('.') && !content.includes('…')) {
                const url = content.startsWith('http://') || content.startsWith('https://') ? content : 'http://' + content;
                return getIdentifier(url);
            }
        }
    } else if (domainIs(hostname, 'wikipedia.org')) {
        if (element.classList.contains('interlanguage-link-target')) return null;
    }

    if (element.classList.contains('tumblelog')) return element.textContent.replace('@', '') + '.tumblr.com';

    const href = element.href;
    if (href && (!href.endsWith('#') || href.includes('&stick='))) return getIdentifierFromURLImpl(tryParseURL(href));
    return null;
}
That's one hell of a fat function. Surprisingly, it has a lot of comments. Sadly, most of the if statements DO THE EXACT SAME FUCKING THING. Put all of that shit in a fucking function, I'm begging you, and maybe start commenting on your logic as well.
The sad part is that this wasn't even the worst fucking function. Look at this lunacy.
JavaScript:
function getIdentifierFromURLImpl(url: URL): string {
    if (!url) return null;

    // nested urls
    const nested = tryUnwrapNestedURL(url);
    if (nested) {
        return getIdentifierFromURLImpl(nested);
    }

    // fb group member badge
    if (url.pathname.includes('/badge_member_list/')) return null;

    let host = url.hostname;
    const searchParams = url.searchParams;
    if (domainIs(host, 'web.archive.org')) {
        const match = captureRegex(url.href, /\/web\/\w+\/(.*)/);
        if (!match) return null;
        return getIdentifierFromURLImpl(tryParseURL('http://' + match));
    }

    if (host.startsWith('www.')) host = host.substring(4);

    const pathArray = url.pathname.split('/');

    if (domainIs(host, 'facebook.com')) {
        if (searchParams.get('story_fbid')) return null;
        const fbId = searchParams.get('id');
        const p = url.pathname.replace('/pg/', '/');
        const isGroup = p.startsWith('/groups/');
        if (isGroup && p.includes('/user/')) return 'facebook.com/' + pathArray[4]; // fb.com/groups/.../user/...
        return 'facebook.com/' + (fbId || getPartialPath(p, isGroup ? 2 : 1).substring(1));
    } else if (domainIs(host, 'reddit.com')) {
        const pathname = url.pathname.replace('/u/', '/user/');
        if (!pathname.startsWith('/user/') && !pathname.startsWith('/r/')) return null;
        if (pathname.includes('/comments/') && hostname == 'reddit.com') return null;
        return 'reddit.com' + getPartialPath(pathname, 2);
    } else if (domainIs(host, 'twitter.com')) {
        return 'twitter.com' + getPartialPath(url.pathname, 1);
    } else if (domainIs(host, 'youtube.com')) {
        const pathname = url.pathname;
        if (pathname.startsWith('/user/') || pathname.startsWith('/c/') || pathname.startsWith('/channel/'))
            return 'youtube.com' + getPartialPath(pathname, 2);
        return 'youtube.com' + getPartialPath(pathname, 1);
    } else if (domainIs(host, 'disqus.com') && url.pathname.startsWith('/by/')) {
        return 'disqus.com' + getPartialPath(url.pathname, 2);
    } else if (domainIs(host, 'medium.com')) {
        const hostParts = host.split('.');
        if (hostParts.length == 3 && hostParts[0] != 'www') {
            return host;
        }
        return 'medium.com' + getPartialPath(url.pathname.replace('/t/', '/'), 1);
    } else if (domainIs(host, 'tumblr.com')) {
        if (url.pathname.startsWith('/register/follow/')) {
            const name = getPathPart(url.pathname, 2);
            return name ? name + '.tumblr.com' : null;
        }
        if (host != 'www.tumblr.com' && host != 'assets.tumblr.com' && host.indexOf('.media.') == -1) {
            if (!url.pathname.startsWith('/tagged/')) return url.host;
        }
        return null;
    } else if (domainIs(host, 'wikipedia.org') || domainIs(host, 'rationalwiki.org')) {
        const pathname = url.pathname;
        if (url.hash) return null;
        if (pathname == '/w/index.php' && searchParams.get('action') == 'edit') {
            const title = searchParams.get('title');
            if (title && title.startsWith('User:')) {
                return 'wikipedia.org/wiki/' + title;
            }
        }
        if (pathname.startsWith('/wiki/Special:Contributions/') && url.href == window.location.href)
            return 'wikipedia.org/wiki/User:' + pathArray[3];
        if (pathname.startsWith('/wiki/User:'))
            return 'wikipedia.org/wiki/User:' + pathArray[2].split(':')[1];
        if (pathname.includes(':')) return null;
        if (pathname.startsWith('/wiki/')) return 'wikipedia.org' + decodeURIComponent(getPartialPath(pathname, 2));
        else return null;
    } else if (host.indexOf('.blogspot.') != -1) {
        const m = captureRegex(host, /([a-zA-Z0-9\-]*)\.blogspot/);
        if (m) return m + '.blogspot.com';
        else return null;
    } else if (host.includes('google.')) {
        if (url.pathname == '/search' && searchParams.get('stick') && !searchParams.get('tbm') && !searchParams.get('start')) {
            const q = searchParams.get('q');
            if (q) return 'wikipedia.org/wiki/' + q.replace(/\s/g, '_');
        }
        return null;
    } else {
        if (host.startsWith('m.')) host = host.substr(2);
        return host;
    }
}
A grand total of two (2) comments. Thanks mate, that really helped me understand every fucking detail of your fucking function.
Is that what HRT does to your brain? I thought programming socks were supposed to make your code better? I have so many questions.
 

hzp

You're retarded.
That said, it seems there is one fundamental thing you misunderstood about hashes & hashtables. Yes, hash collisions are inevitable. Yes, minimizing them is great (although that's completely out of scope for anyone who's neither doing a PhD in group theory or some autistic CS subfield nor being paid a 7-digit salary at a FAANG). But: no, it's not okay to not deal with them at all.
It's hard to explain this without being confusing, and I'm not sure how comfortable you are with the subject, so my apologies.

Basically, hash collisions will impact performance. They should not, in any case (and that holds for every general-purpose data structure you can come up with), impact correctness.
Currently you have no strategy to deal with collisions, and that strategy is really the main difference between all the different flavours and algorithms of hash tables (some are better than others, some have tradeoffs, etc.). You have to have one.
If you hash a key, and then a second one, you need some way to differentiate the two values, because they could have the same hash! Collisions are impossible to avoid: you're transforming a piece of data that can have any size (for the sake of example, say it's 128 bits long, like an IPv6 address) into an index that can only be 32 bits long (64 if you use `size_t`), so 2^96 IP addresses will end up at the same index in your array. Stuff actually becomes even more perverse, because obviously you wouldn't create an array with 2^32 slots, so you're doing a modulo on top of that. That creates collisions within your hashes as well.

I recommend you go over the Wikipedia article or some other similar resource. They have graphics and explanations of the different strategies, "chaining" being the easiest one.
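
For concreteness, here's roughly what the bucket layout looks like with chaining: each index holds a small list of (key, value) pairs, so colliding keys coexist instead of overwriting each other. A bare-bones sketch, not a drop-in replacement for the posted class (names are illustrative):
C++:
#include <cstddef>
#include <functional>
#include <list>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Each bucket holds a list of (key, value) pairs, so two keys that land on the
// same index simply coexist in the bucket instead of overwriting each other.
template <class T>
class ChainedTable
{
    std::vector<std::list<std::pair<std::string, T>>> buckets;

    std::size_t bucketOf(const std::string &key) const
    {
        return std::hash<std::string>{}(key) % buckets.size();
    }

public:
    explicit ChainedTable(std::size_t size) : buckets(size) {}

    void insert(const std::string &key, T value)
    {
        auto &chain = buckets[bucketOf(key)];
        for (auto &kv : chain)
        {
            if (kv.first == key) // same key: overwrite in place
            {
                kv.second = std::move(value);
                return;
            }
        }
        chain.emplace_back(key, std::move(value)); // collision: just append to the bucket
    }

    T &get(const std::string &key)
    {
        for (auto &kv : buckets[bucketOf(key)])
            if (kv.first == key) return kv.second;
        throw std::out_of_range("key not present");
    }

    void remove(const std::string &key)
    {
        auto &chain = buckets[bucketOf(key)];
        chain.remove_if([&](const std::pair<std::string, T> &kv) { return kv.first == key; });
    }
};
Usage would look the same as the posted class (insert/get/remove by string key); a lookup only ever walks the one bucket the key hashes to.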

There's a lot of other things I could comment on (like you should have your hashtable do the resizing, and not the user of your hashtable), but that's really the most important one.
 

Directional Vector

Oh yeah, I didn't notice that arr was of type `T *`. So basically, another big mistake is that sizeof(arr) doesn't do what you think it does: https://godbolt.org/z/eoPW5Mhr4
You're retarded.
That said, it seems there is one fundamental thing you misunderstood about hashes & hashtables. Yes, hash collisions are inevitable. Yes, minimizing them is great (although that's completely out of scope for anyone who's neither doing a PhD in group theory or some autistic CS subfield nor being paid a 7-digit salary at a FAANG). But: no, it's not okay to not deal with them at all.
It's hard to explain this without being confusing, and I'm not sure how comfortable you are with the subject, so my apologies.

Basically, hash collisions will impact performance. They should not, in any case (and that holds for every general-purpose data structure you can come up with), impact correctness.
Currently you have no strategy to deal with collisions, and that strategy is really the main difference between all the different flavours and algorithms of hash tables (some are better than others, some have tradeoffs, etc.). You have to have one.
If you hash a key, and then a second one, you need some way to differentiate the two values, because they could have the same hash! Collisions are impossible to avoid: you're transforming a piece of data that can have any size (for the sake of example, say it's 128 bits long, like an IPv6 address) into an index that can only be 32 bits long (64 if you use `size_t`), so 2^96 IP addresses will end up at the same index in your array. Stuff actually becomes even more perverse, because obviously you wouldn't create an array with 2^32 slots, so you're doing a modulo on top of that. That creates collisions within your hashes as well.

I recommend you go over the Wikipedia article or some other similar resource. They have graphics and explanations of the different strategies, "chaining" being the easiest one.

There's a lot of other things I could comment on (like you should have your hashtable do the resizing, and not the user of your hashtable), but that's really the most important one.
I'm going to improve my shit once I've got some spare time on my hands. Because that's going to take some time, feel free to continue taking my code apart.
 

Directional Vector

Okay, so I sat down a few days ago and made a basic implementation of linear probing. There are still some things bugging me but I don't care right now. And the compiler is still crying about me using a variable-length array.

C++:
__extension__ typedef __int128 int128; // The compiler wouldn't shut up.
template <class T>
class Container // Container class to store key, status and content
{
private:
  Container(int s) { status = s; }

public:
  T content;
  std::string key;
  int status = 0; // 0 unused; 1 deleted; 2 in use.
  Container() {}
  void setContent(std::string k, T c) {
    content = c;
    key = k;
    status = 2;
  }
  void deleteContainer() { *this = Container(1); }
};

template <class T> class HashTable {
private:
  Container<T> *arr;
  // T *arr; // The content of the map
  std::vector<std::string> keys; // A list of used keys. Used for reallocating
                                 // and returning every key.
  int r1;                        // Random parameter 1
  int r2;                        // Random parameter 2
  int p;                         // Prime number larger than arr
  int size;                      // Array length. Yeah I'm lazy.

  int128 hash(const std::string &key) // Generate a hash based on the key
  {
    int128 hash = 0;
    for (size_t i = 0; i < key.length(); i++) {
      hash = (hash << 5) - hash +
             key[i]; // Hash = hash*31 + key[i] (a lot of collisions don't @ me)
    }
    return hash;
  }

  int compressedHash(int128 hash) {
    return ((r1 * hash + r2) % p) %
           size; // Compress hash into usable index (same as above)
  }

  void resize(int n) {
    T contents[keys.size()]; // Temporary array for every value by key
    for (size_t i = 0; i < keys.size(); i++) {
      contents[i] = get(keys[i]);
    }
    arr = new Container<T>[n]; // Resize array
    for (size_t i = 0; i < keys.size(); i++) {
      insert(keys[i], contents[i]);
    }
  }

public:
  HashTable(int pSize = 150) {
    srand(time(NULL)); // Seed
    size = pSize;
    r1 = rand() % 5000 + 200;      // Set first random parameter
    r2 = rand() % 5000 + 200;      // Set second random parameter
    arr = new Container<T>[pSize]; // Set array to size
    p = prime::highest(
        pSize, pSize + 1000); // Get a (more or less) appropriate prime number
  }

  /**
  Insert `content` at `key`.
  */
  void insert(std::string key, T content) {
    int h = compressedHash(hash(key)); // Hash
    int steps = 0;           // Count steps until empty bucket is located
    Container<T> c = arr[h]; // Container at hash location
    while (c.status == 2) {  // While Container is used ...
      if (c.key != key) {
        c = arr[h == size ? h = 0 : h++]; // Increment Hash by 1 or back to 0
        steps++;
      } else // Same key => overwrite container
        break;
    }
    c.setContent(key, content);
    arr[h] = c; // Maybe I should have used pointers.
    if (steps > size * 0.75)
      resize(size + size * 0.5);
  }

  /**
  Retrieve content of type `T` at `key`
  */
  T get(std::string key) {
    int h = compressedHash(hash(key));
    int steps = 0;
    Container<T> c = arr[h];
    while (c.key != key) {
      if (c.status == 0)
        // The content corresponding to the given key can't be stored after
        // an unused cell. Therefore the key isn't present in the set.
        throw std::out_of_range("key not present");
      {
        c = arr[h == size ? h = 0 : h++];
        steps++;
      }
    }
    return c.content;
  }

  /**
  Remove content at `key`.
  */
  void remove(std::string key) {
    int h = compressedHash(hash(key));
    Container<T> c = arr[h];
    while (c.key != key) {
      if (c.status == 0)
        throw std::out_of_range("key not present");
      c = arr[h == size ? h = 0 : h++];
    }

    std::vector<std::string>::iterator el = keys.begin(); // First element
    for (size_t i = 0; i < keys.size(); i++)              // For every element
    {
      if (keys[i] == key) {
        keys.erase(el);
        return;
      }
      el++; // Next element
    }
    c.deleteContainer();
    arr[h] = c; // I really should have used pointers
  }
};
Please don't cry over the comment formatting, the plugin's auto-format is responsible and I don't really care about it either way.
 

hzp

Hey,
I hope you don't mind the very late review.

first thing, I don't see any obvious mistakes, I haven't read everything in depth unfortunately though, so there's that.

C++:
Container() {}
Ok, this isn't really that important for your exercise, but this might pose a problem.
Basically what you do here is you default construct the content of type T. Not all types are default constructible, so in your testing, you might have some cryptic errors about not being able to construct your type T.
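For example (hypothetical type, just to illustrate the failure mode):
C++:
// A value type with no default constructor:
struct NoDefault
{
    explicit NoDefault(int x) : v(x) {}
    int v;
};

// Container<NoDefault> c;       // error: Container() has to default-construct its
// new Container<NoDefault>[16]; // `T content;` member, and NoDefault has no NoDefault()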

C++:
std::vector<std::string> keys; // A list of used keys. Used for reallocating
I'm not really sure what you mean by "re-allocating" here. From what I see, you never end up using your keys array.
It's not a bad thing per se to have a separate array of keys (in practice this is done, because it offers cache locality), but you don't seem to use it, and since the key is already present in the container, I don't think you need this. Unless I misinterpreted your intent.
You don't need this array to resize your hashmap's internal array, since, as you're resizing, you can just traverse the whole array and pick up everything "in use".

C++:
  void resize(int n) {
    T contents[keys.size()]; // Temporary array for every value by key
    for (size_t i = 0; i < keys.size(); i++) {
      contents[i] = get(keys[i]);
    }
    arr = new Container<T>[n]; // Resize array
    for (size_t i = 0; i < keys.size(); i++) {
      insert(keys[i], contents[i]);
    }
  }
this function is the most problematic: you copy twice & you potentially blow the stack (that's probably what the compiler complains about).
the difference between
C++:
T foo[n]
and
C++:
foo = new T[n]
, is that the former allocates on the stack and the latter on the heap.
The stack has a very limited memory space and technically everything allocated on it has to be known statically (since the code generated by the compiler will manage it).
Not sure how familiar you are with these two notions, but maybe you should look into them more, and also learn about stack/function frames. Actually you can allocate dynamically on the stack (variable-length arrays, since C99), but it's very niche and has all kinds of weird implications.
Now you also copy the whole contents of your array into this temporary array, and then you copy everything back into the resized array. That's needless, since `arr` is a pointer to heap-allocated memory; when you allocate a new chunk of memory there, the previous one still exists, it's not replaced. (Actually you also have a memory leak: the chunk of memory `arr` pointed to before is never freed, and since you overwrite what `arr` points to, nothing points to that chunk anymore, so it can never be freed.)
You also don't want to use `get` to find all the values, you should just traverse the array and check whether a value is in use or not. The use of get in this loop makes you traverse `arr` many times, while traversing it once would suffice.
So what you should do is more something along the lines of:
C++:
  void resize(int n) {
    Container<T> * tmp = new Container<T>[n];
    for (size_t i = 0; i < size; i++) {
        if (arr[i].status == 2)
            insert_container(tmp, arr[i]);
    }
    delete[] arr;
    arr = tmp;
    size = n;
  }
you could refactor your insert function to re-use `insert_container`
C++:
void insert_container(Container<T>* arr, Container<T> val);
void insert(std::string key, T content) {
  Container<T> c;
  c.setContent(key, content);
  return insert_container(arr, c);
}

It would also be better to have `size_t size` and `size_t capacity` instead of just `int size`. But, yeah, it should work without this, you'd just have a more limited API.
I'd suggest to check how to implement `std::vector`; there are lots of blogs and tutorials about how to make an amortized variable length array.
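As a rough illustration of the amortized-growth idea (a toy sketch, nothing like a full std::vector, and copying the container itself is deliberately left out):
C++:
#include <cstddef>
#include <utility>

// Toy growable array: capacity doubles when full, so n push_backs cost O(n)
// element moves in total (amortized O(1) per push) instead of O(n) per push.
template <class T>
class GrowArray
{
    T *data = nullptr;
    std::size_t count = 0;
    std::size_t capacity = 0;

public:
    ~GrowArray() { delete[] data; }

    void push_back(const T &value)
    {
        if (count == capacity)
        {
            std::size_t newCap = capacity ? capacity * 2 : 4;
            T *bigger = new T[newCap]; // note: still needs T to be default constructible
            for (std::size_t i = 0; i < count; i++)
                bigger[i] = std::move(data[i]);
            delete[] data; // free the old block: no leak, no unreachable memory
            data = bigger;
            capacity = newCap;
        }
        data[count++] = value;
    }

    T &operator[](std::size_t i) { return data[i]; }
    std::size_t size() const { return count; }
};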

There are other things that could benefit from improvement, or that I can see aren't exactly correct, but overall I feel that you finally got the gist of how to make a hashtable; I think you probably just need a little bit of practice.
 

Directional Vector

Hey,
I hope you don't mind the very late review.

first thing, I don't see any obvious mistakes, I haven't read everything in depth unfortunately though, so there's that.


Ok, this isn't really that important for your exercise, but this might pose a problem.
Basically what you do here is you default construct the content of type T. Not all types are default constructible, so in your testing, you might have some cryptic errors about not being able to construct your type T.


I'm not really sure what you mean by "re-allocating" here. From what I see, you never end up using your keys array.
It's not a bad thing per se to have a separate array of keys (in practice this is done, because it offers cache locality), but you don't seem to use it, and since the key is already present in the container, I don't think you need this. Unless I misinterpreted your intent.
You don't need this array to resize your hashmap's internal array, since, as you're resizing, you can just traverse the whole array and pick up everything "in use".


this function is the most problematic: you copy twice & you potentially blow the stack (that's probably what the compiler complains about).
the difference between
C++:
T foo[n]
and
C++:
foo = new T[n]
, is that the former allocates on the stack and the latter on the heap.
The stack has a very limited memory space and technically everything allocated on it has to be known statically (since the code generated by the compiler will manage it).
Not sure how familiar you are with these two notions, but maybe you should look into them more, and also learn about stack/function frames. Actually you can allocate dynamically on the stack (variable-length arrays, since C99), but it's very niche and has all kinds of weird implications.
Now you also copy the whole contents of your array into this temporary array, and then you copy everything back into the resized array. That's needless, since `arr` is a pointer to heap-allocated memory; when you allocate a new chunk of memory there, the previous one still exists, it's not replaced. (Actually you also have a memory leak: the chunk of memory `arr` pointed to before is never freed, and since you overwrite what `arr` points to, nothing points to that chunk anymore, so it can never be freed.)
You also don't want to use `get` to find all the values, you should just traverse the array and check whether a value is in use or not. The use of get in this loop makes you traverse `arr` many times, while traversing it once would suffice.
So what you should do is more something along the lines of:
C++:
  void resize(int n) {
    Container<T> * tmp = new Container<T>[n];
    for (size_t i = 0; i < size; i++) {
        if (arr[i].status == 2)
            insert_container(tmp, arr[i]);
    }
    delete[] arr;
    arr = tmp;
    size = n;
  }
you could refactor your insert function to re-use `insert_container`
C++:
void insert_container(Container<T>* arr, Container<T> val);
void insert(std::string key, T content) {
  Container<T> c;
  c.setContent(key, content);
  return insert_container(arr, c);
}

It would also be better to have `size_t size` and `size_t capacity` instead of just `int size`. But, yeah, it should work without this, you'd just have a more limited API.
I'd suggest to check how to implement `std::vector`; there are lots of blogs and tutorials about how to make an amortized variable length array.

There are other things that could benefit from improvement, or that I can see aren't exactly correct, but overall I feel that you finally got the gist of how to make a hashtable; I think you probably just need a little bit of practice.
appreciate your feedback but I'm legally required to ignore optimizations since I'm a webdev. I will try to apply your suggestions when I've got a bit of spare time that I don't feel like wasting on other pointless projects.

I'd suggest to check how to implement `std::vector`; there are lots of blogs and tutorials about how to make an amortized variable length array.
Do you mean I should use vectors instead of arrays to avoid reallocating a new array when resizing the hashmap? Isn't std::vector basically a linked list and not an array so I lose the array's "ability" to access memory by index instead of traversing the entire list?
 

Directional Vector

Time flies huh.
I needed a hashtable implementation today and thought it'd be fun to try to make my own.
Kind of forgot about this website ngl, but I remembered my shit code. Anyways, here you go; the source is based off of Squirrel, if anyone cares.

C++:
#include <stdio.h>
#include "table.h"

bool compare_int(int* a, int* b) { return *a == *b; }
bool compare_c_str(const char** a, const char** b) { return strcmp(*a, *b) == 0; }

PQHash hash_int(int* n) { return *n; } // just returning the value works great for anything numeric like floats or pointers as well
// PQHash hash_int(int* n) { return 1; } // test collisions

PQHash hash_c_str(const char** s)
{
  size_t l = strlen(*s);
  PQHash h = (PQHash)l;
  size_t step = (l >> 5) + 1;
  size_t l1;

  for(l1 = l; l1 >= step; l1 -= step)
    h = h ^ ((h << 5) + (h >> 2) + ((unsigned short)(*s)[l1 - 1]));

  return h;
}

void int_table_test()
{
  Table<int, const char*> table = Table<int, const char*>(32, hash_int, compare_int);

  int k1 = 1;
  int k2 = 2;
  const char* c1 = "int content 1";
  const char* c2 = "int content 2";

  table.set(&k1, &c1);
  table.set(&k2, &c2);

  const char** r1 = table.get(&k1);
  const char** r2 = table.get(&k2);

  printf("%s; %s\n", *r1, *r2);
}

void c_str_table_test()
{
  Table<const char*, const char*> table = Table<const char*, const char*>(4, hash_c_str, compare_c_str);

  const char* k1 = "key1";
  const char* k2 = "key2";
  const char* c1 = "cstr content 1";
  const char* c2 = "cstr content 2";

  char* k1_copy = new char[64]; // copy on heap to test different keys that have the same value and should return the same result
  strcpy(k1_copy, k1);

  table.set(&k1, &c1);
  table.set(&k2, &c2);

  const char** r1 = table.get((const char**)&k1_copy); // trust me bro
  const char** r2 = table.get(&k2);

  printf("%s; %s\n", *r1, *r2);
}

int main()
{
  int_table_test();
  c_str_table_test();
  return 0;
}
C++:
#ifndef TABLE_H
#define TABLE_H

#include <string.h>
#include <stdio.h>

#define MINPOW2 4

typedef unsigned long long PQHash;

template <typename TK, typename TC>
class Table {
  struct Node {
    TK key;
    TC content;
    Node* next;
  };

  PQHash (*key_hash)(TK* key);
  bool (*key_compare)(TK* a, TK* b);
  Node* nodes;
  Node* first_free;

  PQHash get_key_hash(TK* key)
  {
    return this->key_hash(key) & (this->nodes_size - 1);
  };

  Node* get_node_with_hash(TK* key, PQHash hash)
  {
    Node* node = &this->nodes[hash];
    do
    {
      if(node->key && this->key_compare(&node->key, key))
      {
        return node;
      }
      node = node->next;
    } while(node);
    return node;
  };

  Node* get_node(TK* key)
  {
    return get_node_with_hash(key, get_key_hash(key));
  };

  void allocate_nodes(size_t size)
  {

    this->nodes_size = size;
    this->used_nodes = 0;
    this->nodes = new Table::Node[size];
    memset(this->nodes, 0, sizeof(Table::Node) * size);
    this->first_free = &this->nodes[size - 1];
  };

  public:
  size_t nodes_size;
  size_t used_nodes;

  Table(size_t size, PQHash (*key_hash)(TK* key), bool (*key_compare)(TK* a, TK* b))
  {
    this->key_hash = key_hash;
    this->key_compare = key_compare;

    size_t pow2size = MINPOW2;
    while(size > pow2size) pow2size = pow2size << 1;
    this->allocate_nodes(pow2size);
  };

  ~Table()
  {
    delete[] this->nodes;
  }

  void rehash(bool force)
  {
    size_t old_size = this->nodes_size;
    if(old_size < MINPOW2) old_size = MINPOW2;

    Node* old_nodes = this->nodes;
    if(this->used_nodes >= old_size - old_size / 4) this->allocate_nodes(old_size * 2);
    else if(this->used_nodes <= old_size / 4) this->allocate_nodes(old_size / 2);
    else if(force) this->allocate_nodes(old_size);
    else return;

    for(size_t i = 0; i < old_size; i++)
    {
      Node* old_node = &old_nodes[i];
      if(old_node->key)
        this->set(&old_node->key, &old_node->content);
    }

    delete[] old_nodes;
  };

  void set(TK* key, TC* value)
  {
    PQHash h = this->get_key_hash(key);
    Node* node = this->get_node_with_hash(key, h);
   
    // node exists and the content needs to be updated
    if(node)
    {
      node->content = *value;
      return;
    }

    // insert a new node
    // TODO insert new node at the front of the chain to decrease overhead
    node = &this->nodes[h];
    if(node->key)
    {
      // main slot already taken: store the new entry in the first free node and chain it to the tail
      while(node->next) node = node->next;
      node->next = this->first_free;
      node = this->first_free;
    }
    node->key = *key;
    node->content = *value;

    // update first_free
    for(;;)
    {
      // TODO this may shit the bed with some types that can be not truthy (why would you use integers as keys)
      if(!this->first_free->key)
      {
        this->used_nodes++;
        return;
      }
      else if(this->first_free == this->nodes) break;
      else this->first_free--;
    }

    this->rehash(true);
  }

  TC* get(TK* key)
  {
    Node* node = this->get_node(key);
    if(node) return &node->content;
    return 0;
  };

  void remove(TK* key)
  {
    Node* node = this->get_node(key);
    if(node)
    {
      // TODO this initializes to the default but keys should ideally be pointers anyways
      *node = {};
      this->used_nodes--;
//      this->rehash(false); // I don't really need the tables to be that memory efficient to downscale them
    }
  }

  void clear()
  {
    this->used_nodes = 0;
    memset(this->nodes, 0, sizeof(Table::Node) * this->nodes_size);
//    this->rehash(true); // I don't really need the tables to be that memory efficient to free their memory on clear. I assume you want a table in similar scale after clearing it.
  }
};

#endif

I did the same implementation in C but I decided I might want to move my codebase to C++ so I can write C with namespaces and templates.
Here's also a bonus very hacky vector implementation in C I wrote before I switched to C++.

I invested quite a bit of time into an implementation where I don't have to forward declare the vector type so I can write `vec(T) v` just like real C++ (woah)
C:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>

typedef int i32;
typedef unsigned int u32;
typedef long long i64;
typedef unsigned long long u64;

struct _DynamicVectorHead {
  u32 cap;
  u32 size;
  char content[1];
};

#define vec(T) T*
#define vec_head(v) ((struct _DynamicVectorHead*)((u64)v - offsetof(struct _DynamicVectorHead, content)))
#define vec_content(vh) ((void*)vh->content)
#define vec_capacity(v) ((vec_head(v))->cap)
#define vec_size(v) ((vec_head(v))->size)
#define vec_new(T, size) ((T*)(_vec_new(sizeof(T), size)->content))
#define vec_free(v) free(vec_head(v))
#define vec_push(v, p) \
{ \
  struct _DynamicVectorHead* vh = vec_head(v); \
  if(vh->cap == vh->size) \
  { \
    vh = _vec_resize(vh, sizeof(*v)); \
    v = vec_content(vh); \
  }\
  v[vh->size] = p; \
  vh->size++; \
}
#define vec_erase(v, i, n) _vec_erase(vec_head(v), i, n, sizeof(*v))
#define vec_clear(v) vec_head(v)->size = 0;

struct _DynamicVectorHead* _vec_new(u32 dsize, u32 size)
{
  struct _DynamicVectorHead* v = malloc(sizeof(struct _DynamicVectorHead) + dsize * size - 1);
  v->cap = size;
  v->size = 0;
  return v;
}

struct _DynamicVectorHead* _vec_resize(struct _DynamicVectorHead* v, u32 dsize)
{
  struct _DynamicVectorHead* new = _vec_new(dsize, v->cap * 1.5);
  new->size = v->size;
  memmove(new->content, v->content, dsize * v->cap);
  free(v);
  return new;
}

void _vec_erase(struct _DynamicVectorHead* v, u32 idx, u32 n, u32 dsize)
{
  memmove((void*)((u64)v->content + (idx * dsize)), (void*)((u64)(v->content) + (idx * dsize) + (n * dsize)), (v->size * dsize) - (idx * dsize) - (n * dsize));
  v->size -= n;
}

int main()
{
  struct Vector3 { u32 x, y, z; };
  vec(struct Vector3) v = vec_new(struct Vector3, 4);

  for(u32 i = 0; i < 20 * 3; i += 3)
  {
    vec_push(v, ((struct Vector3){ i, i + 1, i + 2}));
  }

  vec_erase(v, 0, 2); // erase the first two items

  for(u32 i = 0; i < vec_size(v); i++)
  {
    printf("[%2d] <%2d, %2d, %2d>\n", i, v[i].x, v[i].y, v[i].z);
  }

  vec_free(v);

  return 0;
}
also @Halo fix the kiwifarms theme you can barely read the text on spoiler buttons
 