• Please remember, if you're posting about a localizer, person of interest, or other internet bullshit that may be deleted.
    Make Sure to Archive.
    Despite what people say about the internet things can be deleted, and screenshots are only as reliable as the posters credibility.
    Archive Here

NSFL Hashes and Stuff

Directional Vector

varishangout.com
I was bored the last few days, read a bit about hashes and hash tables and thought it would be fun to rewrite them.
Anyways here is my abomination that may contain code or logic from random websites. Please tell me that I'm retarded (pointing out the issue optional)

main.cpp (nothing interesting here)
C++:
#include <iostream>
#include "hashtable.h"

int main(int argc, char const *argv[])
{
    hst::HashTable<int> map = hst::HashTable<int>(150); // New map type int; size 150
    map.insert("integer", 99);                          // Insert integer
    map.insert("2", 21);
    map.insert("3", 19);
    map.remove("3");                              // Remove by key
    map.resize(200);                              // Resize map (size 200)
    std::cout << map.get("integer") << std::endl; // Get value by key
    std::cout << map.get("2") << std::endl;
    return 0;
}

hashtable.h. The actual interesting bit
C++:
#ifndef HASHTABLE_INCLUDE
#define HASHTABLE_INCLUDE
#include <iostream>
#include <stdlib.h>
#include <vector>
#include "prime.h"

namespace hst
{
    template <class T>
    class HashTable
    {
    private:
        T *arr;                        // The content of the map
        std::vector<std::string> keys; // A list of used keys. Used for reallocating and returning every key.
        int r1;                        // Random parameter 1
        int r2;                        // Random parameter 2
        long long int p;               // Prime number larger than arr

        __int128 hash(const std::string &key) // Generate a hash based on the key
        {
            __int128 hash = 0;
            for (int i = 0; i < key.length(); i++)
            {
                hash = (hash << 5) - hash + key[i]; // Hash = hash*31 + key[i] (a lot of collisions don't @ me)
            }
            return hash;
        }

        int compressedHash(__int128 hash)
        {
            return ((r1 * hash + r2) % p) % sizeof(arr); // Compress hash into usable index (same as above)
        }

    public:
        HashTable(int size)
        {
            srand(time(NULL));                                   // Seed
            r1 = rand() % 5000 + 200;                            // Set first random parameter
            r2 = rand() % 5000 + 200;                            // Set second random parameter
            arr = new T[size];                                   // Set array to size
            p = prime::highest(sizeof(arr), sizeof(arr) + 1000); // Get a (more or less) appropriate prime number
        }

        void insert(std::string key, T content)
        {
            arr[compressedHash(hash(key))] = content; // Set content
            keys.push_back(key);                      // Add to used keys
        }

        T get(std::string key)
        {
            return arr[compressedHash(hash(key))]; // Kind of self explaining
        }

        void remove(std::string key)
        {
            arr[compressedHash(hash(key))] = NULL;                // This will give you a warning but fuck it
            std::vector<std::string>::iterator el = keys.begin(); // First element
            for (int i = 0; i < keys.size(); i++)                 //For every element
            {
                if (keys[i] == key)
                {
                    keys.erase(el);
                    return;
                }
                el++; // Next element
            }
        }

        void resize(int size)
        {
            T *contents = new T[keys.size()]; // Temporary array for every value by key
            for (int i = 0; i < keys.size(); i++)
            {
                contents[i] = get(keys[i]);
            }
            arr = new T[size]; // I'm pretty sure this is memory leakage
            for (int i = 0; i < keys.size(); i++)
            {
                arr[compressedHash(hash(keys[i]))] = contents[i]; // Rehash & store every stored element
            }
            delete[] contents; // Delete temp array
        }

        std::vector<std::string> getKeys()
        {
            return keys; // Yeah
        }

        /**
         * Get a pointer to a new array holding every inserted key.
         * Remember to delete.
        */
        std::string *getKeysArray()
        {
            std::string *k = new std::string[keys.size()];
            for (int i = 0; i < keys.size(); i++)
            {
                k[i] = keys[i];
            }

            return k;
        }
    };
}
#endif

prime.h. "Calculates" a high prime number that I'm using in the compressing function.
C++:
#ifndef PRIME_INCLUDE
#define PRIME_INCLUDE

namespace prime
{
    /**
     * I copied this from SO and only edited a few bits.
     * Don't ask why this works.
     * Returns the highest prime number (lower than max & higher than min)
    */
    int highest(long long int min, long long int max)
    {
        long long int p;

        for (int i = min; i <= max; i++)
        {

            for (int j = 2; j <= i; j++)
            {
                if (!(i % j) && (i != j))
                {
                    break;
                }

                if (j == i)
                {
                    p = i;
                }
            }
        }
        return p;
    }
}

#endif

This is pretty shit but it werks most of the time. Collisions happen often tho and trying to make an instance of hst::HashTable<std::string> is crashing with a lot of errors that I don't have the patience to actually work out.
Also the arr[compressedHash(hash(key))] = NULL; is bullshit but I have no memory efficient idea to make this work with every type.
 

Jahy

varishangout.com
You're retarded. :korone-smug:

That aside, I think this is a really cool exercise. When I was pursuing my studies in university, learning about data structures was one of my favorite parts of the programming courses. Learning how to make them from scratch rather than blindly using the self-included libraries truly gives you a better and deeper understanding of both their function and purpose.

There are problems with this, of course, but I think it's overall a relatively decent attempt. Collissions are already an issue as far as hashes go and doing your best to minimize them is the way to most effectively using this structure. If you want to continue with this code, I would advise that's what you focus on improving. Also, please either single space or equally space out your comments within each file; you are triggering my autism.

If I have both the time and motivation, I'll play around with it a bit and see if there's anything I can offer. I'm a bit rusty on C++ as I've focused more on C#, Java, and even Python, but it'll be a good refresher exercise for me all the same.
 

Directional Vector

varishangout.com
If I have both the time and motivation, I'll play around with it a bit
Knock yourself out. I'll look into it if someone posts improvements and shit but I don't plan to make my code usable (and fixing the collision issue)
To be honest I didn't expect anyone to actually read my code so I just commented the bare minimum and hit auto format. I'll keep in mind to not format code after commenting.

I looked into Bloom Filters because they're so close to Hashtables and made a quick implementation myself within a spare hour
C++:
// excerpt from hashdatastruct.h
     /**
     * A basic implementation of a bloom filter.
     * Bloom filters are probalistic data structures that can confirm if an item is in the set.
     * BFs are prone to false positives but won't produce false negatives.
     * Collision probability can be reduced by a few things:
     * [*] Larger bloom filter
     * [*] Hashing functions
     * [*] Compression functions that actually work (unlike the currently implemented)
     * [*] Multiple compressed hashes for a key
    */
    template <class T>
    class BloomFilter
    {
    private:
        bool *arr; // Contents
        int r1; // Random hashing parameters
        int r2;
        int p; // Prime number larger than arr
        std::hash<T> key_hash; // Base hash of key

        void setLength(int length)
        {
            arr = new bool[length]{false};
        }

        /**
         * Bloom filters need multiple different hashes for the same key to work properly.
         * I made these calculations without any mathematical consideration. That's why they are so bad
        */
        int uniqueHash1(__int128 hash) { return ((hash * r1 << r2) % p) % sizeof(arr); }
        int uniqueHash2(__int128 hash) { return hash % p % sizeof(arr) + r1 - r2; }
        int uniqueHash3(__int128 hash) { return (hash << r2) / r1 % p % sizeof(arr) + r2 << r2 / r1; }

    public:
        // Constructor
        BloomFilter(int length)
        {
            setLength(length); // Set content array
            srand(time(NULL)); // Seed
            r1 = rand() % (length - 30) + 30; // Compression parameters
            r2 = rand() % 9 + 1;
            p = prime::highest(length, length + 500);
        }

        /**
         * Checks if a key is probably inserted into the filter
        */
        bool inFilter(T key)
        {
            __int128 hash = key_hash(key);
            if (arr[uniqueHash1(hash)] && arr[uniqueHash2(hash)] && arr[uniqueHash3(hash)])
                return true;
            return false;
        }

        /**
         * Inserts a key into the filter. Ground breaking I know
        */
        void insert(T key)
        {
            __int128 hash = key_hash(key);
            arr[uniqueHash1(hash)] = true;
            arr[uniqueHash2(hash)] = true;
            arr[uniqueHash3(hash)] = true;
        }

        void operator delete(void *p)
        {
            free(p);
        }
    };
There are a lot of collisions
C++:
#include <iostream>
#include <string>
#include <stdlib.h>
#include "dependencies/hashDataStruct.h"

std::string getRandomString(int length)
{
    std::string s;
    for (int i = 0; i < length; i++)
    {
        s += (char)(rand() % 95 + 32);
    }

    return s;
}

int main(int argc, char const *argv[])
{
    if (argc < 3) // [file] [number of string to insert] [string length]
    {
        std::cout << "Not enough arguments" << std::endl;
        return 0;
    }
    srand(time(NULL));
    int strn = std::stoi(argv[1]);
    int strl = std::stoi(argv[2]);
    int collisions = 0;
    hst::BloomFilter<std::string> b = hst::BloomFilter<std::string>(100);

    std::string strings[strn];
    for (int i = 0; i < strn; i++)
    {
        std::string s = getRandomString(strl);
        std::cout << i << ") " << s << std::endl;
        strings[i] = s;
        b.insert(s);
    }

    for (int i = 0; i < 500; i++)
    {
        std::string s = getRandomString(strl);
        if (b.inFilter(s))
        {
            collisions++;
            int collidedString = -1;
            for (int i = 0; i < strn; i++)
            {
                if (s == strings[i])
                {
                    collidedString = i;
                    break;
                }
            }
            std::cout << "bloom collision "
                      << "(" << collidedString << "): " << s << std::endl;
        }
    }
    std::cout << "collisions: " << collisions << std::endl;
    return 0;
}
C++:
#ifndef HASHSTRUCTS_INCLUDE
#define HASHSTRUCTS_INCLUDE
#include <stdlib.h>
#include <vector>
#include "prime.h"

namespace hst
{
    template <class T>
    class HashTable
    {
    private:
        T *arr; // The content of the map
        std::vector<std::string> keys; // A list of used keys. Used for reallocating and returning every key.
        int r1; // Random parameter 1
        int r2; // Random parameter 2
        int p; // Prime number larger than arr

        __int128 hash(const std::string &key) // Generate a hash based on the key
        {
            __int128 hash = 0;
            for (int i = 0; i < key.length(); i++)
            {
                hash = (hash << 5) - hash + key[i]; // Hash = hash*31 + key[i] (a lot of collisions don't @ me)
            }
            return hash;
        }

        int compressedHash(__int128 hash)
        {
            return ((r1 * hash + r2) % p) % sizeof(arr); // Compress hash into usable index (same as above)
        }

    public:
        HashTable(int size)
        {
            srand(time(NULL)); // Seed
            r1 = rand() % 5000 + 200; // Set first random parameter
            r2 = rand() % 5000 + 200; // Set second random parameter
            arr = new T[size]; // Set array to size
            p = prime::highest(sizeof(arr), sizeof(arr) + 1000); // Get a (more or less) appropriate prime number
        }

        void insert(std::string key, T content)
        {
            arr[compressedHash(hash(key))] = content; // Set content
            keys.push_back(key); // Add to used keys
        }

        T get(std::string key)
        {
            return arr[compressedHash(hash(key))]; // Kind of self explaining
        }

        void remove(std::string key)
        {
            arr[compressedHash(hash(key))] = NULL; // This will give you a warning but fuck it
            std::vector<std::string>::iterator el = keys.begin(); // First element
            for (int i = 0; i < keys.size(); i++) //For every element
            {
                if (keys[i] == key)
                {
                    keys.erase(el);
                    return;
                }
                el++; // Next element
            }
        }

        void resize(int size)
        {
            T *contents = new T[keys.size()]; // Temporary array for every value by key
            for (int i = 0; i < keys.size(); i++)
            {
                contents[i] = get(keys[i]);
            }
            arr = new T[size]; // Resize array
            for (int i = 0; i < keys.size(); i++)
            {
                arr[compressedHash(hash(keys[i]))] = contents[i]; // Rehash & store every stored element
            }
            delete[] contents; // Delete temp array
        }

        std::vector<std::string> getKeys()
        {
            return keys; // Yeah
        }

        /**
         * Get a pointer to a new array holding every inserted key.
         * Remember to delete.
        */
        std::string *getKeysArray()
        {
            std::string *k = new std::string[keys.size()];
            for (int i = 0; i < keys.size(); i++)
            {
                k[i] = keys[i];
            }

            return k;
        }

        void operator delete(void *p)
        {
            free(p);
        }
    };

    /**
     * A basic implementation of a bloom filter.
     * Bloom filters are probalistic data structures that can confirm if an item is in the set.
     * BFs are prone to false positives but won't produce false negatives.
     * Collision probability can be reduced by a few things:
     * [*] Larger bloom filter
     * [*] Hashing functions
     * [*] Compression functions that actually work (unlike the currently implemented)
     * [*] Multiple compressed hashes for a key
    */
    template <class T>
    class BloomFilter
    {
    private:
        bool *arr; // Contents
        int r1; // Random hashing parameters
        int r2;
        int p; // Prime number larger than arr
        std::hash<T> key_hash; // Base hash of key

        void setLength(int length)
        {
            arr = new bool[length]{false};
        }

        /**
         * Bloom filters need multiple different hashes for the same key to work properly.
         * I made these calculations without any mathematical consideration. That's why they are so bad
        */
        int uniqueHash1(__int128 hash) { return ((hash * r1 << r2) % p) % sizeof(arr); }
        int uniqueHash2(__int128 hash) { return hash % p % sizeof(arr) + r1 - r2; }
        int uniqueHash3(__int128 hash) { return (hash << r2) / r1 % p % sizeof(arr) + r2 << r2 / r1; }

    public:
        // Constructor
        BloomFilter(int length)
        {
            setLength(length); // Set content array
            srand(time(NULL)); // Seed
            r1 = rand() % (length - 30) + 30; // Compression parameters
            r2 = rand() % 9 + 1;
            p = prime::highest(length, length + 500);
        }

        /**
         * Checks if a key is probably inserted into the filter
        */
        bool inFilter(T key)
        {
            __int128 hash = key_hash(key);
            if (arr[uniqueHash1(hash)] && arr[uniqueHash2(hash)] && arr[uniqueHash3(hash)])
                return true;
            return false;
        }

        /**
         * Inserts a key into the filter. Ground breaking I know
        */
        void insert(T key)
        {
            __int128 hash = key_hash(key);
            arr[uniqueHash1(hash)] = true;
            arr[uniqueHash2(hash)] = true;
            arr[uniqueHash3(hash)] = true;
        }

        void operator delete(void *p)
        {
            free(p);
        }
    };
}
#endif
 
Last edited:

Directional Vector

varishangout.com
Kinda related but I guess I just need a place to rant about troons.
I was looking into brute forcing shinigami eyes extension's bloom filter which turned out to be more difficult than expected.
  • God I hate JavaScript so much
    • Ok it's actually TypeScript but my point still stands
  • I have only worked with browser extensions on a very basic level and I don't understand some magic that's going on
    • From the code it looks like the extension is reading local files? Am I retarded or shouldn't they not be able to do that?
  • The troon who made this pile of surprisingly popular shit didn't even bother to make the source readable.
The last point is the most retarded one obviously. How the fuck am I supposed to understand what you were thinking if you refuse to make a fucking comments or god forbid, split your code in multiple fucking functions with descriptive names?
Whoever made this is apparently too retarded to use a debugger and is therefore printing everything to the fucking console. This wouldn't be so bad if the troon would invest half a fucking minute to remove unused code after debugging instead of commenting it out. You are already using source control why won't you just delete unused lines you fucking monkey?

Now you might be thinking something along the lines of "You fucking faggot look at your previous posts you did this shit yourself what are you complaining about?" or maybe just "You fucking faggot". However I didn't post my shit expecting someone else to work with it and I didn't publish my shit to fucking github.

I'm serious. Look at this:
JavaScript:
function getIdentifierFromElementImpl(element: HTMLAnchorElement, originalTarget: HTMLElement): string {
    if (!element) return null;

    const dataset = element.dataset;

    if (hostname == 'reddit.com') {
        const parent = element.parentElement;
        if (parent && parent.classList.contains('domain') && element.textContent.startsWith('self.')) return null;
    } else if (hostname == 'disqus.com') {
        if (element.classList && element.classList.contains('time-ago')) return null;
    } else if (hostname == 'facebook.com') {
        const parent = element.parentElement;
        const firstChild = <HTMLElement>element.firstChild;
        if (parent && (parent.tagName == 'H1' || parent.id == 'fb-timeline-cover-name')) {
            const id = getCurrentFacebookPageId();
            //console.log('Current fb page: ' + id)
            if (id)
                return 'facebook.com/' + id;
        }

        // comment timestamp
        if (firstChild && firstChild.tagName == 'ABBR' && element.lastChild == firstChild) return null;

        // post 'see more'
        if (element.classList.contains('see_more_link')) return null;

        // post 'continue reading'
        if (parent && parent.classList.contains('text_exposed_link')) return null;

        // React comment timestamp
        if (parent && parent.tagName == 'LI') return null;

        // React post timestamp
        if (element.getAttribute('role') == 'link' && parent && parent.tagName == 'SPAN' && firstChild && firstChild.tagName == 'SPAN' && firstChild.tabIndex == 0)
            return null;

        // React big profile picture (user or page)
        if (originalTarget instanceof SVGImageElement && isFacebookPictureLink(element) && !getMatchingAncestorByCss(element, '[role=article]')) {
            return getIdentifier(window.location.href);
        }

        // React cover picture
        if (originalTarget instanceof HTMLImageElement && isFacebookPictureLink(element) && element.getAttribute('aria-label') && !getMatchingAncestorByCss(element, '[role=article]')) {
            return getIdentifier(window.location.href);
        }

        if (dataset) {
            const hovercard = dataset.hovercard;
            if (hovercard) {
                const id = captureRegex(hovercard, /id=(\d+)/);
                if (id)
                    return 'facebook.com/' + id;
            }

            // post Comments link
            if (dataset.testid == 'UFI2CommentsCount/root') return null;

            // notification
            if (dataset.testid == 'notif_list_item_link') return null;

            // post Comments link
            if (dataset.commentPreludeRef) return null;

            // page left sidebar
            if (dataset.endpoint) return null;

            // profile tabs
            if (dataset.tabKey) return null;

            const gt = dataset.gt;
            if (gt) {
                const gtParsed = JSON.parse(gt);
                if (gtParsed.engagement && gtParsed.engagement.eng_tid) {
                    return 'facebook.com/' + gtParsed.engagement.eng_tid;
                }
            }

            // comment interaction buttons
            if (dataset.sigil) return null;

            let p = <HTMLElement>element;
            while (p) {
                const bt = p.dataset.bt;
                if (bt) {
                    const btParsed = JSON.parse(bt);
                    if (btParsed.id) return 'facebook.com/' + btParsed.id;
                }
                p = p.parentElement;
            }
        }
    } else if (hostname == 'twitter.com') {
        if (dataset && dataset.expandedUrl) return getIdentifier(dataset.expandedUrl);
        if (element.href.startsWith('https://t.co/')) {
            const title = element.title;
            if (title && (title.startsWith('http://') || title.startsWith('https://')))
                return getIdentifier(title);
            const content = element.textContent;
            if (!content.includes(' ') && content.includes('.') && !content.includes('…')) {
                const url = content.startsWith('http://') || content.startsWith('https://') ? content : 'http://' + content;
                return getIdentifier(url);
            }
        }
    } else if (domainIs(hostname, 'wikipedia.org')) {
        if (element.classList.contains('interlanguage-link-target')) return null;
    }

    if (element.classList.contains('tumblelog')) return element.textContent.replace('@', '') + '.tumblr.com';

    const href = element.href;
    if (href && (!href.endsWith('#') || href.includes('&stick='))) return getIdentifierFromURLImpl(tryParseURL(href));
    return null;
}
That's one hell of a fat function. Surprisingly with a lot of comments. Sadly, most of the if statements DO THE EXACT SAME FUCKING THING. Put all of that shit in a fucking function I'm begging you and maybe start commenting on your logic as well.
The sad part is, that wasn't even the worst fucking function. Look at this lunacy.
JavaScript:
function getIdentifierFromURLImpl(url: URL): string {
    if (!url) return null;

    // nested urls
    const nested = tryUnwrapNestedURL(url);
    if (nested) {
        return getIdentifierFromURLImpl(nested);
    }

    // fb group member badge
    if (url.pathname.includes('/badge_member_list/')) return null;

    let host = url.hostname;
    const searchParams = url.searchParams;
    if (domainIs(host, 'web.archive.org')) {
        const match = captureRegex(url.href, /\/web\/\w+\/(.*)/);
        if (!match) return null;
        return getIdentifierFromURLImpl(tryParseURL('http://' + match));
    }

    if (host.startsWith('www.')) host = host.substring(4);

    const pathArray = url.pathname.split('/');

    if (domainIs(host, 'facebook.com')) {
        if (searchParams.get('story_fbid')) return null;
        const fbId = searchParams.get('id');
        const p = url.pathname.replace('/pg/', '/');
        const isGroup = p.startsWith('/groups/');
        if (isGroup && p.includes('/user/')) return 'facebook.com/' + pathArray[4]; // fb.com/groups/.../user/...
        return 'facebook.com/' + (fbId || getPartialPath(p, isGroup ? 2 : 1).substring(1));
    } else if (domainIs(host, 'reddit.com')) {
        const pathname = url.pathname.replace('/u/', '/user/');
        if (!pathname.startsWith('/user/') && !pathname.startsWith('/r/')) return null;
        if (pathname.includes('/comments/') && hostname == 'reddit.com') return null;
        return 'reddit.com' + getPartialPath(pathname, 2);
    } else if (domainIs(host, 'twitter.com')) {
        return 'twitter.com' + getPartialPath(url.pathname, 1);
    } else if (domainIs(host, 'youtube.com')) {
        const pathname = url.pathname;
        if (pathname.startsWith('/user/') || pathname.startsWith('/c/') || pathname.startsWith('/channel/'))
            return 'youtube.com' + getPartialPath(pathname, 2);
        return 'youtube.com' + getPartialPath(pathname, 1);
    } else if (domainIs(host, 'disqus.com') && url.pathname.startsWith('/by/')) {
        return 'disqus.com' + getPartialPath(url.pathname, 2);
    } else if (domainIs(host, 'medium.com')) {
        const hostParts = host.split('.');
        if (hostParts.length == 3 && hostParts[0] != 'www') {
            return host;
        }
        return 'medium.com' + getPartialPath(url.pathname.replace('/t/', '/'), 1);
    } else if (domainIs(host, 'tumblr.com')) {
        if (url.pathname.startsWith('/register/follow/')) {
            const name = getPathPart(url.pathname, 2);
            return name ? name + '.tumblr.com' : null;
        }
        if (host != 'www.tumblr.com' && host != 'assets.tumblr.com' && host.indexOf('.media.') == -1) {
            if (!url.pathname.startsWith('/tagged/')) return url.host;
        }
        return null;
    } else if (domainIs(host, 'wikipedia.org') || domainIs(host, 'rationalwiki.org')) {
        const pathname = url.pathname;
        if (url.hash) return null;
        if (pathname == '/w/index.php' && searchParams.get('action') == 'edit') {
            const title = searchParams.get('title');
            if (title && title.startsWith('User:')) {
                return 'wikipedia.org/wiki/' + title;
            }
        }
        if (pathname.startsWith('/wiki/Special:Contributions/') && url.href == window.location.href)
            return 'wikipedia.org/wiki/User:' + pathArray[3];
        if (pathname.startsWith('/wiki/User:'))
            return 'wikipedia.org/wiki/User:' + pathArray[2].split(':')[1];
        if (pathname.includes(':')) return null;
        if (pathname.startsWith('/wiki/')) return 'wikipedia.org' + decodeURIComponent(getPartialPath(pathname, 2));
        else return null;
    } else if (host.indexOf('.blogspot.') != -1) {
        const m = captureRegex(host, /([a-zA-Z0-9\-]*)\.blogspot/);
        if (m) return m + '.blogspot.com';
        else return null;
    } else if (host.includes('google.')) {
        if (url.pathname == '/search' && searchParams.get('stick') && !searchParams.get('tbm') && !searchParams.get('start')) {
            const q = searchParams.get('q');
            if (q) return 'wikipedia.org/wiki/' + q.replace(/\s/g, '_');
        }
        return null;
    } else {
        if (host.startsWith('m.')) host = host.substr(2);
        return host;
    }
}
A great total of two (2) comments. Thanks mate that really helped understanding every fucking detail of your fucking function.
Is that what HRT does to your brain? I thought programming socks are supposed to make your code better? I have so many questions.
 

hzp

varishangout.com
You're retarded.
That said, it seems there is one fundamental thing you misunderstood with hashes & hashtables. Yes, hash collisions are inevitable. Yes, minimizing them is great (although that's completely out of scope for someone who's either not doing a PhD. in Group Theory or some autistic CS subfield nor paid a 7digit salary at a FAANG). But: No, it's not okay to not deal with them.
It's hard to explain without being confused, and I'm not sure how comfortable you are with the subject, so my apologies.

Basically, hash collisions will impact performance. They should not, in any case (and that is the case of every general purpose data structure you can come up with) impact correctness.
Currently you have no strategy to deal with collisions, and that strategy is really the main difference between all the different flavours and algorithms of hash tables (some are better than others, some have tradeoffs, etc.). You have to.
If you hash a key, and then a second one, you need someway to differentiate the two values, because they could have the same hash! Collisions are impossible to avoid: you're transforming a piece of data that can have any size (but for the sake of examples they are 128 byte long, like IPv6 addresses, and you're reducing them to an index that can only be 32byte long (64 if you use `size_t`), 2^96 ip addresses will be put at the same index in your array. Stuff actually becomes even more perverse, because obviously you wouldn't crate an array of 2^32 * sizeof(key) in size, so you're doing a modulo. That creates collisions within your hashes.

I recommend you go over the wikipedia article or some other similar resources. They have graphics and explanation about the different strategies. "chaining" being the easiest one.

There's a lot of other things I could comment on (like you should have your hashtable do the resizing, and not the user of your hashtable), but that's really the most important one.
 

Directional Vector

varishangout.com
oh yeah I didn't notice that arr was of type `T *`.
So basically, another big mistake is that: sizeof(arr) doesn't do what you think it does: https://godbolt.org/z/eoPW5Mhr4 .
You're retarded.
That said, it seems there is one fundamental thing you misunderstood with hashes & hashtables. Yes, hash collisions are inevitable. Yes, minimizing them is great (although that's completely out of scope for someone who's either not doing a PhD. in Group Theory or some autistic CS subfield nor paid a 7digit salary at a FAANG). But: No, it's not okay to not deal with them.
It's hard to explain without being confused, and I'm not sure how comfortable you are with the subject, so my apologies.

Basically, hash collisions will impact performance. They should not, in any case (and that is the case of every general purpose data structure you can come up with) impact correctness.
Currently you have no strategy to deal with collisions, and that strategy is really the main difference between all the different flavours and algorithms of hash tables (some are better than others, some have tradeoffs, etc.). You have to.
If you hash a key, and then a second one, you need someway to differentiate the two values, because they could have the same hash! Collisions are impossible to avoid: you're transforming a piece of data that can have any size (but for the sake of examples they are 128 byte long, like IPv6 addresses, and you're reducing them to an index that can only be 32byte long (64 if you use `size_t`), 2^96 ip addresses will be put at the same index in your array. Stuff actually becomes even more perverse, because obviously you wouldn't crate an array of 2^32 * sizeof(key) in size, so you're doing a modulo. That creates collisions within your hashes.

I recommend you go over the wikipedia article or some other similar resources. They have graphics and explanation about the different strategies. "chaining" being the easiest one.

There's a lot of other things I could comment on (like you should have your hashtable do the resizing, and not the user of your hashtable), but that's really the most important one.
I'm going to improve my shit once I've got some spare time on my hands. Because that's going to take some time, feel free to continue taking my code apart.
 

Directional Vector

varishangout.com
Okay so I sat down a few days ago and made a basic implementation of linear probing. There are still some things bugging me but I don't care right know. And the compiler is still crying about me using a variable array length.

C++:
__extension__ typedef __int128 int128; // The compiler wouldn't shut up.
template <class T>
class Container // Container class to store key, status and content
{
private:
  Container(int s) { status = s; }

public:
  T content;
  std::string key;
  int status = 0; // 0 unused; 1 deleted; 2 in use.
  Container() {}
  void setContent(std::string k, T c) {
    content = c;
    key = k;
    status = 2;
  }
  void deleteContainer() { *this = Container(1); }
};

template <class T> class HashTable {
private:
  Container<T> *arr;
  // T *arr; // The content of the map
  std::vector<std::string> keys; // A list of used keys. Used for reallocating
                                 // and returning every key.
  int r1;                        // Random parameter 1
  int r2;                        // Random parameter 2
  int p;                         // Prime number larger than arr
  int size;                      // Array length. Yeah I'm lazy.

  int128 hash(const std::string &key) // Generate a hash based on the key
  {
    int128 hash = 0;
    for (size_t i = 0; i < key.length(); i++) {
      hash = (hash << 5) - hash +
             key[i]; // Hash = hash*31 + key[i] (a lot of collisions don't @ me)
    }
    return hash;
  }

  int compressedHash(int128 hash) {
    return ((r1 * hash + r2) % p) %
           size; // Compress hash into usable index (same as above)
  }

  void resize(int n) {
    T contents[keys.size()]; // Temporary array for every value by key
    for (size_t i = 0; i < keys.size(); i++) {
      contents[i] = get(keys[i]);
    }
    arr = new Container<T>[n]; // Resize array
    for (size_t i = 0; i < keys.size(); i++) {
      insert(keys[i], contents[i]);
    }
  }

public:
  HashTable(int pSize = 150) {
    srand(time(NULL)); // Seed
    size = pSize;
    r1 = rand() % 5000 + 200;      // Set first random parameter
    r2 = rand() % 5000 + 200;      // Set second random parameter
    arr = new Container<T>[pSize]; // Set array to size
    p = prime::highest(
        pSize, pSize + 1000); // Get a (more or less) appropriate prime number
  }

  /**
  Insert `content` at `key`.
  */
  void insert(std::string key, T content) {
    int h = compressedHash(hash(key)); // Hash
    int steps = 0;           // Count steps until empty bucket is located
    Container<T> c = arr[h]; // Container at hash location
    while (c.status == 2) {  // While Container is used ...
      if (c.key != key) {
        c = arr[h == size ? h = 0 : h++]; // Increment Hash by 1 or back to 0
        steps++;
      } else // Same key => overwrite container
        break;
    }
    c.setContent(key, content);
    arr[h] = c; // Maybe I should have used pointers.
    if (steps > size * 0.75)
      resize(size + size * 0.5);
  }

  /**
  Retrieve content of type `T` at `key`
  */
  T get(std::string key) {
    int h = compressedHash(hash(key));
    int steps = 0;
    Container<T> c = arr[h];
    while (c.key != key) {
      if (c.status == 0)
        // The content corresponding to the given key can't be stored after
        // an unused cell. Therefore the key isn't present in the set.
        throw std::out_of_range("key not present");
      {
        c = arr[h == size ? h = 0 : h++];
        steps++;
      }
    }
    return c.content;
  }

  /**
  Remove content at `key`.
  */
  void remove(std::string key) {
    int h = compressedHash(hash(key));
    Container<T> c = arr[h];
    while (c.key != key) {
      if (c.status == 0)
        throw std::out_of_range("key not present");
      c = arr[h == size ? h = 0 : h++];
    }

    std::vector<std::string>::iterator el = keys.begin(); // First element
    for (size_t i = 0; i < keys.size(); i++)              // For every element
    {
      if (keys[i] == key) {
        keys.erase(el);
        return;
      }
      el++; // Next element
    }
    c.deleteContainer();
    arr[h] = c; // I really should have used pointers
  }
};
Please don't cry over the comment formatting, the plug in's auto format is responsible and I don't really care about it either way.
 
Top