// Package nilsimsa is a Go implementation of Nilsimsa, ported from // code.google.com/p/py-nilsimsa but follows the conventions established by the md5 package in // the standard lib // // The Java implementation at // https://github.com/weblyzard/nilsimsa/ // blob/master/src/main/java/com/weblyzard/lib/string/nilsimsa/Nilsimsa.java // was also consulted. // // There is a discussion about using hash to score string similarities // http://stackoverflow.com/questions/4323977/string-similarity-score-hash // // Copyright 2015 Sheng-Te Tsao. All rights reserved. // // Use of this source code is governed by the same BSD-style license that is used by the Go // standard library // // From http://en.wikipedia.org/wiki/Nilsimsa_Hash // // Nilsimsa is an anti-spam focused locality-sensitive hashing algorithm originally proposed by // the cmeclax remailer operator in 2001[1] and then reviewed by Damiani et al. in their 2004 // paper titled, "An Open Digest-based Technique for Spam Detection".[2] // // The goal of Nilsimsa is to generate a hash Digest of an email message such that the digests // of two similar messages are similar to each other. In comparison with cryptographic hash // functions such as SHA-1 or MD5, making a small modification to a document does not // substantially change the resulting hash of the document. The paper suggests that the Nilsimsa // satisfies three requirements: // // 1. The Digest identifying each message should not vary significantly (sic) // for changes that can be produced automatically. // 2. The encoding must be robust against intentional attacks. // 3. The encoding should support an extremely low risk of false positives. // // Subsequent testing[3] on a range of file types identified the Nilsimsa hash as having a // significantly higher false positive rate when compared to other similarity Digest schemes // such as TLSH, Ssdeep and Sdhash. // // Nilsimsa similarity matching was taken in consideration by Jesse Kornblum when developing the // fuzzy hashing in 2006,[4] that used the algorithms of spamsum by Andrew Tridgell (2002).[5] // // References: // // [1] http://web.archive.org/web/20050707005338/ // // http://ixazon.dynip.com/~cmeclax/nilsimsa-0.2.4.tar.gz // // [2] http://spdp.di.unimi.it/papers/pdcs04.pdf // [3] https://www.academia.edu/7833902/TLSH_-A_Locality_Sensitive_Hash // [4] http://jessekornblum.livejournal.com/242493.html // [5] http://dfrws.org/2006/proceedings/12-Kornblum.pdf // // From http://blog.semanticlab.net/tag/nilsimsa/ // // # An Open Digest-based Technique for Spam Detection // // Damiani, E. et al., 2004. An Open Digest-based Technique for Spam Detection. In in // Proceedings of the 2004 International Workshop on Security in Parallel and Distributed // Systems. pp. 15-17. // // # Summary // // This paper discusses the Nilsimsa open Digest hash algorithm which is frequently used for // Spam detection. The authors describe the computation of the 32-byte code, discuss different // attack scenarios and measures to counter them. // // Digest computation // // 1. Slide a five character window through the input text and compute all // eight possible tri-grams for each window (e.g. "igram" yields "igr", // "gra", "ram", "ira", "iam", "grm", ...) // // 2. Hash these trigrams using a hash function h() which maps every tri-gram // to one of 256 accumulators and increment the corresponding // accumulator. Nilsimsa uses the Trans53 hash function for hashing. // // 3. At the end of the process described below, compute the expected value // of the accumulators and set the bits which correspond to each accumulator // either to (1) if it exceeds this threshold or (o) otherwise. // // # Similarity computation // // The Nilsimsa similarity is computed based on the bitwise difference between two Nilsimsa // hashes. Documents are considered similar if they exceed a pre-defined similarity value. // // 1. >24 similar bits - conflict probability: 1.35E-4 // (suggestions by Nilsimsa's original designer) // // 2. >54 similar bits - conflict probability: 7.39E-12 // (suggested by the article's authors) // // # Attacks // // Spammers can apply multiple techniques to prevent Nilsimsa from detecting duplicates: // // 1. Random addition: requires >300% of additional text to prevent detection. // 2. Thesaurus substitutions: require >20% of replaced text // 3. Perceptive substitution (security > s3cur1ty): requires >15% // of the text to be altered // 4. Aimed attacks (i.e. attacks which specifically target Nilsimsa): // ~10% of the text needs to be altered // // Aimed attacks manipulate Nilsimsa's accumulators by adding words which introduce new // tri-grams that specifically alter the hash value. Although these attacks are relatively // effective, they can be easily circumvented by computing the Nilsimsa hash twice with // different hash functions. package nilsimsa import ( "fmt" "hash" "strconv" ) // Size of an Nilsimsa hash in bytes. const Size = 32 // BlockSize of Nilsimsa in bytes (not sure what values is best...?) const BlockSize = 8 type Digest struct { count int64 // number of characters that we have come across acc [256]int64 // 256-bit vector to hold the results of the Digest c0, c1, c2, c3 byte // last 4 characters from previous call to Write } // New create a new Nilsimsa hash diget func New() hash.Hash { d := new(Digest) // Note that no memory is allocate other than the struct itself. It is better to embed // last4Array into the struct itself since it's maximum size is know already // d.last4 = d.last4Array[:0] //creating the slice by re-slicing last4Array return d } func (d *Digest) Reset() { // It is probably faster to just call New() again rather than to re-use an existing struct // by calling New() because presumably the compiler does a better job of zeroing the whole // struct than doing the manual zeroing via copy(d.acc, zero) d.count = 0 for i := range d.acc { d.acc[i] = 0 } } var zero = make([]int64, 256) func (d *Digest) Size() int { return Size } func (d *Digest) BlockSize() int { return BlockSize } func (d *Digest) Sum(in []byte) []byte { digest := ComputeDigest(&d.acc, d.count) return append(in, digest[:]...) } // ComputeDigest uses a threshold (mean of the accumulator) to computes the Nilsimsa Digest func ComputeDigest(acc *[256]int64, count int64) [Size]byte { var trigrams int64 = 0 if count == 3 { trigrams = 1 } else if count == 4 { trigrams = 4 } else if count > 4 { // > 4 chars -> 8 for each char trigrams = 8*count - 28 } // threshold is the mean of the acc buckets threshold := trigrams / 256 var digest [Size]byte for i := uint(0); i < 256; i++ { if acc[i] > threshold { digest[i>>3] += 1 << (i & 7) // equivalent to i/8, 2**(i mod 7) } } // Reverse the Digest for i, j := 0, 31; i < j; i++ { digest[i], digest[j] = digest[j], digest[i] j-- } return digest } // Update the Nilsimsa accumulator with the data contained in // chunk using a 5 bytes sliding window. // // In general it is easier to just use Sum(data []byte) func Update(chunk []byte, count int64, c0, c1, c2, c3 byte, acc *[256]int64) (int64, byte, byte, byte, byte) { for _, c4 := range chunk { count++ if count > 4 { // seen at least 5, so have full 5 bytes window // These and c4 form the 5 bytes sliding window acc[tran53(c4, c0, c1, 0)]++ acc[tran53(c4, c0, c2, 1)]++ acc[tran53(c4, c1, c2, 2)]++ acc[tran53(c4, c0, c3, 3)]++ acc[tran53(c4, c1, c3, 4)]++ acc[tran53(c4, c2, c3, 5)]++ // duplicate hashes, used to maintain 8 trigrams per character acc[tran53(c3, c0, c4, 6)]++ acc[tran53(c3, c2, c4, 7)]++ // Drop off c3 and put c4 at the front of the sliding window c0, c1, c2, c3 = c4, c0, c1, c2 } else if count > 3 { // seen at least 4 bytes acc[tran53(c4, c0, c1, 0)]++ acc[tran53(c4, c0, c2, 1)]++ acc[tran53(c4, c1, c2, 2)]++ c0, c1, c2, c3 = c4, c0, c1, c2 } else if count > 2 { // seen at least 3 bytes acc[tran53(c4, c0, c1, 0)]++ c0, c1, c2 = c4, c0, c1 } else if count > 1 { c0, c1 = c4, c0 } else { c0 = c4 } } // Return the sliding window which should be save by the caller. // It ok to save values that have not been set because they will not be // used due to check for counter > x inside the loop return count, c0, c1, c2, c3 } /* This version is obsolete after the introduction of the Update() function following the pattern used in package hash/crc32 func Sum(data []byte) [Size]byte { // There is no need to allocate on the heap using array := [4]byte and // then create the slice using last4 := array[:] because this is // done by the compiler automatically through escape analysis // http://grokbase.com/t/gg/golang-nuts/142cqce51f/ // go-nuts-allocation-optimization-stack-vs-heap // // last4 := make([]byte, 0, 4) // acc := [256]int64 // count, _ := update(data, 0, acc, last4) // // Now that both accArray and last4Array are embedded into Digest struct // there is reason not to use New() directly and just call Write(). // Escape analysis should create the Digest struct on the stack rather // than allocating it on the heap d := New() d.Write(data) return ComputeDigest(&d.acc, d.count) } */ // Write computes the hash of all of the trigrams in the chunk // using a sliding window of length 5 // // It is part of the hash.Hash interface func (d *Digest) Write(chunk []byte) (int, error) { // Load up sliding window with values from values set by previous Write. // it is ok even if some of the values are not valid because by checking for // count > x inside the loop the invalid values are not being used d.count, d.c0, d.c1, d.c2, d.c3 = Update(chunk, d.count, d.c0, d.c1, d.c2, d.c3, &d.acc) return len(chunk), nil } // Sum returns the Nilsimsa Digest of the data. // Like Sum() for MD5, it returns [Size]byte by value rather than a slice. // This way the return value does not need to be allocated on the head // so there is no garbage collection later. // // To use the result as a slice when using it as as a function parameter, // simply re-slice it using the digest[:] syntax // See http://blog.golang.org/go-slices-usage-and-internals // and search for: "slicing" an existing slice or array func Sum(data []byte) (digest [Size]byte, err error) { if len(data) < 5 { err = fmt.Errorf("cannot compute checksum of data ") } var acc [256]int64 count, _, _, _, _ := Update(data, 0, 0, 0, 0, 0, &acc) return ComputeDigest(&acc, count), nil } // HexSum returns the Nilsimsa Digest of the data as a hex string func HexSum(data []byte) (hexDigest string, err error) { var sum [Size]byte if sum, err = Sum(data); err != nil { return } return fmt.Sprintf("%x", sum), nil } var tran = [256]byte{ 0x2, 0xD6, 0x9E, 0x6F, 0xF9, 0x1D, 0x04, 0xAB, 0xD0, 0x22, 0x16, 0x1F, 0xD8, 0x73, 0xA1, 0xAC, 0x3B, 0x70, 0x62, 0x96, 0x1E, 0x6E, 0x8F, 0x39, 0x9D, 0x05, 0x14, 0x4A, 0xA6, 0xBE, 0xAE, 0x0E, 0xCF, 0xB9, 0x9C, 0x9A, 0xC7, 0x68, 0x13, 0xE1, 0x2D, 0xA4, 0xEB, 0x51, 0x8D, 0x64, 0x6B, 0x50, 0x23, 0x80, 0x03, 0x41, 0xEC, 0xBB, 0x71, 0xCC, 0x7A, 0x86, 0x7F, 0x98, 0xF2, 0x36, 0x5E, 0xEE, 0x8E, 0xCE, 0x4F, 0xB8, 0x32, 0xB6, 0x5F, 0x59, 0xDC, 0x1B, 0x31, 0x4C, 0x7B, 0xF0, 0x63, 0x01, 0x6C, 0xBA, 0x07, 0xE8, 0x12, 0x77, 0x49, 0x3C, 0xDA, 0x46, 0xFE, 0x2F, 0x79, 0x1C, 0x9B, 0x30, 0xE3, 0x00, 0x06, 0x7E, 0x2E, 0x0F, 0x38, 0x33, 0x21, 0xAD, 0xA5, 0x54, 0xCA, 0xA7, 0x29, 0xFC, 0x5A, 0x47, 0x69, 0x7D, 0xC5, 0x95, 0xB5, 0xF4, 0x0B, 0x90, 0xA3, 0x81, 0x6D, 0x25, 0x55, 0x35, 0xF5, 0x75, 0x74, 0x0A, 0x26, 0xBF, 0x19, 0x5C, 0x1A, 0xC6, 0xFF, 0x99, 0x5D, 0x84, 0xAA, 0x66, 0x3E, 0xAF, 0x78, 0xB3, 0x20, 0x43, 0xC1, 0xED, 0x24, 0xEA, 0xE6, 0x3F, 0x18, 0xF3, 0xA0, 0x42, 0x57, 0x08, 0x53, 0x60, 0xC3, 0xC0, 0x83, 0x40, 0x82, 0xD7, 0x09, 0xBD, 0x44, 0x2A, 0x67, 0xA8, 0x93, 0xE0, 0xC2, 0x56, 0x9F, 0xD9, 0xDD, 0x85, 0x15, 0xB4, 0x8A, 0x27, 0x28, 0x92, 0x76, 0xDE, 0xEF, 0xF8, 0xB2, 0xB7, 0xC9, 0x3D, 0x45, 0x94, 0x4B, 0x11, 0x0D, 0x65, 0xD5, 0x34, 0x8B, 0x91, 0x0C, 0xFA, 0x87, 0xE9, 0x7C, 0x5B, 0xB1, 0x4D, 0xE5, 0xD4, 0xCB, 0x10, 0xA2, 0x17, 0x89, 0xBC, 0xDB, 0xB0, 0xE2, 0x97, 0x88, 0x52, 0xF7, 0x48, 0xD3, 0x61, 0x2C, 0x3A, 0x2B, 0xD1, 0x8C, 0xFB, 0xF1, 0xCD, 0xE4, 0x6A, 0xE7, 0xA9, 0xFD, 0xC4, 0x37, 0xC8, 0xD2, 0xF6, 0xDF, 0x58, 0x72, 0x4E, } // tran53 implements the tran53 hash function, which is an // accumulator for a transition n between the chars a, b, c func tran53(a, b, c, n byte) byte { return ((tran[(a+n)&0xff] ^ tran[b]*(n+n+1)) + tran[0xff&c^tran[n]]) & 0xff } // Shortcut to compute the Hamming distance between two bit vector // representations of integers. // // popc - population count // popc[x] = number of 1's in binary representation of x // popc[a ^b] = hamming distance from a to b var popc = [256]byte{ 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, } // BitsDiffSlice compares two Nilsimsa digests slices and return the number of bits that differ. func BitsDiffSlice(n1, n2 []byte) byte { var bits byte for i := 0; i < Size; i++ { bits += popc[0xff&n1[i]^n2[i]] } return 128 - bits } // BitsDiff compares two Nilsimsa Digest arrays and return the number of bits that differ. func BitsDiff(n1, n2 *[Size]byte) byte { var bits byte for i := 0; i < Size; i++ { bits += popc[0xff&n1[i]^n2[i]] } return 128 - bits } // BitsDiffHex compares two Nilsimsa digests hex strings and return the number of bits that // differ func BitsDiffHex(n1, n2 string) byte { var bits byte if len(n1) != Size*2 { panic("len(n1) != 64") } if len(n2) != 32*2 { panic("len(n2) != 64") } for i, j := 0, 2; i < Size*2; j += 2 { x, err := strconv.ParseInt(n1[i:j], 16, 16) if err != nil { panic(err) } y, err := strconv.ParseInt(n2[i:j], 16, 16) if err != nil { panic(err) } bits += popc[0xff&x^y] i = j } return 128 - bits }