Corrected a typo in a comment within `nilsimsa.go`. Introduced a new `Evaluate` function in `evaluate.go` to compute the bitwise difference between two Nilsimsa hash values, ensuring input validation and proper bit counting logic.
413 lines
15 KiB
Go
413 lines
15 KiB
Go
// Package nilsimsa is a Go implementation of Nilsimsa, ported from
|
|
// code.google.com/p/py-nilsimsa but follows the conventions established by the md5 package in
|
|
// the standard lib
|
|
//
|
|
// The Java implementation at
|
|
// https://github.com/weblyzard/nilsimsa/
|
|
// blob/master/src/main/java/com/weblyzard/lib/string/nilsimsa/Nilsimsa.java
|
|
// was also consulted.
|
|
//
|
|
// There is a discussion about using hash to score string similarities
|
|
// http://stackoverflow.com/questions/4323977/string-similarity-score-hash
|
|
//
|
|
// Copyright 2015 Sheng-Te Tsao. All rights reserved.
|
|
//
|
|
// Use of this source code is governed by the same BSD-style license that is used by the Go
|
|
// standard library
|
|
//
|
|
// From http://en.wikipedia.org/wiki/Nilsimsa_Hash
|
|
//
|
|
// Nilsimsa is an anti-spam focused locality-sensitive hashing algorithm originally proposed by
|
|
// the cmeclax remailer operator in 2001[1] and then reviewed by Damiani et al. in their 2004
|
|
// paper titled, "An Open Digest-based Technique for Spam Detection".[2]
|
|
//
|
|
// The goal of Nilsimsa is to generate a hash Digest of an email message such that the digests
|
|
// of two similar messages are similar to each other. In comparison with cryptographic hash
|
|
// functions such as SHA-1 or MD5, making a small modification to a document does not
|
|
// substantially change the resulting hash of the document. The paper suggests that the Nilsimsa
|
|
// satisfies three requirements:
|
|
//
|
|
// 1. The Digest identifying each message should not vary significantly (sic)
|
|
// for changes that can be produced automatically.
|
|
// 2. The encoding must be robust against intentional attacks.
|
|
// 3. The encoding should support an extremely low risk of false positives.
|
|
//
|
|
// Subsequent testing[3] on a range of file types identified the Nilsimsa hash as having a
|
|
// significantly higher false positive rate when compared to other similarity Digest schemes
|
|
// such as TLSH, Ssdeep and Sdhash.
|
|
//
|
|
// Nilsimsa similarity matching was taken in consideration by Jesse Kornblum when developing the
|
|
// fuzzy hashing in 2006,[4] that used the algorithms of spamsum by Andrew Tridgell (2002).[5]
|
|
//
|
|
// References:
|
|
//
|
|
// [1] http://web.archive.org/web/20050707005338/
|
|
//
|
|
// http://ixazon.dynip.com/~cmeclax/nilsimsa-0.2.4.tar.gz
|
|
//
|
|
// [2] http://spdp.di.unimi.it/papers/pdcs04.pdf
|
|
// [3] https://www.academia.edu/7833902/TLSH_-A_Locality_Sensitive_Hash
|
|
// [4] http://jessekornblum.livejournal.com/242493.html
|
|
// [5] http://dfrws.org/2006/proceedings/12-Kornblum.pdf
|
|
//
|
|
// From http://blog.semanticlab.net/tag/nilsimsa/
|
|
//
|
|
// # An Open Digest-based Technique for Spam Detection
|
|
//
|
|
// Damiani, E. et al., 2004. An Open Digest-based Technique for Spam Detection. In in
|
|
// Proceedings of the 2004 International Workshop on Security in Parallel and Distributed
|
|
// Systems. pp. 15-17.
|
|
//
|
|
// # Summary
|
|
//
|
|
// This paper discusses the Nilsimsa open Digest hash algorithm which is frequently used for
|
|
// Spam detection. The authors describe the computation of the 32-byte code, discuss different
|
|
// attack scenarios and measures to counter them.
|
|
//
|
|
// Digest computation
|
|
//
|
|
// 1. Slide a five character window through the input text and compute all
|
|
// eight possible tri-grams for each window (e.g. "igram" yields "igr",
|
|
// "gra", "ram", "ira", "iam", "grm", ...)
|
|
//
|
|
// 2. Hash these trigrams using a hash function h() which maps every tri-gram
|
|
// to one of 256 accumulators and increment the corresponding
|
|
// accumulator. Nilsimsa uses the Trans53 hash function for hashing.
|
|
//
|
|
// 3. At the end of the process described below, compute the expected value
|
|
// of the accumulators and set the bits which correspond to each accumulator
|
|
// either to (1) if it exceeds this threshold or (o) otherwise.
|
|
//
|
|
// # Similarity computation
|
|
//
|
|
// The Nilsimsa similarity is computed based on the bitwise difference between two Nilsimsa
|
|
// hashes. Documents are considered similar if they exceed a pre-defined similarity value.
|
|
//
|
|
// 1. >24 similar bits - conflict probability: 1.35E-4
|
|
// (suggestions by Nilsimsa's original designer)
|
|
//
|
|
// 2. >54 similar bits - conflict probability: 7.39E-12
|
|
// (suggested by the article's authors)
|
|
//
|
|
// # Attacks
|
|
//
|
|
// Spammers can apply multiple techniques to prevent Nilsimsa from detecting duplicates:
|
|
//
|
|
// 1. Random addition: requires >300% of additional text to prevent detection.
|
|
// 2. Thesaurus substitutions: require >20% of replaced text
|
|
// 3. Perceptive substitution (security > s3cur1ty): requires >15%
|
|
// of the text to be altered
|
|
// 4. Aimed attacks (i.e. attacks which specifically target Nilsimsa):
|
|
// ~10% of the text needs to be altered
|
|
//
|
|
// Aimed attacks manipulate Nilsimsa's accumulators by adding words which introduce new
|
|
// tri-grams that specifically alter the hash value. Although these attacks are relatively
|
|
// effective, they can be easily circumvented by computing the Nilsimsa hash twice with
|
|
// different hash functions.
|
|
package nilsimsa
|
|
|
|
import (
|
|
"fmt"
|
|
"hash"
|
|
"strconv"
|
|
)
|
|
|
|
// Size of an Nilsimsa hash in bytes.
|
|
const Size = 32
|
|
|
|
// BlockSize of Nilsimsa in bytes (not sure what values is best...?)
|
|
const BlockSize = 8
|
|
|
|
type Digest struct {
|
|
count int64 // number of characters that we have come across
|
|
acc [256]int64 // 256-bit vector to hold the results of the Digest
|
|
c0, c1, c2, c3 byte // last 4 characters from previous call to Write
|
|
}
|
|
|
|
// New create a new Nilsimsa hash diget
|
|
func New() hash.Hash {
|
|
d := new(Digest)
|
|
// Note that no memory is allocate other than the struct itself. It is better to embed
|
|
// last4Array into the struct itself since it's maximum size is know already
|
|
// d.last4 = d.last4Array[:0] //creating the slice by re-slicing last4Array
|
|
return d
|
|
}
|
|
|
|
func (d *Digest) Reset() {
|
|
// It is probably faster to just call New() again rather than to re-use an existing struct
|
|
// by calling New() because presumably the compiler does a better job of zeroing the whole
|
|
// struct than doing the manual zeroing via copy(d.acc, zero)
|
|
d.count = 0
|
|
for i := range d.acc {
|
|
d.acc[i] = 0
|
|
}
|
|
}
|
|
|
|
var zero = make([]int64, 256)
|
|
|
|
func (d *Digest) Size() int { return Size }
|
|
|
|
func (d *Digest) BlockSize() int { return BlockSize }
|
|
|
|
func (d *Digest) Sum(in []byte) []byte {
|
|
digest := ComputeDigest(&d.acc, d.count)
|
|
return append(in, digest[:]...)
|
|
}
|
|
|
|
// ComputeDigest uses a threshold (mean of the accumulator) to computes the Nilsimsa Digest
|
|
func ComputeDigest(acc *[256]int64, count int64) [Size]byte {
|
|
var trigrams int64 = 0
|
|
if count == 3 {
|
|
trigrams = 1
|
|
} else if count == 4 {
|
|
trigrams = 4
|
|
} else if count > 4 { // > 4 chars -> 8 for each char
|
|
trigrams = 8*count - 28
|
|
}
|
|
// threshold is the mean of the acc buckets
|
|
threshold := trigrams / 256
|
|
|
|
var digest [Size]byte
|
|
for i := uint(0); i < 256; i++ {
|
|
if acc[i] > threshold {
|
|
digest[i>>3] += 1 << (i & 7) // equivalent to i/8, 2**(i mod 7)
|
|
}
|
|
}
|
|
// Reverse the Digest
|
|
for i, j := 0, 31; i < j; i++ {
|
|
digest[i], digest[j] = digest[j], digest[i]
|
|
j--
|
|
}
|
|
return digest
|
|
}
|
|
|
|
// Update the Nilsimsa accumulator with the data contained in
|
|
// chunk using a 5 bytes sliding window.
|
|
//
|
|
// In general it is easier to just use Sum(data []byte)
|
|
func Update(chunk []byte, count int64, c0, c1, c2, c3 byte,
|
|
acc *[256]int64) (int64, byte, byte, byte, byte) {
|
|
for _, c4 := range chunk {
|
|
count++
|
|
if count > 4 { // seen at least 5, so have full 5 bytes window
|
|
// These and c4 form the 5 bytes sliding window
|
|
acc[tran53(c4, c0, c1, 0)]++
|
|
acc[tran53(c4, c0, c2, 1)]++
|
|
acc[tran53(c4, c1, c2, 2)]++
|
|
acc[tran53(c4, c0, c3, 3)]++
|
|
acc[tran53(c4, c1, c3, 4)]++
|
|
acc[tran53(c4, c2, c3, 5)]++
|
|
// duplicate hashes, used to maintain 8 trigrams per character
|
|
acc[tran53(c3, c0, c4, 6)]++
|
|
acc[tran53(c3, c2, c4, 7)]++
|
|
// Drop off c3 and put c4 at the front of the sliding window
|
|
c0, c1, c2, c3 = c4, c0, c1, c2
|
|
} else if count > 3 { // seen at least 4 bytes
|
|
acc[tran53(c4, c0, c1, 0)]++
|
|
acc[tran53(c4, c0, c2, 1)]++
|
|
acc[tran53(c4, c1, c2, 2)]++
|
|
c0, c1, c2, c3 = c4, c0, c1, c2
|
|
} else if count > 2 { // seen at least 3 bytes
|
|
acc[tran53(c4, c0, c1, 0)]++
|
|
c0, c1, c2 = c4, c0, c1
|
|
} else if count > 1 {
|
|
c0, c1 = c4, c0
|
|
} else {
|
|
c0 = c4
|
|
}
|
|
}
|
|
|
|
// Return the sliding window which should be save by the caller.
|
|
// It ok to save values that have not been set because they will not be
|
|
// used due to check for counter > x inside the loop
|
|
return count, c0, c1, c2, c3
|
|
}
|
|
|
|
/* This version is obsolete after the introduction of the Update() function
|
|
following the pattern used in package hash/crc32
|
|
func Sum(data []byte) [Size]byte {
|
|
// There is no need to allocate on the heap using array := [4]byte and
|
|
// then create the slice using last4 := array[:] because this is
|
|
// done by the compiler automatically through escape analysis
|
|
// http://grokbase.com/t/gg/golang-nuts/142cqce51f/
|
|
// go-nuts-allocation-optimization-stack-vs-heap
|
|
//
|
|
// last4 := make([]byte, 0, 4)
|
|
// acc := [256]int64
|
|
// count, _ := update(data, 0, acc, last4)
|
|
//
|
|
// Now that both accArray and last4Array are embedded into Digest struct
|
|
// there is reason not to use New() directly and just call Write().
|
|
// Escape analysis should create the Digest struct on the stack rather
|
|
// than allocating it on the heap
|
|
d := New()
|
|
d.Write(data)
|
|
return ComputeDigest(&d.acc, d.count)
|
|
}
|
|
*/
|
|
|
|
// Write computes the hash of all of the trigrams in the chunk
|
|
// using a sliding window of length 5
|
|
//
|
|
// It is part of the hash.Hash interface
|
|
func (d *Digest) Write(chunk []byte) (int, error) {
|
|
// Load up sliding window with values from values set by previous Write.
|
|
// it is ok even if some of the values are not valid because by checking for
|
|
// count > x inside the loop the invalid values are not being used
|
|
d.count, d.c0, d.c1, d.c2, d.c3 = Update(chunk, d.count,
|
|
d.c0, d.c1, d.c2, d.c3, &d.acc)
|
|
return len(chunk), nil
|
|
}
|
|
|
|
// Sum returns the Nilsimsa Digest of the data.
|
|
// Like Sum() for MD5, it returns [Size]byte by value rather than a slice.
|
|
// This way the return value does not need to be allocated on the head
|
|
// so there is no garbage collection later.
|
|
//
|
|
// To use the result as a slice when using it as as a function parameter,
|
|
// simply re-slice it using the digest[:] syntax
|
|
// See http://blog.golang.org/go-slices-usage-and-internals
|
|
// and search for: "slicing" an existing slice or array
|
|
func Sum(data []byte) (digest [Size]byte, err error) {
|
|
if len(data) < 5 {
|
|
err = fmt.Errorf("cannot compute checksum of data ")
|
|
}
|
|
var acc [256]int64
|
|
count, _, _, _, _ := Update(data, 0, 0, 0, 0, 0, &acc)
|
|
return ComputeDigest(&acc, count), nil
|
|
}
|
|
|
|
// HexSum returns the Nilsimsa Digest of the data as a hex string
|
|
func HexSum(data []byte) (hexDigest string, err error) {
|
|
var sum [Size]byte
|
|
if sum, err = Sum(data); err != nil {
|
|
return
|
|
}
|
|
return fmt.Sprintf("%x", sum), nil
|
|
}
|
|
|
|
var tran = [256]byte{
|
|
0x2, 0xD6, 0x9E, 0x6F, 0xF9, 0x1D, 0x04, 0xAB,
|
|
0xD0, 0x22, 0x16, 0x1F, 0xD8, 0x73, 0xA1, 0xAC,
|
|
0x3B, 0x70, 0x62, 0x96, 0x1E, 0x6E, 0x8F, 0x39,
|
|
0x9D, 0x05, 0x14, 0x4A, 0xA6, 0xBE, 0xAE, 0x0E,
|
|
0xCF, 0xB9, 0x9C, 0x9A, 0xC7, 0x68, 0x13, 0xE1,
|
|
0x2D, 0xA4, 0xEB, 0x51, 0x8D, 0x64, 0x6B, 0x50,
|
|
0x23, 0x80, 0x03, 0x41, 0xEC, 0xBB, 0x71, 0xCC,
|
|
0x7A, 0x86, 0x7F, 0x98, 0xF2, 0x36, 0x5E, 0xEE,
|
|
0x8E, 0xCE, 0x4F, 0xB8, 0x32, 0xB6, 0x5F, 0x59,
|
|
0xDC, 0x1B, 0x31, 0x4C, 0x7B, 0xF0, 0x63, 0x01,
|
|
0x6C, 0xBA, 0x07, 0xE8, 0x12, 0x77, 0x49, 0x3C,
|
|
0xDA, 0x46, 0xFE, 0x2F, 0x79, 0x1C, 0x9B, 0x30,
|
|
0xE3, 0x00, 0x06, 0x7E, 0x2E, 0x0F, 0x38, 0x33,
|
|
0x21, 0xAD, 0xA5, 0x54, 0xCA, 0xA7, 0x29, 0xFC,
|
|
0x5A, 0x47, 0x69, 0x7D, 0xC5, 0x95, 0xB5, 0xF4,
|
|
0x0B, 0x90, 0xA3, 0x81, 0x6D, 0x25, 0x55, 0x35,
|
|
0xF5, 0x75, 0x74, 0x0A, 0x26, 0xBF, 0x19, 0x5C,
|
|
0x1A, 0xC6, 0xFF, 0x99, 0x5D, 0x84, 0xAA, 0x66,
|
|
0x3E, 0xAF, 0x78, 0xB3, 0x20, 0x43, 0xC1, 0xED,
|
|
0x24, 0xEA, 0xE6, 0x3F, 0x18, 0xF3, 0xA0, 0x42,
|
|
0x57, 0x08, 0x53, 0x60, 0xC3, 0xC0, 0x83, 0x40,
|
|
0x82, 0xD7, 0x09, 0xBD, 0x44, 0x2A, 0x67, 0xA8,
|
|
0x93, 0xE0, 0xC2, 0x56, 0x9F, 0xD9, 0xDD, 0x85,
|
|
0x15, 0xB4, 0x8A, 0x27, 0x28, 0x92, 0x76, 0xDE,
|
|
0xEF, 0xF8, 0xB2, 0xB7, 0xC9, 0x3D, 0x45, 0x94,
|
|
0x4B, 0x11, 0x0D, 0x65, 0xD5, 0x34, 0x8B, 0x91,
|
|
0x0C, 0xFA, 0x87, 0xE9, 0x7C, 0x5B, 0xB1, 0x4D,
|
|
0xE5, 0xD4, 0xCB, 0x10, 0xA2, 0x17, 0x89, 0xBC,
|
|
0xDB, 0xB0, 0xE2, 0x97, 0x88, 0x52, 0xF7, 0x48,
|
|
0xD3, 0x61, 0x2C, 0x3A, 0x2B, 0xD1, 0x8C, 0xFB,
|
|
0xF1, 0xCD, 0xE4, 0x6A, 0xE7, 0xA9, 0xFD, 0xC4,
|
|
0x37, 0xC8, 0xD2, 0xF6, 0xDF, 0x58, 0x72, 0x4E,
|
|
}
|
|
|
|
// tran53 implements the tran53 hash function, which is an
|
|
// accumulator for a transition n between the chars a, b, c
|
|
func tran53(a, b, c, n byte) byte {
|
|
return ((tran[(a+n)&0xff] ^ tran[b]*(n+n+1)) + tran[0xff&c^tran[n]]) & 0xff
|
|
}
|
|
|
|
// Shortcut to compute the Hamming distance between two bit vector
|
|
// representations of integers.
|
|
//
|
|
// popc - population count
|
|
// popc[x] = number of 1's in binary representation of x
|
|
// popc[a ^b] = hamming distance from a to b
|
|
var popc = [256]byte{
|
|
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
|
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
|
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
|
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
|
|
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
|
|
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
|
|
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
|
|
0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08,
|
|
}
|
|
|
|
// BitsDiffSlice compares two Nilsimsa digests slices and return the number of bits that differ.
|
|
func BitsDiffSlice(n1, n2 []byte) byte {
|
|
var bits byte
|
|
for i := 0; i < Size; i++ {
|
|
bits += popc[0xff&n1[i]^n2[i]]
|
|
}
|
|
return 128 - bits
|
|
}
|
|
|
|
// BitsDiff compares two Nilsimsa Digest arrays and return the number of bits that differ.
|
|
func BitsDiff(n1, n2 *[Size]byte) byte {
|
|
var bits byte
|
|
for i := 0; i < Size; i++ {
|
|
bits += popc[0xff&n1[i]^n2[i]]
|
|
}
|
|
return 128 - bits
|
|
}
|
|
|
|
// BitsDiffHex compares two Nilsimsa digests hex strings and return the number of bits that
|
|
// differ
|
|
func BitsDiffHex(n1, n2 string) byte {
|
|
var bits byte
|
|
if len(n1) != Size*2 {
|
|
panic("len(n1) != 64")
|
|
}
|
|
if len(n2) != 32*2 {
|
|
panic("len(n2) != 64")
|
|
}
|
|
for i, j := 0, 2; i < Size*2; j += 2 {
|
|
x, err := strconv.ParseInt(n1[i:j], 16, 16)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
y, err := strconv.ParseInt(n2[i:j], 16, 16)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
bits += popc[0xff&x^y]
|
|
i = j
|
|
}
|
|
return 128 - bits
|
|
}
|