Files
nilsimsa/nilsimsa.go
mleku 4f16b0b2fd Fix typo in comment and add Evaluate function for hash comparison
Corrected a typo in a comment within `nilsimsa.go`. Introduced a new `Evaluate` function in `evaluate.go` to compute the bitwise difference between two Nilsimsa hash values, ensuring input validation and proper bit counting logic.
2025-06-16 14:13:17 +01:00

413 lines
15 KiB
Go

// Package nilsimsa is a Go implementation of Nilsimsa, ported from
// code.google.com/p/py-nilsimsa but follows the conventions established by the md5 package in
// the standard lib
//
// The Java implementation at
// https://github.com/weblyzard/nilsimsa/
// blob/master/src/main/java/com/weblyzard/lib/string/nilsimsa/Nilsimsa.java
// was also consulted.
//
// There is a discussion about using hash to score string similarities
// http://stackoverflow.com/questions/4323977/string-similarity-score-hash
//
// Copyright 2015 Sheng-Te Tsao. All rights reserved.
//
// Use of this source code is governed by the same BSD-style license that is used by the Go
// standard library
//
// From http://en.wikipedia.org/wiki/Nilsimsa_Hash
//
// Nilsimsa is an anti-spam focused locality-sensitive hashing algorithm originally proposed by
// the cmeclax remailer operator in 2001[1] and then reviewed by Damiani et al. in their 2004
// paper titled, "An Open Digest-based Technique for Spam Detection".[2]
//
// The goal of Nilsimsa is to generate a hash Digest of an email message such that the digests
// of two similar messages are similar to each other. In comparison with cryptographic hash
// functions such as SHA-1 or MD5, making a small modification to a document does not
// substantially change the resulting hash of the document. The paper suggests that the Nilsimsa
// satisfies three requirements:
//
// 1. The Digest identifying each message should not vary significantly (sic)
// for changes that can be produced automatically.
// 2. The encoding must be robust against intentional attacks.
// 3. The encoding should support an extremely low risk of false positives.
//
// Subsequent testing[3] on a range of file types identified the Nilsimsa hash as having a
// significantly higher false positive rate when compared to other similarity Digest schemes
// such as TLSH, Ssdeep and Sdhash.
//
// Nilsimsa similarity matching was taken in consideration by Jesse Kornblum when developing the
// fuzzy hashing in 2006,[4] that used the algorithms of spamsum by Andrew Tridgell (2002).[5]
//
// References:
//
// [1] http://web.archive.org/web/20050707005338/
//
// http://ixazon.dynip.com/~cmeclax/nilsimsa-0.2.4.tar.gz
//
// [2] http://spdp.di.unimi.it/papers/pdcs04.pdf
// [3] https://www.academia.edu/7833902/TLSH_-A_Locality_Sensitive_Hash
// [4] http://jessekornblum.livejournal.com/242493.html
// [5] http://dfrws.org/2006/proceedings/12-Kornblum.pdf
//
// From http://blog.semanticlab.net/tag/nilsimsa/
//
// # An Open Digest-based Technique for Spam Detection
//
// Damiani, E. et al., 2004. An Open Digest-based Technique for Spam Detection. In in
// Proceedings of the 2004 International Workshop on Security in Parallel and Distributed
// Systems. pp. 15-17.
//
// # Summary
//
// This paper discusses the Nilsimsa open Digest hash algorithm which is frequently used for
// Spam detection. The authors describe the computation of the 32-byte code, discuss different
// attack scenarios and measures to counter them.
//
// Digest computation
//
// 1. Slide a five character window through the input text and compute all
// eight possible tri-grams for each window (e.g. "igram" yields "igr",
// "gra", "ram", "ira", "iam", "grm", ...)
//
// 2. Hash these trigrams using a hash function h() which maps every tri-gram
// to one of 256 accumulators and increment the corresponding
// accumulator. Nilsimsa uses the Trans53 hash function for hashing.
//
// 3. At the end of the process described below, compute the expected value
// of the accumulators and set the bits which correspond to each accumulator
// either to (1) if it exceeds this threshold or (o) otherwise.
//
// # Similarity computation
//
// The Nilsimsa similarity is computed based on the bitwise difference between two Nilsimsa
// hashes. Documents are considered similar if they exceed a pre-defined similarity value.
//
// 1. >24 similar bits - conflict probability: 1.35E-4
// (suggestions by Nilsimsa's original designer)
//
// 2. >54 similar bits - conflict probability: 7.39E-12
// (suggested by the article's authors)
//
// # Attacks
//
// Spammers can apply multiple techniques to prevent Nilsimsa from detecting duplicates:
//
// 1. Random addition: requires >300% of additional text to prevent detection.
// 2. Thesaurus substitutions: require >20% of replaced text
// 3. Perceptive substitution (security > s3cur1ty): requires >15%
// of the text to be altered
// 4. Aimed attacks (i.e. attacks which specifically target Nilsimsa):
// ~10% of the text needs to be altered
//
// Aimed attacks manipulate Nilsimsa's accumulators by adding words which introduce new
// tri-grams that specifically alter the hash value. Although these attacks are relatively
// effective, they can be easily circumvented by computing the Nilsimsa hash twice with
// different hash functions.
package nilsimsa
import (
"fmt"
"hash"
"strconv"
)
// Size of an Nilsimsa hash in bytes.
const Size = 32
// BlockSize of Nilsimsa in bytes (not sure what values is best...?)
const BlockSize = 8
type Digest struct {
count int64 // number of characters that we have come across
acc [256]int64 // 256-bit vector to hold the results of the Digest
c0, c1, c2, c3 byte // last 4 characters from previous call to Write
}
// New create a new Nilsimsa hash diget
func New() hash.Hash {
d := new(Digest)
// Note that no memory is allocate other than the struct itself. It is better to embed
// last4Array into the struct itself since it's maximum size is know already
// d.last4 = d.last4Array[:0] //creating the slice by re-slicing last4Array
return d
}
func (d *Digest) Reset() {
// It is probably faster to just call New() again rather than to re-use an existing struct
// by calling New() because presumably the compiler does a better job of zeroing the whole
// struct than doing the manual zeroing via copy(d.acc, zero)
d.count = 0
for i := range d.acc {
d.acc[i] = 0
}
}
var zero = make([]int64, 256)
func (d *Digest) Size() int { return Size }
func (d *Digest) BlockSize() int { return BlockSize }
func (d *Digest) Sum(in []byte) []byte {
digest := ComputeDigest(&d.acc, d.count)
return append(in, digest[:]...)
}
// ComputeDigest uses a threshold (mean of the accumulator) to computes the Nilsimsa Digest
func ComputeDigest(acc *[256]int64, count int64) [Size]byte {
var trigrams int64 = 0
if count == 3 {
trigrams = 1
} else if count == 4 {
trigrams = 4
} else if count > 4 { // > 4 chars -> 8 for each char
trigrams = 8*count - 28
}
// threshold is the mean of the acc buckets
threshold := trigrams / 256
var digest [Size]byte
for i := uint(0); i < 256; i++ {
if acc[i] > threshold {
digest[i>>3] += 1 << (i & 7) // equivalent to i/8, 2**(i mod 7)
}
}
// Reverse the Digest
for i, j := 0, 31; i < j; i++ {
digest[i], digest[j] = digest[j], digest[i]
j--
}
return digest
}
// Update the Nilsimsa accumulator with the data contained in
// chunk using a 5 bytes sliding window.
//
// In general it is easier to just use Sum(data []byte)
func Update(chunk []byte, count int64, c0, c1, c2, c3 byte,
acc *[256]int64) (int64, byte, byte, byte, byte) {
for _, c4 := range chunk {
count++
if count > 4 { // seen at least 5, so have full 5 bytes window
// These and c4 form the 5 bytes sliding window
acc[tran53(c4, c0, c1, 0)]++
acc[tran53(c4, c0, c2, 1)]++
acc[tran53(c4, c1, c2, 2)]++
acc[tran53(c4, c0, c3, 3)]++
acc[tran53(c4, c1, c3, 4)]++
acc[tran53(c4, c2, c3, 5)]++
// duplicate hashes, used to maintain 8 trigrams per character
acc[tran53(c3, c0, c4, 6)]++
acc[tran53(c3, c2, c4, 7)]++
// Drop off c3 and put c4 at the front of the sliding window
c0, c1, c2, c3 = c4, c0, c1, c2
} else if count > 3 { // seen at least 4 bytes
acc[tran53(c4, c0, c1, 0)]++
acc[tran53(c4, c0, c2, 1)]++
acc[tran53(c4, c1, c2, 2)]++
c0, c1, c2, c3 = c4, c0, c1, c2
} else if count > 2 { // seen at least 3 bytes
acc[tran53(c4, c0, c1, 0)]++
c0, c1, c2 = c4, c0, c1
} else if count > 1 {
c0, c1 = c4, c0
} else {
c0 = c4
}
}
// Return the sliding window which should be save by the caller.
// It ok to save values that have not been set because they will not be
// used due to check for counter > x inside the loop
return count, c0, c1, c2, c3
}
/* This version is obsolete after the introduction of the Update() function
following the pattern used in package hash/crc32
func Sum(data []byte) [Size]byte {
// There is no need to allocate on the heap using array := [4]byte and
// then create the slice using last4 := array[:] because this is
// done by the compiler automatically through escape analysis
// http://grokbase.com/t/gg/golang-nuts/142cqce51f/
// go-nuts-allocation-optimization-stack-vs-heap
//
// last4 := make([]byte, 0, 4)
// acc := [256]int64
// count, _ := update(data, 0, acc, last4)
//
// Now that both accArray and last4Array are embedded into Digest struct
// there is reason not to use New() directly and just call Write().
// Escape analysis should create the Digest struct on the stack rather
// than allocating it on the heap
d := New()
d.Write(data)
return ComputeDigest(&d.acc, d.count)
}
*/
// Write computes the hash of all of the trigrams in the chunk
// using a sliding window of length 5
//
// It is part of the hash.Hash interface
func (d *Digest) Write(chunk []byte) (int, error) {
// Load up sliding window with values from values set by previous Write.
// it is ok even if some of the values are not valid because by checking for
// count > x inside the loop the invalid values are not being used
d.count, d.c0, d.c1, d.c2, d.c3 = Update(chunk, d.count,
d.c0, d.c1, d.c2, d.c3, &d.acc)
return len(chunk), nil
}
// Sum returns the Nilsimsa Digest of the data.
// Like Sum() for MD5, it returns [Size]byte by value rather than a slice.
// This way the return value does not need to be allocated on the head
// so there is no garbage collection later.
//
// To use the result as a slice when using it as as a function parameter,
// simply re-slice it using the digest[:] syntax
// See http://blog.golang.org/go-slices-usage-and-internals
// and search for: "slicing" an existing slice or array
func Sum(data []byte) (digest [Size]byte, err error) {
if len(data) < 5 {
err = fmt.Errorf("cannot compute checksum of data ")
}
var acc [256]int64
count, _, _, _, _ := Update(data, 0, 0, 0, 0, 0, &acc)
return ComputeDigest(&acc, count), nil
}
// HexSum returns the Nilsimsa Digest of the data as a hex string
func HexSum(data []byte) (hexDigest string, err error) {
var sum [Size]byte
if sum, err = Sum(data); err != nil {
return
}
return fmt.Sprintf("%x", sum), nil
}
var tran = [256]byte{
0x2, 0xD6, 0x9E, 0x6F, 0xF9, 0x1D, 0x04, 0xAB,
0xD0, 0x22, 0x16, 0x1F, 0xD8, 0x73, 0xA1, 0xAC,
0x3B, 0x70, 0x62, 0x96, 0x1E, 0x6E, 0x8F, 0x39,
0x9D, 0x05, 0x14, 0x4A, 0xA6, 0xBE, 0xAE, 0x0E,
0xCF, 0xB9, 0x9C, 0x9A, 0xC7, 0x68, 0x13, 0xE1,
0x2D, 0xA4, 0xEB, 0x51, 0x8D, 0x64, 0x6B, 0x50,
0x23, 0x80, 0x03, 0x41, 0xEC, 0xBB, 0x71, 0xCC,
0x7A, 0x86, 0x7F, 0x98, 0xF2, 0x36, 0x5E, 0xEE,
0x8E, 0xCE, 0x4F, 0xB8, 0x32, 0xB6, 0x5F, 0x59,
0xDC, 0x1B, 0x31, 0x4C, 0x7B, 0xF0, 0x63, 0x01,
0x6C, 0xBA, 0x07, 0xE8, 0x12, 0x77, 0x49, 0x3C,
0xDA, 0x46, 0xFE, 0x2F, 0x79, 0x1C, 0x9B, 0x30,
0xE3, 0x00, 0x06, 0x7E, 0x2E, 0x0F, 0x38, 0x33,
0x21, 0xAD, 0xA5, 0x54, 0xCA, 0xA7, 0x29, 0xFC,
0x5A, 0x47, 0x69, 0x7D, 0xC5, 0x95, 0xB5, 0xF4,
0x0B, 0x90, 0xA3, 0x81, 0x6D, 0x25, 0x55, 0x35,
0xF5, 0x75, 0x74, 0x0A, 0x26, 0xBF, 0x19, 0x5C,
0x1A, 0xC6, 0xFF, 0x99, 0x5D, 0x84, 0xAA, 0x66,
0x3E, 0xAF, 0x78, 0xB3, 0x20, 0x43, 0xC1, 0xED,
0x24, 0xEA, 0xE6, 0x3F, 0x18, 0xF3, 0xA0, 0x42,
0x57, 0x08, 0x53, 0x60, 0xC3, 0xC0, 0x83, 0x40,
0x82, 0xD7, 0x09, 0xBD, 0x44, 0x2A, 0x67, 0xA8,
0x93, 0xE0, 0xC2, 0x56, 0x9F, 0xD9, 0xDD, 0x85,
0x15, 0xB4, 0x8A, 0x27, 0x28, 0x92, 0x76, 0xDE,
0xEF, 0xF8, 0xB2, 0xB7, 0xC9, 0x3D, 0x45, 0x94,
0x4B, 0x11, 0x0D, 0x65, 0xD5, 0x34, 0x8B, 0x91,
0x0C, 0xFA, 0x87, 0xE9, 0x7C, 0x5B, 0xB1, 0x4D,
0xE5, 0xD4, 0xCB, 0x10, 0xA2, 0x17, 0x89, 0xBC,
0xDB, 0xB0, 0xE2, 0x97, 0x88, 0x52, 0xF7, 0x48,
0xD3, 0x61, 0x2C, 0x3A, 0x2B, 0xD1, 0x8C, 0xFB,
0xF1, 0xCD, 0xE4, 0x6A, 0xE7, 0xA9, 0xFD, 0xC4,
0x37, 0xC8, 0xD2, 0xF6, 0xDF, 0x58, 0x72, 0x4E,
}
// tran53 implements the tran53 hash function, which is an
// accumulator for a transition n between the chars a, b, c
func tran53(a, b, c, n byte) byte {
return ((tran[(a+n)&0xff] ^ tran[b]*(n+n+1)) + tran[0xff&c^tran[n]]) & 0xff
}
// Shortcut to compute the Hamming distance between two bit vector
// representations of integers.
//
// popc - population count
// popc[x] = number of 1's in binary representation of x
// popc[a ^b] = hamming distance from a to b
var popc = [256]byte{
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06,
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07,
0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08,
}
// BitsDiffSlice compares two Nilsimsa digests slices and return the number of bits that differ.
func BitsDiffSlice(n1, n2 []byte) byte {
var bits byte
for i := 0; i < Size; i++ {
bits += popc[0xff&n1[i]^n2[i]]
}
return 128 - bits
}
// BitsDiff compares two Nilsimsa Digest arrays and return the number of bits that differ.
func BitsDiff(n1, n2 *[Size]byte) byte {
var bits byte
for i := 0; i < Size; i++ {
bits += popc[0xff&n1[i]^n2[i]]
}
return 128 - bits
}
// BitsDiffHex compares two Nilsimsa digests hex strings and return the number of bits that
// differ
func BitsDiffHex(n1, n2 string) byte {
var bits byte
if len(n1) != Size*2 {
panic("len(n1) != 64")
}
if len(n2) != 32*2 {
panic("len(n2) != 64")
}
for i, j := 0, 2; i < Size*2; j += 2 {
x, err := strconv.ParseInt(n1[i:j], 16, 16)
if err != nil {
panic(err)
}
y, err := strconv.ParseInt(n2[i:j], 16, 16)
if err != nil {
panic(err)
}
bits += popc[0xff&x^y]
i = j
}
return 128 - bits
}