Refactor and fix naming conventions in Nilsimsa package
Standardized comments and naming across the codebase to align with Go conventions, e.g., changed "digest" to "Digest" for public structs and functions. Addressed typos in comments, improved formatting, and simplified logic for clarity. Minor corrections were also made in test cases and error handling.
This commit is contained in:
332
nilsimsa.go
332
nilsimsa.go
@@ -1,8 +1,8 @@
|
|||||||
// Package nilsimsa is a Go implemenation of Nilsimsa, ported from
|
// Package nilsimsa is a Go implementation of Nilsimsa, ported from
|
||||||
// code.google.com/p/py-nilsimsa
|
// code.google.com/p/py-nilsimsa but follows the conventions established by the md5 package in
|
||||||
// but follows the conventions establish by the md5 package in the standard lib
|
// the standard lib
|
||||||
//
|
//
|
||||||
// The Java implementaition at
|
// The Java implementation at
|
||||||
// https://github.com/weblyzard/nilsimsa/
|
// https://github.com/weblyzard/nilsimsa/
|
||||||
// blob/master/src/main/java/com/weblyzard/lib/string/nilsimsa/Nilsimsa.java
|
// blob/master/src/main/java/com/weblyzard/lib/string/nilsimsa/Nilsimsa.java
|
||||||
// was also consulted.
|
// was also consulted.
|
||||||
@@ -10,183 +10,174 @@
|
|||||||
// There is a discussion about using hash to score string similarities
|
// There is a discussion about using hash to score string similarities
|
||||||
// http://stackoverflow.com/questions/4323977/string-similarity-score-hash
|
// http://stackoverflow.com/questions/4323977/string-similarity-score-hash
|
||||||
//
|
//
|
||||||
//
|
|
||||||
// Copyright 2015 Sheng-Te Tsao. All rights reserved.
|
// Copyright 2015 Sheng-Te Tsao. All rights reserved.
|
||||||
// Use of this source code is governed by the same BSD-style
|
//
|
||||||
// license that is used by the Go standard library
|
// Use of this source code is governed by the same BSD-style license that is used by the Go
|
||||||
//
|
// standard library
|
||||||
//
|
//
|
||||||
// From http://en.wikipedia.org/wiki/Nilsimsa_Hash
|
// From http://en.wikipedia.org/wiki/Nilsimsa_Hash
|
||||||
//
|
//
|
||||||
// Nilsimsa is an anti-spam focused locality-sensitive hashing algorithm
|
// Nilsimsa is an anti-spam focused locality-sensitive hashing algorithm originally proposed by
|
||||||
// originally proposed the cmeclax remailer operator in 2001[1] and then
|
// the cmeclax remailer operator in 2001[1] and then reviewed by Damiani et al. in their 2004
|
||||||
// reviewed by Damiani et al. in their 2004 paper titled,
|
// paper titled, "An Open Digest-based Technique for Spam Detection".[2]
|
||||||
// "An Open Digest-based Technique for Spam Detection".[2]
|
//
|
||||||
//
|
// The goal of Nilsimsa is to generate a hash digest of an email message such that the digests
|
||||||
// The goal of Nilsimsa is to generate a hash digest of an email message such
|
// of two similar messages are similar to each other. In comparison with cryptographic hash
|
||||||
// that the digests of two similar messages are similar to each other.
|
// functions such as SHA-1 or MD5, making a small modification to a document does not
|
||||||
// In comparison with cryptographic hash functions such as SHA-1 or MD5,
|
// substantially change the resulting hash of the document. The paper suggests that the Nilsimsa
|
||||||
// making a small modification to a document does not substantially change
|
|
||||||
// the resulting hash of the document. The paper suggests that the Nilsimsa
|
|
||||||
// satisfies three requirements:
|
// satisfies three requirements:
|
||||||
//
|
//
|
||||||
// 1. The digest identifying each message should not vary significantly (sic)
|
// 1. The digest identifying each message should not vary significantly (sic)
|
||||||
// for changes that can be produced automatically.
|
// for changes that can be produced automatically.
|
||||||
// 2. The encoding must be robust against intentional attacks.
|
// 2. The encoding must be robust against intentional attacks.
|
||||||
// 3. The encoding should support an extremely low risk of false positives.
|
// 3. The encoding should support an extremely low risk of false positives.
|
||||||
//
|
//
|
||||||
// Subsequent testing[3] on a range of file types identified the Nilsimsa hash
|
// Subsequent testing[3] on a range of file types identified the Nilsimsa hash as having a
|
||||||
// as having a significantly higher false positive rate when compared to other
|
// significantly higher false positive rate when compared to other similarity digest schemes
|
||||||
// similarity digest schemes such as TLSH, Ssdeep and Sdhash.
|
// such as TLSH, Ssdeep and Sdhash.
|
||||||
//
|
//
|
||||||
// Nilsimsa similarity matching was taken in consideration by Jesse Kornblum
|
// Nilsimsa similarity matching was taken in consideration by Jesse Kornblum when developing the
|
||||||
// when developing the fuzzy hashing in 2006,[4] that used the algorithms of
|
// fuzzy hashing in 2006,[4] that used the algorithms of spamsum by Andrew Tridgell (2002).[5]
|
||||||
// spamsum by Andrew Tridgell (2002).[5]
|
//
|
||||||
//
|
|
||||||
// References:
|
// References:
|
||||||
//
|
//
|
||||||
// [1] http://web.archive.org/web/20050707005338/
|
// [1] http://web.archive.org/web/20050707005338/
|
||||||
// http://ixazon.dynip.com/~cmeclax/nilsimsa-0.2.4.tar.gz
|
//
|
||||||
|
// http://ixazon.dynip.com/~cmeclax/nilsimsa-0.2.4.tar.gz
|
||||||
|
//
|
||||||
// [2] http://spdp.di.unimi.it/papers/pdcs04.pdf
|
// [2] http://spdp.di.unimi.it/papers/pdcs04.pdf
|
||||||
// [3] https://www.academia.edu/7833902/TLSH_-A_Locality_Sensitive_Hash
|
// [3] https://www.academia.edu/7833902/TLSH_-A_Locality_Sensitive_Hash
|
||||||
// [4] http://jessekornblum.livejournal.com/242493.html
|
// [4] http://jessekornblum.livejournal.com/242493.html
|
||||||
// [5] http://dfrws.org/2006/proceedings/12-Kornblum.pdf
|
// [5] http://dfrws.org/2006/proceedings/12-Kornblum.pdf
|
||||||
//
|
//
|
||||||
//
|
|
||||||
// From http://blog.semanticlab.net/tag/nilsimsa/
|
// From http://blog.semanticlab.net/tag/nilsimsa/
|
||||||
//
|
//
|
||||||
// An Open Digest-based Technique for Spam Detection
|
// # An Open Digest-based Technique for Spam Detection
|
||||||
//
|
//
|
||||||
// Damiani, E. et al., 2004. An Open Digest-based Technique for Spam Detection.
|
// Damiani, E. et al., 2004. An Open Digest-based Technique for Spam Detection. In in
|
||||||
// In in Proceedings of the 2004 International Workshop on Security
|
// Proceedings of the 2004 International Workshop on Security in Parallel and Distributed
|
||||||
// in Parallel and Distributed Systems. pp. 15-17.
|
// Systems. pp. 15-17.
|
||||||
//
|
//
|
||||||
// Summary
|
// # Summary
|
||||||
//
|
//
|
||||||
// This paper discusses the Nilsimsa open digest hash algorithm which is
|
// This paper discusses the Nilsimsa open digest hash algorithm which is frequently used for
|
||||||
// frequently used for Spam detection. The authors describe the computation of
|
// Spam detection. The authors describe the computation of the 32-byte code, discuss different
|
||||||
// the 32-byte code, discuss different attack scenarios and measures to
|
// attack scenarios and measures to counter them.
|
||||||
// counter them.
|
//
|
||||||
//
|
|
||||||
// Digest computation
|
// Digest computation
|
||||||
//
|
//
|
||||||
// 1. Slide a five character window through the input text and compute all
|
// 1. Slide a five character window through the input text and compute all
|
||||||
// eight possible tri-grams for each window (e.g. "igram" yields "igr",
|
// eight possible tri-grams for each window (e.g. "igram" yields "igr",
|
||||||
// "gra", "ram", "ira", "iam", "grm", ...)
|
// "gra", "ram", "ira", "iam", "grm", ...)
|
||||||
//
|
//
|
||||||
// 2. Hash these trigrams using a hash function h() which maps every tri-gram
|
// 2. Hash these trigrams using a hash function h() which maps every tri-gram
|
||||||
// to one of 256 accumulators and increment the corresponding
|
// to one of 256 accumulators and increment the corresponding
|
||||||
// accumulator. Nilsimsa uses the Trans53 hash function for hashing.
|
// accumulator. Nilsimsa uses the Trans53 hash function for hashing.
|
||||||
//
|
//
|
||||||
// 3. At the end of the process described below, compute the expected value
|
// 3. At the end of the process described below, compute the expected value
|
||||||
// of the accumulators and set the bits which correspond to each accumulator
|
// of the accumulators and set the bits which correspond to each accumulator
|
||||||
// either to (1) if it exceeds this threshold or (o) otherwise.
|
// either to (1) if it exceeds this threshold or (o) otherwise.
|
||||||
//
|
//
|
||||||
// Similarity computation
|
// # Similarity computation
|
||||||
//
|
//
|
||||||
// The Nilsimsa similarity is computed based on the bitwise difference between
|
// The Nilsimsa similarity is computed based on the bitwise difference between two Nilsimsa
|
||||||
// two Nilsimsa hashes. Documents are considered similar if they exceed a
|
// hashes. Documents are considered similar if they exceed a pre-defined similarity value.
|
||||||
// pre-defined similarity value.
|
//
|
||||||
//
|
|
||||||
// 1. >24 similar bits - conflict probability: 1.35E-4
|
// 1. >24 similar bits - conflict probability: 1.35E-4
|
||||||
// (suggestions by Nilsimsa's original designer)
|
// (suggestions by Nilsimsa's original designer)
|
||||||
//
|
//
|
||||||
// 2. >54 similar bits - conflict probability: 7.39E-12
|
// 2. >54 similar bits - conflict probability: 7.39E-12
|
||||||
// (suggested by the article's authors)
|
// (suggested by the article's authors)
|
||||||
//
|
//
|
||||||
// Attacks
|
// # Attacks
|
||||||
//
|
//
|
||||||
// Spammers can apply multiple techniques to prevent Nilsimsa from detecting
|
// Spammers can apply multiple techniques to prevent Nilsimsa from detecting duplicates:
|
||||||
// duplicates:
|
//
|
||||||
//
|
|
||||||
// 1. Random addition: requires >300% of additional text to prevent detection.
|
// 1. Random addition: requires >300% of additional text to prevent detection.
|
||||||
// 2. Thesaurus substitutions: require >20% of replaced text
|
// 2. Thesaurus substitutions: require >20% of replaced text
|
||||||
// 3. Perceptive substitution (security > s3cur1ty): requires >15%
|
// 3. Perceptive substitution (security > s3cur1ty): requires >15%
|
||||||
// of the text to be altered
|
// of the text to be altered
|
||||||
// 4. Aimed attacks (i.e. attacks which specifically target Nilsimsa):
|
// 4. Aimed attacks (i.e. attacks which specifically target Nilsimsa):
|
||||||
// ~10% of the text needs to be altered
|
// ~10% of the text needs to be altered
|
||||||
//
|
//
|
||||||
// Aimed attacks manipulate Nilsimsa's accumulators by adding words which
|
// Aimed attacks manipulate Nilsimsa's accumulators by adding words which introduce new
|
||||||
// introduce new tri-grams that specifically alter the hash value. Although
|
// tri-grams that specifically alter the hash value. Although these attacks are relatively
|
||||||
// these attacks are relatively effective, they can be easily circumvented by
|
// effective, they can be easily circumvented by computing the Nilsimsa hash twice with
|
||||||
// computing the Nilsimsa hash twice with different hash functions.
|
// different hash functions.
|
||||||
package nilsimsa
|
package nilsimsa
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"strconv"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"hash"
|
"hash"
|
||||||
|
"strconv"
|
||||||
)
|
)
|
||||||
|
|
||||||
// The size of an Nilsimsa hash in bytes.
|
// Size of an Nilsimsa hash in bytes.
|
||||||
const Size = 32
|
const Size = 32
|
||||||
|
|
||||||
// The blocksize of Nilsimsa in bytes (not sure what values is best...?)
|
// BlockSize of Nilsimsa in bytes (not sure what values is best...?)
|
||||||
const BlockSize = 8
|
const BlockSize = 8
|
||||||
|
|
||||||
|
type Digest struct {
|
||||||
type digest struct {
|
count int64 // number of characters that we have come across
|
||||||
count int // number of characters that we have come across
|
acc [256]int64 // 256-bit vector to hold the results of the Digest
|
||||||
acc [256]int // 256-bit vector to hold the results of the digest
|
c0, c1, c2, c3 byte // last 4 characters from previous call to Write
|
||||||
c0, c1, c2, c3 byte // last 4 characters from previous call to Write
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// New create a new Nilsimsa hash diget
|
// New creates a new Nilsimsa hash digest
|
||||||
func New() hash.Hash {
|
func New() hash.Hash {
|
||||||
d := new(digest)
|
d := new(Digest)
|
||||||
// Note that no memory is allocate other than the struct itself.
|
// Note that no memory is allocated other than the struct itself. It is better to embed
|
||||||
// It is better to embedd last4Array into the struct itself since
|
// last4Array into the struct itself since its maximum size is known already
|
||||||
// it's maximum size is know already
|
// d.last4 = d.last4Array[:0] //creating the slice by re-slicing last4Array
|
||||||
// d.last4 = d.last4Array[:0] //creating the slice by re-slicing last4Array
|
|
||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *digest) Reset() {
|
func (d *Digest) Reset() {
|
||||||
// It is probably faster to just call New() again rather than to
|
// It is probably faster to just call New() again rather than to re-use an existing struct
|
||||||
// re-use an existing struct by calling New() because presumably
|
// by calling New() because presumably the compiler does a better job of zeroing the whole
|
||||||
// the compiler does a better job of zeroing the whole struct than
|
// struct than doing the manual zeroing via copy(d.acc, zero)
|
||||||
// doing the manual zeroing via copy(d.acc, zero)
|
|
||||||
d.count = 0
|
d.count = 0
|
||||||
// d.last4 = d.last4[:0] // Re-slice to reset size to 0 but reuse the storage
|
for i := range d.acc {
|
||||||
copy(d.acc[:], zero) // resumably faster than re-allocation?
|
d.acc[i] = 0
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var zero = make([]int, 256)
|
var zero = make([]int64, 256)
|
||||||
|
|
||||||
func (d *digest) Size() int { return Size }
|
func (d *Digest) Size() int { return Size }
|
||||||
|
|
||||||
func (d *digest) BlockSize() int { return BlockSize }
|
func (d *Digest) BlockSize() int { return BlockSize }
|
||||||
|
|
||||||
func (d *digest) Sum(in []byte) []byte {
|
func (d *Digest) Sum(in []byte) []byte {
|
||||||
digest := ComputeDigest(&d.acc, d.count)
|
digest := ComputeDigest(&d.acc, d.count)
|
||||||
return append(in, digest[:]...)
|
return append(in, digest[:]...)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ComputeDigest uses a threshold (mean of the accumulator)
|
// ComputeDigest uses a threshold (mean of the accumulator) to computes the Nilsimsa Digest
|
||||||
// to computes the Nilsimsa digest
|
func ComputeDigest(acc *[256]int64, count int64) [Size]byte {
|
||||||
func ComputeDigest(acc *[256]int, count int) [Size]byte {
|
var trigrams int64 = 0
|
||||||
trigrams := 0
|
|
||||||
if count == 3 {
|
if count == 3 {
|
||||||
trigrams = 1
|
trigrams = 1
|
||||||
} else if count == 4 {
|
} else if count == 4 {
|
||||||
trigrams = 4
|
trigrams = 4
|
||||||
} else if count > 4 { // > 4 chars -> 8 for each char
|
} else if count > 4 { // > 4 chars -> 8 for each char
|
||||||
trigrams = 8 * count - 28
|
trigrams = 8*count - 28
|
||||||
}
|
}
|
||||||
// threshhold is the mean of the acc buckets
|
// threshold is the mean of the acc buckets
|
||||||
threshold := trigrams / 256
|
threshold := trigrams / 256
|
||||||
|
|
||||||
var digest [Size]byte
|
var digest [Size]byte
|
||||||
for i := uint(0); i < 256; i++ {
|
for i := uint(0); i < 256; i++ {
|
||||||
if acc[i] > threshold {
|
if acc[i] > threshold {
|
||||||
digest[i >> 3] += 1 << (i & 7) // equivalent to i/8, 2**(i mod 7)
|
digest[i>>3] += 1 << (i & 7) // equivalent to i/8, 2**(i mod 7)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Reverse the digest
|
// Reverse the digest
|
||||||
for i, j := 0, 31; i < j; i++ {
|
for i, j := 0, 31; i < j; i++ {
|
||||||
digest[i], digest[j] = digest[j], digest[i]
|
digest[i], digest[j] = digest[j], digest[i]
|
||||||
j--
|
j--
|
||||||
}
|
}
|
||||||
return digest
|
return digest
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -194,13 +185,13 @@ func ComputeDigest(acc *[256]int, count int) [Size]byte {
|
|||||||
// chunk using a 5 bytes sliding window.
|
// chunk using a 5 bytes sliding window.
|
||||||
//
|
//
|
||||||
// In general it is easier to just use Sum(data []byte)
|
// In general it is easier to just use Sum(data []byte)
|
||||||
func Update(chunk []byte, count int, c0, c1, c2, c3 byte,
|
func Update(chunk []byte, count int64, c0, c1, c2, c3 byte,
|
||||||
acc *[256]int) (int, byte, byte, byte, byte) {
|
acc *[256]int64) (int64, byte, byte, byte, byte) {
|
||||||
for _, c4 := range chunk {
|
for _, c4 := range chunk {
|
||||||
count++
|
count++
|
||||||
if count > 4 { // seen at least 5, so have full 5 bytes window
|
if count > 4 { // seen at least 5, so have full 5 bytes window
|
||||||
// These and c4 form the 5 bytes sliding window
|
// These and c4 form the 5 bytes sliding window
|
||||||
acc[tran53(c4, c0, c1, 0)]++
|
acc[tran53(c4, c0, c1, 0)]++
|
||||||
acc[tran53(c4, c0, c2, 1)]++
|
acc[tran53(c4, c0, c2, 1)]++
|
||||||
acc[tran53(c4, c1, c2, 2)]++
|
acc[tran53(c4, c1, c2, 2)]++
|
||||||
acc[tran53(c4, c0, c3, 3)]++
|
acc[tran53(c4, c0, c3, 3)]++
|
||||||
@@ -226,13 +217,12 @@ func Update(chunk []byte, count int, c0, c1, c2, c3 byte,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the slinding window which should be save by the caller.
|
// Return the sliding window which should be saved by the caller.
|
||||||
// It ok to save values that have not been set because they will not be
|
// It is ok to save values that have not been set because they will not be
|
||||||
// used due to check for counter > x inside the loop
|
// used due to check for counter > x inside the loop
|
||||||
return count, c0, c1, c2, c3
|
return count, c0, c1, c2, c3
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* This version is obsolete after the introduction of the Update() function
|
/* This version is obsolete after the introduction of the Update() function
|
||||||
following the pattern used in package hash/crc32
|
following the pattern used in package hash/crc32
|
||||||
func Sum(data []byte) [Size]byte {
|
func Sum(data []byte) [Size]byte {
|
||||||
@@ -243,12 +233,12 @@ func Sum(data []byte) [Size]byte {
|
|||||||
// go-nuts-allocation-optimization-stack-vs-heap
|
// go-nuts-allocation-optimization-stack-vs-heap
|
||||||
//
|
//
|
||||||
// last4 := make([]byte, 0, 4)
|
// last4 := make([]byte, 0, 4)
|
||||||
// acc := [256]int
|
// acc := [256]int64
|
||||||
// count, _ := update(data, 0, acc, last4)
|
// count, _ := update(data, 0, acc, last4)
|
||||||
//
|
//
|
||||||
// Now that both accArray and last4Array are embedded into digest struct
|
// Now that both accArray and last4Array are embedded into Digest struct
|
||||||
// there is reason not to use New() directly and just call Write().
|
// there is no reason not to use New() directly and just call Write().
|
||||||
// Escape analysis should create the digest struct on the stack rather
|
// Escape analysis should create the Digest struct on the stack rather
|
||||||
// than allocating it on the heap
|
// than allocating it on the heap
|
||||||
d := New()
|
d := New()
|
||||||
d.Write(data)
|
d.Write(data)
|
||||||
@@ -258,9 +248,9 @@ func Sum(data []byte) [Size]byte {
|
|||||||
|
|
||||||
// Write computes the hash of all of the trigrams in the chunk
|
// Write computes the hash of all of the trigrams in the chunk
|
||||||
// using a sliding window of length 5
|
// using a sliding window of length 5
|
||||||
//
|
//
|
||||||
// It is part of the hash.Hash interface
|
// It is part of the hash.Hash interface
|
||||||
func (d *digest) Write(chunk []byte) (int, error) {
|
func (d *Digest) Write(chunk []byte) (int, error) {
|
||||||
// Load up sliding window with values from values set by previous Write.
|
// Load up sliding window with values from values set by previous Write.
|
||||||
// it is ok even if some of the values are not valid because by checking for
|
// it is ok even if some of the values are not valid because by checking for
|
||||||
// count > x inside the loop the invalid values are not being used
|
// count > x inside the loop the invalid values are not being used
|
||||||
@@ -269,66 +259,72 @@ func (d *digest) Write(chunk []byte) (int, error) {
|
|||||||
return len(chunk), nil
|
return len(chunk), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sum returns the Nilsimsa digest of the data.
|
// Sum returns the Nilsimsa digest of the data.
|
||||||
// Like Sum() for MD5, it returns [Size]byte by value rather than a slice.
|
// Like Sum() for MD5, it returns [Size]byte by value rather than a slice.
|
||||||
// This way the return value does not need to be allocated on the head
|
// This way the return value does not need to be allocated on the heap
|
||||||
// so there is no garbage collection later.
|
// so there is no garbage collection later.
|
||||||
//
|
//
|
||||||
// To use the result as a slice when using it as as a function parameter,
|
// To use the result as a slice when using it as a function parameter,
|
||||||
// simply re-slice it using the digist[:] syntax
|
// simply re-slice it using the digest[:] syntax
|
||||||
// See http://blog.golang.org/go-slices-usage-and-internals
|
// See http://blog.golang.org/go-slices-usage-and-internals
|
||||||
// and search for: "slicing" an existing slice or array
|
// and search for: "slicing" an existing slice or array
|
||||||
func Sum(data []byte) [Size]byte {
|
func Sum(data []byte) (digest [Size]byte, err error) {
|
||||||
var acc [256]int
|
if len(data) < 5 {
|
||||||
|
err = fmt.Errorf("cannot compute checksum of data ")
|
||||||
|
}
|
||||||
|
var acc [256]int64
|
||||||
count, _, _, _, _ := Update(data, 0, 0, 0, 0, 0, &acc)
|
count, _, _, _, _ := Update(data, 0, 0, 0, 0, 0, &acc)
|
||||||
return ComputeDigest(&acc, count)
|
return ComputeDigest(&acc, count), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HexSum returns the Nilsimsa digest of the data as a hex string
|
||||||
// HexSum returns the Nilsimsa digest of the data as a hex string
|
func HexSum(data []byte) (hexDigest string, err error) {
|
||||||
func HexSum(data []byte) string {
|
var sum [Size]byte
|
||||||
return fmt.Sprintf("%x", Sum(data))
|
if sum, err = Sum(data); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%x", sum), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var tran = [256]byte {
|
var tran = [256]byte{
|
||||||
0x2, 0xD6, 0x9E, 0x6F, 0xF9, 0x1D, 0x04, 0xAB,
|
0x2, 0xD6, 0x9E, 0x6F, 0xF9, 0x1D, 0x04, 0xAB,
|
||||||
0xD0, 0x22, 0x16, 0x1F, 0xD8, 0x73, 0xA1, 0xAC,
|
0xD0, 0x22, 0x16, 0x1F, 0xD8, 0x73, 0xA1, 0xAC,
|
||||||
0X3B, 0x70, 0x62, 0x96, 0x1E, 0x6E, 0x8F, 0x39,
|
0x3B, 0x70, 0x62, 0x96, 0x1E, 0x6E, 0x8F, 0x39,
|
||||||
0x9D, 0x05, 0x14, 0x4A, 0xA6, 0xBE, 0xAE, 0x0E,
|
0x9D, 0x05, 0x14, 0x4A, 0xA6, 0xBE, 0xAE, 0x0E,
|
||||||
0XCF, 0xB9, 0x9C, 0x9A, 0xC7, 0x68, 0x13, 0xE1,
|
0xCF, 0xB9, 0x9C, 0x9A, 0xC7, 0x68, 0x13, 0xE1,
|
||||||
0x2D, 0xA4, 0xEB, 0x51, 0x8D, 0x64, 0x6B, 0x50,
|
0x2D, 0xA4, 0xEB, 0x51, 0x8D, 0x64, 0x6B, 0x50,
|
||||||
0X23, 0x80, 0x03, 0x41, 0xEC, 0xBB, 0x71, 0xCC,
|
0x23, 0x80, 0x03, 0x41, 0xEC, 0xBB, 0x71, 0xCC,
|
||||||
0x7A, 0x86, 0x7F, 0x98, 0xF2, 0x36, 0x5E, 0xEE,
|
0x7A, 0x86, 0x7F, 0x98, 0xF2, 0x36, 0x5E, 0xEE,
|
||||||
0X8E, 0xCE, 0x4F, 0xB8, 0x32, 0xB6, 0x5F, 0x59,
|
0x8E, 0xCE, 0x4F, 0xB8, 0x32, 0xB6, 0x5F, 0x59,
|
||||||
0xDC, 0x1B, 0x31, 0x4C, 0x7B, 0xF0, 0x63, 0x01,
|
0xDC, 0x1B, 0x31, 0x4C, 0x7B, 0xF0, 0x63, 0x01,
|
||||||
0X6C, 0xBA, 0x07, 0xE8, 0x12, 0x77, 0x49, 0x3C,
|
0x6C, 0xBA, 0x07, 0xE8, 0x12, 0x77, 0x49, 0x3C,
|
||||||
0xDA, 0x46, 0xFE, 0x2F, 0x79, 0x1C, 0x9B, 0x30,
|
0xDA, 0x46, 0xFE, 0x2F, 0x79, 0x1C, 0x9B, 0x30,
|
||||||
0XE3, 0x00, 0x06, 0x7E, 0x2E, 0x0F, 0x38, 0x33,
|
0xE3, 0x00, 0x06, 0x7E, 0x2E, 0x0F, 0x38, 0x33,
|
||||||
0x21, 0xAD, 0xA5, 0x54, 0xCA, 0xA7, 0x29, 0xFC,
|
0x21, 0xAD, 0xA5, 0x54, 0xCA, 0xA7, 0x29, 0xFC,
|
||||||
0X5A, 0x47, 0x69, 0x7D, 0xC5, 0x95, 0xB5, 0xF4,
|
0x5A, 0x47, 0x69, 0x7D, 0xC5, 0x95, 0xB5, 0xF4,
|
||||||
0x0B, 0x90, 0xA3, 0x81, 0x6D, 0x25, 0x55, 0x35,
|
0x0B, 0x90, 0xA3, 0x81, 0x6D, 0x25, 0x55, 0x35,
|
||||||
0XF5, 0x75, 0x74, 0x0A, 0x26, 0xBF, 0x19, 0x5C,
|
0xF5, 0x75, 0x74, 0x0A, 0x26, 0xBF, 0x19, 0x5C,
|
||||||
0x1A, 0xC6, 0xFF, 0x99, 0x5D, 0x84, 0xAA, 0x66,
|
0x1A, 0xC6, 0xFF, 0x99, 0x5D, 0x84, 0xAA, 0x66,
|
||||||
0X3E, 0xAF, 0x78, 0xB3, 0x20, 0x43, 0xC1, 0xED,
|
0x3E, 0xAF, 0x78, 0xB3, 0x20, 0x43, 0xC1, 0xED,
|
||||||
0x24, 0xEA, 0xE6, 0x3F, 0x18, 0xF3, 0xA0, 0x42,
|
0x24, 0xEA, 0xE6, 0x3F, 0x18, 0xF3, 0xA0, 0x42,
|
||||||
0X57, 0x08, 0x53, 0x60, 0xC3, 0xC0, 0x83, 0x40,
|
0x57, 0x08, 0x53, 0x60, 0xC3, 0xC0, 0x83, 0x40,
|
||||||
0x82, 0xD7, 0x09, 0xBD, 0x44, 0x2A, 0x67, 0xA8,
|
0x82, 0xD7, 0x09, 0xBD, 0x44, 0x2A, 0x67, 0xA8,
|
||||||
0X93, 0xE0, 0xC2, 0x56, 0x9F, 0xD9, 0xDD, 0x85,
|
0x93, 0xE0, 0xC2, 0x56, 0x9F, 0xD9, 0xDD, 0x85,
|
||||||
0x15, 0xB4, 0x8A, 0x27, 0x28, 0x92, 0x76, 0xDE,
|
0x15, 0xB4, 0x8A, 0x27, 0x28, 0x92, 0x76, 0xDE,
|
||||||
0XEF, 0xF8, 0xB2, 0xB7, 0xC9, 0x3D, 0x45, 0x94,
|
0xEF, 0xF8, 0xB2, 0xB7, 0xC9, 0x3D, 0x45, 0x94,
|
||||||
0x4B, 0x11, 0x0D, 0x65, 0xD5, 0x34, 0x8B, 0x91,
|
0x4B, 0x11, 0x0D, 0x65, 0xD5, 0x34, 0x8B, 0x91,
|
||||||
0X0C, 0xFA, 0x87, 0xE9, 0x7C, 0x5B, 0xB1, 0x4D,
|
0x0C, 0xFA, 0x87, 0xE9, 0x7C, 0x5B, 0xB1, 0x4D,
|
||||||
0xE5, 0xD4, 0xCB, 0x10, 0xA2, 0x17, 0x89, 0xBC,
|
0xE5, 0xD4, 0xCB, 0x10, 0xA2, 0x17, 0x89, 0xBC,
|
||||||
0XDB, 0xB0, 0xE2, 0x97, 0x88, 0x52, 0xF7, 0x48,
|
0xDB, 0xB0, 0xE2, 0x97, 0x88, 0x52, 0xF7, 0x48,
|
||||||
0xD3, 0x61, 0x2C, 0x3A, 0x2B, 0xD1, 0x8C, 0xFB,
|
0xD3, 0x61, 0x2C, 0x3A, 0x2B, 0xD1, 0x8C, 0xFB,
|
||||||
0XF1, 0xCD, 0xE4, 0x6A, 0xE7, 0xA9, 0xFD, 0xC4,
|
0xF1, 0xCD, 0xE4, 0x6A, 0xE7, 0xA9, 0xFD, 0xC4,
|
||||||
0x37, 0xC8, 0xD2, 0xF6, 0xDF, 0x58, 0x72, 0x4E,
|
0x37, 0xC8, 0xD2, 0xF6, 0xDF, 0x58, 0x72, 0x4E,
|
||||||
}
|
}
|
||||||
|
|
||||||
// tran53 implementats the tran53 hash function, which is an
|
// tran53 implements the tran53 hash function, which is an
|
||||||
// accumulator for a transition n between the chars a, b, c
|
// accumulator for a transition n between the chars a, b, c
|
||||||
func tran53(a, b, c, n byte) byte {
|
func tran53(a, b, c, n byte) byte {
|
||||||
return ((tran[(a+n) & 0xff]^tran[b]*(n+n+1)) + tran[0xff&c^tran[n]]) & 0xff
|
return ((tran[(a+n)&0xff] ^ tran[b]*(n+n+1)) + tran[0xff&c^tran[n]]) & 0xff
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shortcut to compute the Hamming distance between two bit vector
|
// Shortcut to compute the Hamming distance between two bit vector
|
||||||
@@ -337,8 +333,8 @@ func tran53(a, b, c, n byte) byte {
|
|||||||
// popc - population count
|
// popc - population count
|
||||||
// popc[x] = number of 1's in binary representation of x
|
// popc[x] = number of 1's in binary representation of x
|
||||||
// popc[a ^b] = hamming distance from a to b
|
// popc[a ^b] = hamming distance from a to b
|
||||||
var popc = [256]byte {
|
var popc = [256]byte{
|
||||||
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
|
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
|
||||||
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
||||||
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
|
||||||
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05,
|
||||||
@@ -372,39 +368,35 @@ var popc = [256]byte {
|
|||||||
0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08,
|
0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BitsDiffSlice compares two Nilsimsa digest slices and returns the number of bits that differ.
|
||||||
|
|
||||||
// BitsDiffSlice compares two Nilsimsa digests slices and
|
|
||||||
// return the number of bits that differ.
|
|
||||||
func BitsDiffSlice(n1, n2 []byte) byte {
|
func BitsDiffSlice(n1, n2 []byte) byte {
|
||||||
var bits byte
|
var bits byte
|
||||||
for i := 0; i < Size; i++ {
|
for i := 0; i < Size; i++ {
|
||||||
bits += popc[0xff & n1[i] ^ n2[i]];
|
bits += popc[0xff&n1[i]^n2[i]]
|
||||||
}
|
}
|
||||||
return 128 - bits;
|
return 128 - bits
|
||||||
}
|
}
|
||||||
|
|
||||||
// BitsDiff compares two Nilsimsa digest arrays and
|
// BitsDiff compares two Nilsimsa digest arrays and returns the number of bits that differ.
|
||||||
// return the number of bits that differ.
|
|
||||||
func BitsDiff(n1, n2 *[Size]byte) byte {
|
func BitsDiff(n1, n2 *[Size]byte) byte {
|
||||||
var bits byte
|
var bits byte
|
||||||
for i := 0; i < Size; i++ {
|
for i := 0; i < Size; i++ {
|
||||||
bits += popc[0xff & n1[i] ^ n2[i]];
|
bits += popc[0xff&n1[i]^n2[i]]
|
||||||
}
|
}
|
||||||
return 128 - bits;
|
return 128 - bits
|
||||||
}
|
}
|
||||||
|
|
||||||
// BitsDiffHex compares two Nilsimsa digests hex strings and
|
// BitsDiffHex compares two Nilsimsa digest hex strings and returns the number of bits that
|
||||||
// return the number of bits that differ
|
// differ
|
||||||
func BitsDiffHex(n1, n2 string) byte {
|
func BitsDiffHex(n1, n2 string) byte {
|
||||||
var bits byte
|
var bits byte
|
||||||
if len(n1) != Size * 2 {
|
if len(n1) != Size*2 {
|
||||||
panic("len(n1) != 64")
|
panic("len(n1) != 64")
|
||||||
}
|
}
|
||||||
if len(n2) != 32 * 2 {
|
if len(n2) != 32*2 {
|
||||||
panic("len(n2) != 64")
|
panic("len(n2) != 64")
|
||||||
}
|
}
|
||||||
for i, j := 0, 2 ; i < Size * 2; j += 2 {
|
for i, j := 0, 2; i < Size*2; j += 2 {
|
||||||
x, err := strconv.ParseInt(n1[i:j], 16, 16)
|
x, err := strconv.ParseInt(n1[i:j], 16, 16)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
@@ -413,8 +405,8 @@ func BitsDiffHex(n1, n2 string) byte {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
bits += popc[0xff & x ^ y];
|
bits += popc[0xff&x^y]
|
||||||
i = j
|
i = j
|
||||||
}
|
}
|
||||||
return 128 - bits;
|
return 128 - bits
|
||||||
}
|
}
|
||||||
|
|||||||
270
nilsimsa_test.go
270
nilsimsa_test.go
@@ -1,23 +1,23 @@
|
|||||||
package nilsimsa
|
package nilsimsa
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"testing"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
// tests the nilsimsa hash by choosing a random test file
|
// tests the nilsimsa hash by choosing a random test file
|
||||||
// computes the nilsimsa digest and compares to the true
|
// computes the nilsimsa digest and compares to the true
|
||||||
// value stored in the pickled sid_to_nil dictionary
|
// value stored in the pickled sid_to_nil dictionary
|
||||||
func TestNilsimsa(t *testing.T) {
|
func TestNilsimsa(t *testing.T) {
|
||||||
x := HexSum([]byte{})
|
x := HexSum([]byte{})
|
||||||
if x != "0000000000000000000000000000000000000000000000000000000000000000" {
|
if x != "0000000000000000000000000000000000000000000000000000000000000000" {
|
||||||
t.Fatalf(x)
|
t.Fatal(x)
|
||||||
}
|
}
|
||||||
|
|
||||||
x = HexSum([]byte("abcdefgh"))
|
x = HexSum([]byte("abcdefgh"))
|
||||||
if x != "14c8118000000000030800000004042004189020001308014088003280000078" {
|
if x != "14c8118000000000030800000004042004189020001308014088003280000078" {
|
||||||
t.Fatalf(x)
|
t.Fatal(x)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Doing it the long way using incremental update
|
// Doing it the long way using incremental update
|
||||||
@@ -26,13 +26,13 @@ func TestNilsimsa(t *testing.T) {
|
|||||||
io.WriteString(d, "efgh")
|
io.WriteString(d, "efgh")
|
||||||
x = fmt.Sprintf("%x", d.Sum(nil))
|
x = fmt.Sprintf("%x", d.Sum(nil))
|
||||||
if x != "14c8118000000000030800000004042004189020001308014088003280000078" {
|
if x != "14c8118000000000030800000004042004189020001308014088003280000078" {
|
||||||
t.Fatalf(x)
|
t.Fatal(x)
|
||||||
}
|
}
|
||||||
|
|
||||||
io.WriteString(d, "ijk")
|
io.WriteString(d, "ijk")
|
||||||
x = fmt.Sprintf("%x", d.Sum(nil))
|
x = fmt.Sprintf("%x", d.Sum(nil))
|
||||||
if x != "14c811840010000c0328200108040630041890200217582d4098103280000078" {
|
if x != "14c811840010000c0328200108040630041890200217582d4098103280000078" {
|
||||||
t.Fatalf(x)
|
t.Fatal(x)
|
||||||
}
|
}
|
||||||
|
|
||||||
digest1 := Sum([]byte("abcdefghijk"))
|
digest1 := Sum([]byte("abcdefghijk"))
|
||||||
@@ -64,11 +64,11 @@ func TestNilsimsa(t *testing.T) {
|
|||||||
"(srcPath, dstPath)"))
|
"(srcPath, dstPath)"))
|
||||||
x2 = HexSum([]byte("return diff.NewSequenceMatcherFromFiles" +
|
x2 = HexSum([]byte("return diff.NewSequenceMatcherFromFiles" +
|
||||||
"(dstPath, srcPath)"))
|
"(dstPath, srcPath)"))
|
||||||
if x1 != "8beb55d08d78fed441ede9301390b49b716a11af3962db70b24540338cb70035"{
|
if x1 != "8beb55d08d78fed441ede9301390b49b716a11af3962db70b24540338cb70035" {
|
||||||
t.Fatalf(x1)
|
t.Fatal(x1)
|
||||||
}
|
}
|
||||||
if x2 != "8a5355d09968f8d451efeb309919949b73e211af7952c970f245403b8cb7a035"{
|
if x2 != "8a5355d09968f8d451efeb309919949b73e211af7952c970f245403b8cb7a035" {
|
||||||
t.Fatalf(x2)
|
t.Fatal(x2)
|
||||||
}
|
}
|
||||||
bitsDiff = BitsDiffHex(x1, x2)
|
bitsDiff = BitsDiffHex(x1, x2)
|
||||||
if bitsDiff != 96 {
|
if bitsDiff != 96 {
|
||||||
@@ -78,11 +78,11 @@ func TestNilsimsa(t *testing.T) {
|
|||||||
x1 = HexSum([]byte("return diff.XYZ"))
|
x1 = HexSum([]byte("return diff.XYZ"))
|
||||||
x2 = HexSum([]byte("return diff.NewSequenceMatcherFromFiles" +
|
x2 = HexSum([]byte("return diff.NewSequenceMatcherFromFiles" +
|
||||||
"(dstPath, srcPath)"))
|
"(dstPath, srcPath)"))
|
||||||
if x1 != "84125570884ae840f042ea400400009a721891002011a071225247f7a5241018"{
|
if x1 != "84125570884ae840f042ea400400009a721891002011a071225247f7a5241018" {
|
||||||
t.Fatalf(x1)
|
t.Fatal(x1)
|
||||||
}
|
}
|
||||||
if x2 != "8a5355d09968f8d451efeb309919949b73e211af7952c970f245403b8cb7a035"{
|
if x2 != "8a5355d09968f8d451efeb309919949b73e211af7952c970f245403b8cb7a035" {
|
||||||
t.Fatalf(x2)
|
t.Fatal(x2)
|
||||||
}
|
}
|
||||||
bitsDiff = BitsDiffHex(x1, x2)
|
bitsDiff = BitsDiffHex(x1, x2)
|
||||||
if bitsDiff != 35 {
|
if bitsDiff != 35 {
|
||||||
@@ -136,7 +136,7 @@ import org.apache.commons.lang3.ArrayUtils;
|
|||||||
public class Nilsimsa {
|
public class Nilsimsa {
|
||||||
|
|
||||||
private int count = 0; // num characters seen
|
private int count = 0; // num characters seen
|
||||||
private int[] acc = new int[256]; // accumulators for the digest
|
private int[] acc = new int[256]; // accumulators for the Digest
|
||||||
private int[] lastch = new int[4]; // the last four seen characters
|
private int[] lastch = new int[4]; // the last four seen characters
|
||||||
|
|
||||||
// pre-defined transformation arrays
|
// pre-defined transformation arrays
|
||||||
@@ -184,7 +184,7 @@ public class Nilsimsa {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Updates the Nilsimsa digest using the given String
|
* Updates the Nilsimsa Digest using the given String
|
||||||
* @param s: the String data to consider in the update
|
* @param s: the String data to consider in the update
|
||||||
*/
|
*/
|
||||||
public void update(String s) {
|
public void update(String s) {
|
||||||
@@ -245,13 +245,13 @@ public class Nilsimsa {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the digest for the current Nilsimsa object.
|
* @return the Digest for the current Nilsimsa object.
|
||||||
*/
|
*/
|
||||||
public byte[] digest() {
|
public byte[] Digest() {
|
||||||
int total = 0;
|
int total = 0;
|
||||||
int threshold;
|
int threshold;
|
||||||
byte[] digest = new byte[32];
|
byte[] Digest = new byte[32];
|
||||||
Arrays.fill(digest, (byte)0);
|
Arrays.fill(Digest, (byte)0);
|
||||||
|
|
||||||
if (count == 3) {
|
if (count == 3) {
|
||||||
total = 1;
|
total = 1;
|
||||||
@@ -264,11 +264,11 @@ public class Nilsimsa {
|
|||||||
|
|
||||||
for (int i=0; i<256; i++) {
|
for (int i=0; i<256; i++) {
|
||||||
if (acc[i] > threshold) {
|
if (acc[i] > threshold) {
|
||||||
digest[ i>>3 ] += 1 << (i&7);
|
Digest[ i>>3 ] += 1 << (i&7);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ArrayUtils.reverse( digest );
|
ArrayUtils.reverse( Digest );
|
||||||
return digest;
|
return Digest;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -276,18 +276,18 @@ public class Nilsimsa {
|
|||||||
* the Nilsimsa object.
|
* the Nilsimsa object.
|
||||||
*/
|
*/
|
||||||
public String hexdigest() {
|
public String hexdigest() {
|
||||||
return Hex.encodeHexString( digest() );
|
return Hex.encodeHexString( Digest() );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute the Nilsimsa digest for the given String.
|
* Compute the Nilsimsa Digest for the given String.
|
||||||
* @param s: the String to hash
|
* @param s: the String to hash
|
||||||
* @return the Nilsimsa digest.
|
* @return the Nilsimsa Digest.
|
||||||
*/
|
*/
|
||||||
public byte[] digest(String s) {
|
public byte[] Digest(String s) {
|
||||||
reset();
|
reset();
|
||||||
update(s);
|
update(s);
|
||||||
return digest();
|
return Digest();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -297,7 +297,7 @@ public class Nilsimsa {
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
public String hexdigest(String s) {
|
public String hexdigest(String s) {
|
||||||
return Hex.encodeHexString( digest(s) );
|
return Hex.encodeHexString( Digest(s) );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -309,8 +309,8 @@ public class Nilsimsa {
|
|||||||
public int compare(Nilsimsa cmp) {
|
public int compare(Nilsimsa cmp) {
|
||||||
byte bits = 0;
|
byte bits = 0;
|
||||||
int j;
|
int j;
|
||||||
byte[] n1 = digest();
|
byte[] n1 = Digest();
|
||||||
byte[] n2 = cmp.digest();
|
byte[] n2 = cmp.Digest();
|
||||||
|
|
||||||
for (int i=0; i<32; i++) {
|
for (int i=0; i<32; i++) {
|
||||||
j = 255 & n1[i] ^ n2[i];
|
j = 255 & n1[i] ^ n2[i];
|
||||||
@@ -322,142 +322,140 @@ public class Nilsimsa {
|
|||||||
`
|
`
|
||||||
x := HexSum([]byte(nilsimsaJavaimplementation))
|
x := HexSum([]byte(nilsimsaJavaimplementation))
|
||||||
if x != "4c900d44043f014c40f40040d8201000f246227123b28864013040008240204a" {
|
if x != "4c900d44043f014c40f40040d8201000f246227123b28864013040008240204a" {
|
||||||
t.Fatalf(x)
|
t.Fatal(x)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestNilsimsa3(t *testing.T) {
|
func TestNilsimsa3(t *testing.T) {
|
||||||
list := [...]string{
|
list := [...]string{
|
||||||
"a",
|
"a",
|
||||||
"ab",
|
"ab",
|
||||||
"abc",
|
"abc",
|
||||||
"abcd",
|
"abcd",
|
||||||
"abcde",
|
"abcde",
|
||||||
"abcdef",
|
"abcdef",
|
||||||
"abcdefg",
|
"abcdefg",
|
||||||
"abcdefgh",
|
"abcdefgh",
|
||||||
"abcdefghi",
|
"abcdefghi",
|
||||||
"abcdefghij",
|
"abcdefghij",
|
||||||
"abcdefghijk",
|
"abcdefghijk",
|
||||||
"abcdefghijkl",
|
"abcdefghijkl",
|
||||||
"abcdefghijklm",
|
"abcdefghijklm",
|
||||||
"abcdefghijklmn",
|
"abcdefghijklmn",
|
||||||
"abcdefghijklmno",
|
"abcdefghijklmno",
|
||||||
"abcdefghijklmnop",
|
"abcdefghijklmnop",
|
||||||
"abcdefghijklmnopq",
|
"abcdefghijklmnopq",
|
||||||
"abcdefghijklmnopqr",
|
"abcdefghijklmnopqr",
|
||||||
"abcdefghijklmnopqrs",
|
"abcdefghijklmnopqrs",
|
||||||
"abcdefghijklmnopqrst",
|
"abcdefghijklmnopqrst",
|
||||||
"abcdefghijklmnopqrstu",
|
"abcdefghijklmnopqrstu",
|
||||||
"abcdefghijklmnopqrstuv",
|
"abcdefghijklmnopqrstuv",
|
||||||
"abcdefghijklmnopqrstuvw",
|
"abcdefghijklmnopqrstuvw",
|
||||||
"abcdefghijklmnopqrstuvwx",
|
"abcdefghijklmnopqrstuvwx",
|
||||||
"abcdefghijklmnopqrstuvwxy",
|
"abcdefghijklmnopqrstuvwxy",
|
||||||
"abcdefghijklmnopqrstuvwxyz",
|
"abcdefghijklmnopqrstuvwxyz",
|
||||||
}
|
}
|
||||||
|
|
||||||
results := [...]string{
|
results := [...]string{
|
||||||
"0000000000000000000000000000000000000000000000000000000000000000",
|
"0000000000000000000000000000000000000000000000000000000000000000",
|
||||||
"0000000000000000000000000000000000000000000000000000000000000000",
|
"0000000000000000000000000000000000000000000000000000000000000000",
|
||||||
"0040000000000000000000000000000000000000000000000000000000000000",
|
"0040000000000000000000000000000000000000000000000000000000000000",
|
||||||
"0440000000000000000000000000000000100000000000000008000000000000",
|
"0440000000000000000000000000000000100000000000000008000000000000",
|
||||||
"0440008000000000000000000000000000100020001200000008001200000050",
|
"0440008000000000000000000000000000100020001200000008001200000050",
|
||||||
"04c0018000000000000000000000000004188020001200000088001280000058",
|
"04c0018000000000000000000000000004188020001200000088001280000058",
|
||||||
"04c8118000000000030000000000002004188020001208004088001280000078",
|
"04c8118000000000030000000000002004188020001208004088001280000078",
|
||||||
"14c8118000000000030800000004042004189020001308014088003280000078",
|
"14c8118000000000030800000004042004189020001308014088003280000078",
|
||||||
"14c8118400000000030800010804043004189020021318094098003280000078",
|
"14c8118400000000030800010804043004189020021318094098003280000078",
|
||||||
"14c81184000000000308200108040430041890200217580d4098103280000078",
|
"14c81184000000000308200108040430041890200217580d4098103280000078",
|
||||||
"14c811840010000c0328200108040630041890200217582d4098103280000078",
|
"14c811840010000c0328200108040630041890200217582d4098103280000078",
|
||||||
"14c811840010000ca328200108044630041890200a17586d4298103280000078",
|
"14c811840010000ca328200108044630041890200a17586d4298103280000078",
|
||||||
"14ca11850010000ca328200188044630041898200a17586dc2d8103284000078",
|
"14ca11850010000ca328200188044630041898200a17586dc2d8103284000078",
|
||||||
"14ca11850030004ca3a8200188044630041898200a17586dc2d8107284000078",
|
"14ca11850030004ca3a8200188044630041898200a17586dc2d8107284000078",
|
||||||
"14ca11850032004ca3a8284188044730041898200a17586dc2d8107384000078",
|
"14ca11850032004ca3a8284188044730041898200a17586dc2d8107384000078",
|
||||||
"94ca11850432005ca3a828418804473004199c200a17586dc2d8107384004178",
|
"94ca11850432005ca3a828418804473004199c200a17586dc2d8107384004178",
|
||||||
"94ca11850433005ca3a82841880447341419be200a17586dc2d8107384004178",
|
"94ca11850433005ca3a82841880447341419be200a17586dc2d8107384004178",
|
||||||
"94ca11850433005ca3a82841a88457341419be201a17586dc6d8107384084178",
|
"94ca11850433005ca3a82841a88457341419be201a17586dc6d8107384084178",
|
||||||
"94ca11850533005ca3b82841a88657361419be201a17586dc6d8107384084178",
|
"94ca11850533005ca3b82841a88657361419be201a17586dc6d8107384084178",
|
||||||
"94ca11850533005ca3b82841aa8657371419be201a17587dc6d81077840c4178",
|
"94ca11850533005ca3b82841aa8657371419be201a17587dc6d81077840c4178",
|
||||||
"94ca15850533005ca3b92841aa8657371419be201a17587dd6d81077844cc178",
|
"94ca15850533005ca3b92841aa8657371419be201a17587dd6d81077844cc178",
|
||||||
"94ca15850533005ca3b92849aa8657371419be201a17587fd6d81077844cc978",
|
"94ca15850533005ca3b92849aa8657371419be201a17587fd6d81077844cc978",
|
||||||
"94ca15850533045cabb92869aa8657371419bea01a17587fd6f81077c44cc978",
|
"94ca15850533045cabb92869aa8657371419bea01a17587fd6f81077c44cc978",
|
||||||
"94ca95850533045cabb93869aa8657371499beb01a17587fd6f8107fc44cc978",
|
"94ca95850533045cabb93869aa8657371499beb01a17587fd6f8107fc44cc978",
|
||||||
"94ca95850733045cabb93869aa8657373499beb01a17587fd6f9107fc54cc978",
|
"94ca95850733045cabb93869aa8657373499beb01a17587fd6f9107fc54cc978",
|
||||||
"94ca95850773045cabb93869ba8657373499beb81a17587fd6f9107fc54cc978",
|
"94ca95850773045cabb93869ba8657373499beb81a17587fd6f9107fc54cc978",
|
||||||
}
|
}
|
||||||
|
|
||||||
compareResults := [...]byte {
|
compareResults := [...]byte{
|
||||||
128,
|
128,
|
||||||
127,
|
127,
|
||||||
125,
|
125,
|
||||||
120,
|
120,
|
||||||
120,
|
120,
|
||||||
120,
|
120,
|
||||||
120,
|
120,
|
||||||
120,
|
120,
|
||||||
123,
|
123,
|
||||||
122,
|
122,
|
||||||
122,
|
122,
|
||||||
121,
|
121,
|
||||||
124,
|
124,
|
||||||
123,
|
123,
|
||||||
121,
|
121,
|
||||||
123,
|
123,
|
||||||
122,
|
122,
|
||||||
124,
|
124,
|
||||||
123,
|
123,
|
||||||
123,
|
123,
|
||||||
125,
|
125,
|
||||||
122,
|
122,
|
||||||
123,
|
123,
|
||||||
124,
|
124,
|
||||||
125,
|
125,
|
||||||
}
|
}
|
||||||
|
|
||||||
step3CompareResults := [...]byte{
|
step3CompareResults := [...]byte{
|
||||||
116,
|
116,
|
||||||
104,
|
104,
|
||||||
109,
|
109,
|
||||||
111,
|
111,
|
||||||
111,
|
111,
|
||||||
113,
|
113,
|
||||||
114,
|
114,
|
||||||
116,
|
116,
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(list) != len(results) {
|
if len(list) != len(results) {
|
||||||
panic("len(list) != len(results)")
|
panic("len(list) != len(results)")
|
||||||
}
|
}
|
||||||
for i, x := range(list) {
|
for i, x := range list {
|
||||||
hex := HexSum([]byte(x))
|
hex := HexSum([]byte(x))
|
||||||
if hex != results[i] {
|
if hex != results[i] {
|
||||||
t.Fatalf(hex)
|
t.Fatal(hex)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(list) != len(compareResults) + 1 {
|
if len(list) != len(compareResults)+1 {
|
||||||
panic("len(list) != len(compareResults) + 1")
|
panic("len(list) != len(compareResults) + 1")
|
||||||
}
|
}
|
||||||
last := Sum([]byte(list[0]))
|
last := Sum([]byte(list[0]))
|
||||||
for i, x := range list[1:] {
|
for i, x := range list[1:] {
|
||||||
sum := Sum([]byte(x))
|
sum := Sum([]byte(x))
|
||||||
bits := BitsDiff(&sum, &last)
|
bits := BitsDiff(&sum, &last)
|
||||||
if bits != compareResults[i] {
|
if bits != compareResults[i] {
|
||||||
t.Fatalf("%x", bits)
|
t.Fatalf("%x", bits)
|
||||||
}
|
}
|
||||||
last = sum
|
last = sum
|
||||||
}
|
}
|
||||||
|
|
||||||
j := 0
|
j := 0
|
||||||
last = Sum([]byte(list[0]))
|
last = Sum([]byte(list[0]))
|
||||||
for i := 4; i < len(list); i += 3 {
|
for i := 4; i < len(list); i += 3 {
|
||||||
sum := Sum([]byte(list[i]))
|
sum := Sum([]byte(list[i]))
|
||||||
bits := BitsDiff(&sum, &last)
|
bits := BitsDiff(&sum, &last)
|
||||||
if bits != step3CompareResults[j] {
|
if bits != step3CompareResults[j] {
|
||||||
t.Fatalf("%x", bits)
|
t.Fatalf("%x", bits)
|
||||||
}
|
}
|
||||||
last = sum
|
last = sum
|
||||||
j += 1
|
j += 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user