//go:build !(js && wasm) package database import ( "strings" "unicode" sha "github.com/minio/sha256-simd" ) // TokenHashes extracts unique word hashes (8-byte truncated sha256) from content. // Rules: // - Unicode-aware: words are sequences of letters or numbers. // - Lowercased using unicode case mapping. // - Ignore URLs (starting with http://, https://, www., or containing "://"). // - Ignore nostr: URIs and #[n] mentions. // - Ignore words shorter than 2 runes. // - Exclude 64-character hexadecimal strings (likely IDs/pubkeys). func TokenHashes(content []byte) [][]byte { s := string(content) var out [][]byte seen := make(map[string]struct{}) i := 0 for i < len(s) { r, size := rune(s[i]), 1 if r >= 0x80 { r, size = utf8DecodeRuneInString(s[i:]) } // Skip whitespace if unicode.IsSpace(r) { i += size continue } // Skip URLs and schemes if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") { i = skipUntilSpace(s, i) continue } // If token contains "://" ahead, treat as URL and skip to space if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) { // Only if it's at start of token before := s[i : i+j] if len(before) == 0 || allAlphaNum(before) { i = skipUntilSpace(s, i) continue } } // Skip #[n] mentions if r == '#' && i+size < len(s) && s[i+size] == '[' { end := strings.IndexByte(s[i:], ']') if end >= 0 { i += end + 1 continue } } // Collect a word start := i var runes []rune for i < len(s) { r2, size2 := rune(s[i]), 1 if r2 >= 0x80 { r2, size2 = utf8DecodeRuneInString(s[i:]) } if unicode.IsLetter(r2) || unicode.IsNumber(r2) { // Normalize decorative unicode (small caps, fraktur) to ASCII // before lowercasing for consistent indexing runes = append(runes, unicode.ToLower(normalizeRune(r2))) i += size2 continue } break } // If we didn't consume any rune for a word, advance by one rune to avoid stalling if i == start { _, size2 := utf8DecodeRuneInString(s[i:]) i += 
size2 continue } if len(runes) >= 2 { w := string(runes) // Exclude 64-char hex strings if isHex64(w) { continue } if _, ok := seen[w]; !ok { seen[w] = struct{}{} h := sha.Sum256([]byte(w)) out = append(out, h[:8]) } } } return out } func hasPrefixFold(s, prefix string) bool { if len(s) < len(prefix) { return false } for i := 0; i < len(prefix); i++ { c := s[i] p := prefix[i] if c == p { continue } // ASCII case-insensitive if 'A' <= c && c <= 'Z' { c = c - 'A' + 'a' } if 'A' <= p && p <= 'Z' { p = p - 'A' + 'a' } if c != p { return false } } return true } func skipUntilSpace(s string, i int) int { for i < len(s) { r, size := rune(s[i]), 1 if r >= 0x80 { r, size = utf8DecodeRuneInString(s[i:]) } if unicode.IsSpace(r) { return i } i += size } return i } func allAlphaNum(s string) bool { for _, r := range s { if !(unicode.IsLetter(r) || unicode.IsNumber(r)) { return false } } return true } func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) } // utf8DecodeRuneInString decodes the first UTF-8 rune from s. // Returns the rune and the number of bytes consumed. 
//
// An empty string yields (0, 0). A byte that does not start a well-formed
// UTF-8 sequence (stray continuation byte, truncated or overlong sequence,
// lead byte followed by a non-continuation byte) yields (U+FFFD, 1), so a
// malformed byte never causes the valid bytes after it to be skipped.
func utf8DecodeRuneInString(s string) (r rune, size int) {
	if len(s) == 0 {
		return 0, 0
	}
	// ASCII fast path.
	if b := s[0]; b < 0x80 {
		return rune(b), 1
	}
	// Ranging over a string performs standard UTF-8 decoding without
	// allocating: the first iteration yields the rune at offset 0 and the
	// index of the second iteration is that rune's encoded width. Invalid
	// input yields U+FFFD and advances exactly one byte.
	size = len(s) // width when s holds a single rune
	for i, c := range s {
		if i > 0 {
			size = i
			break
		}
		r = c
	}
	return r, size
}

// isHex64 reports whether s is exactly 64 hexadecimal digits
// (0-9, a-f or A-F).
func isHex64(s string) bool {
	if len(s) != 64 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		case 'A' <= c && c <= 'F':
		default:
			return false
		}
	}
	return true
}