/// <summary>
/// Functionality for decomposing log text into ngrams.
/// </summary>
public static class Ngram
{
    /// <summary>
    /// Maximum number of bytes in each ngram
    /// </summary>
    public const int MaxBytes = 4;

    /// <summary>
    /// Number of bits in each ngram
    /// </summary>
    public const int MaxBits = MaxBytes * 8;

    /// <summary>
    /// Lookup from input byte to token type
    /// </summary>
    static readonly byte[] s_tokenTypes = GetTokenTypes();

    /// <summary>
    /// Lookup from input byte to token char (upper-case ASCII letters are folded to lower case)
    /// </summary>
    static readonly byte[] s_tokenChars = GetTokenChars();

    /// <summary>
    /// Gets a single ngram from the given text.
    /// </summary>
    /// <remarks>
    /// The text is run through <see cref="Decompose(ReadOnlySpan{byte}, Action{ulong})"/>; if it yields
    /// more than one ngram, the last one produced is returned. Empty text yields zero.
    /// </remarks>
    /// <param name="text">The text to parse</param>
    /// <returns>The ngram value</returns>
    public static ulong Read(ReadOnlySpan<byte> text)
    {
        ulong token = 0;
        Decompose(text, x => token = x);
        return token;
    }

    /// <summary>
    /// Decompose a span of text into ngrams.
    /// </summary>
    /// <remarks>
    /// A new ngram is started whenever the token type of the next byte differs from the current one,
    /// or when the current ngram already holds <see cref="MaxBytes"/> bytes. Each byte is normalized
    /// through the token-char table before being packed. Ngrams shorter than <see cref="MaxBytes"/>
    /// are shifted left so their first byte always occupies the top byte of the
    /// <see cref="MaxBits"/>-bit value. Nothing is reported for empty text.
    /// </remarks>
    /// <param name="text">Text to scan</param>
    /// <param name="addNgram">Receives each ngram, in the order they occur in the text</param>
    public static void Decompose(ReadOnlySpan<byte> text, Action<ulong> addNgram)
    {
        if (text.Length == 0)
        {
            return;
        }

        int type = s_tokenTypes[text[0]];
        int numTokenBits = 8;
        ulong token = s_tokenChars[text[0]];

        for (int textIdx = 1; textIdx < text.Length; textIdx++)
        {
            byte nextChar = s_tokenChars[text[textIdx]];
            int nextType = s_tokenTypes[nextChar];

            // Flush the current ngram when the character class changes or when it is already full.
            if (type != nextType || numTokenBits + 8 > MaxBits)
            {
                addNgram(token << (MaxBits - numTokenBits));
                token = 0;
                numTokenBits = 0;
                type = nextType;
            }

            token = (token << 8) | nextChar;
            numTokenBits += 8;
        }

        // Flush the trailing ngram; there is always at least one byte pending here.
        addNgram(token << (MaxBits - numTokenBits));
    }

    /// <summary>
    /// Decompose a string to a set of ngrams
    /// </summary>
    /// <param name="text">Text to scan</param>
    /// <returns>Set built from every ngram reported for the text</returns>
    public static NgramSet Decompose(ReadOnlySpan<byte> text)
    {
        NgramSetBuilder builder = new NgramSetBuilder();
        Decompose(text, builder.Add);
        return builder.ToNgramSet();
    }

    /// <summary>
    /// Gets the text of the token starting at the given position.
    /// </summary>
    /// <remarks>
    /// The token extends over every following byte that has the same token type as the byte at
    /// <paramref name="pos"/>. Unlike <see cref="Decompose(ReadOnlySpan{byte}, Action{ulong})"/>, the
    /// result is not limited to <see cref="MaxBytes"/> bytes. <paramref name="pos"/> must be a valid
    /// index into <paramref name="text"/>; it is passed to the span indexer unchecked.
    /// </remarks>
    /// <param name="text">The text to search</param>
    /// <param name="pos">Start position for the search</param>
    /// <returns>Slice of <paramref name="text"/> covering the token that starts at <paramref name="pos"/></returns>
    public static ReadOnlySpan<byte> GetText(ReadOnlySpan<byte> text, int pos)
    {
        int type = s_tokenTypes[text[pos]];
        for (int end = pos + 1; ; end++)
        {
            if (end == text.Length || s_tokenTypes[text[end]] != type)
            {
                return text.Slice(pos, end - pos);
            }
        }
    }

    /// <summary>
    /// Reads a window of <see cref="MaxBytes"/> bytes from the text as a packed value.
    /// </summary>
    /// <remarks>
    /// The first byte of the window lands in the most significant byte of the
    /// <see cref="MaxBits"/>-bit result. Bytes are normalized through the token-char table. Window
    /// positions that fall before the start or past the end of the text contribute a zero byte.
    /// </remarks>
    /// <param name="text">The text to search</param>
    /// <param name="offset">Offset of the window to read from the token; may be negative</param>
    /// <returns>The packed window value</returns>
    public static ulong GetWindowedValue(ReadOnlySpan<byte> text, int offset)
    {
        ulong token = 0;
        for (int idx = 0; idx < MaxBytes; idx++)
        {
            token <<= 8;
            if (offset >= 0 && offset < text.Length)
            {
                token |= s_tokenChars[text[offset]];
            }
            offset++;
        }
        return token;
    }

    /// <summary>
    /// Builds the bit mask for a window of <see cref="MaxBytes"/> bytes, laid out like the value
    /// returned by <see cref="GetWindowedValue"/>.
    /// </summary>
    /// <remarks>
    /// Each window position contributes 0xff when it is included in the mask and 0x00 otherwise.
    /// Positions before the start of the text are never included. Positions past the end of the
    /// text are excluded when <paramref name="allowPartialMatch"/> is true and included when it is false.
    /// </remarks>
    /// <param name="text">The text to search</param>
    /// <param name="offset">Offset of the window to read from the token; may be negative</param>
    /// <param name="allowPartialMatch">Whether to allow only matching the start of the string</param>
    /// <returns>The mask for the window</returns>
    public static ulong GetWindowedMask(ReadOnlySpan<byte> text, int offset, bool allowPartialMatch)
    {
        ulong token = 0;
        for (int idx = 0; idx < MaxBytes; idx++)
        {
            token <<= 8;
            if (offset >= 0 && (offset < text.Length || !allowPartialMatch))
            {
                token |= 0xff;
            }
            offset++;
        }
        return token;
    }

    /// <summary>
    /// Build the lookup table for token types
    /// </summary>
    /// <remarks>
    /// ASCII letters map to 1, ASCII digits to 2, space and tab to 3, newline to 4, and every other
    /// byte to 0.
    /// </remarks>
    /// <returns>Array whose elements map from an input byte to token type</returns>
    static byte[] GetTokenTypes()
    {
        byte[] charTypes = new byte[256];
        for (int idx = 'a'; idx <= 'z'; idx++)
        {
            charTypes[idx] = 1;
        }
        for (int idx = 'A'; idx <= 'Z'; idx++)
        {
            charTypes[idx] = 1;
        }
        for (int idx = '0'; idx <= '9'; idx++)
        {
            charTypes[idx] = 2;
        }
        charTypes[' '] = 3;
        charTypes['\t'] = 3;
        charTypes['\n'] = 4;
        return charTypes;
    }

    /// <summary>
    /// Build the lookup table for token chars
    /// </summary>
    /// <remarks>
    /// Every byte maps to itself, except upper-case ASCII letters, which map to their lower-case
    /// equivalent.
    /// </remarks>
    /// <returns>Array whose elements map from an input byte to token char</returns>
    static byte[] GetTokenChars()
    {
        byte[] chars = new byte[256];
        for (int idx = 0; idx < 256; idx++)
        {
            chars[idx] = (byte)idx;
        }
        for (int idx = 'A'; idx <= 'Z'; idx++)
        {
            chars[idx] = (byte)('a' + idx - 'A');
        }
        return chars;
    }
}
} // NOTE(review): closes a scope opened before this section (presumably the enclosing namespace) - confirm