// Copyright Epic Games, Inc. All Rights Reserved.

using System;

namespace EpicGames.Horde.Logs
{
	/// <summary>
	/// Functionality for decomposing log text into ngrams.
	/// </summary>
	public static class Ngram
	{
		/// <summary>
		/// Maximum number of bytes in each ngram
		/// </summary>
		public const int MaxBytes = 4;

		/// <summary>
		/// Number of bits in each ngram
		/// </summary>
		public const int MaxBits = MaxBytes * 8;

		/// <summary>
		/// Lookup from input byte to token type
		/// </summary>
		static readonly byte[] s_tokenTypes = GetTokenTypes();

		/// <summary>
		/// Lookup from input byte to token char (upper-case ASCII letters are folded to lower case)
		/// </summary>
		static readonly byte[] s_tokenChars = GetTokenChars();

		/// <summary>
		/// Reads a single ngram value from the given text. If the text decomposes into more than one
		/// ngram, the value of the last one is returned. Empty text yields zero.
		/// </summary>
		/// <param name="text">The text to parse</param>
		/// <returns>The ngram value</returns>
		public static ulong Read(ReadOnlySpan<byte> text)
		{
			ulong token = 0;
			// The callback overwrites the local on every ngram, so the last one reported wins.
			Decompose(text, x => token = x);
			return token;
		}

		/// <summary>
		/// Decompose a span of text into ngrams. The text is split into runs of bytes that share the same
		/// token type, and each run is further split into pieces of at most <see cref="MaxBytes"/> bytes.
		/// Bytes are normalized through the token char table before being packed, first byte in the most
		/// significant position, and the packed value is left-aligned within <see cref="MaxBits"/> bits so
		/// that ngrams shorter than <see cref="MaxBytes"/> are padded with zero bytes at the low end.
		/// </summary>
		/// <param name="text">Text to scan</param>
		/// <param name="addNgram">Callback invoked once for each ngram, in order of appearance. Not invoked for empty text.</param>
		public static void Decompose(ReadOnlySpan<byte> text, Action<ulong> addNgram)
		{
			if (text.Length > 0)
			{
				// The type of the first byte is looked up from the raw byte rather than the normalized one.
				// Both give the same answer, because normalization only maps 'A'-'Z' onto 'a'-'z' and the
				// type table assigns the same type to both ranges.
				int type = s_tokenTypes[text[0]];
				int numTokenBits = 8;
				ulong token = s_tokenChars[text[0]];

				for (int textIdx = 1; textIdx < text.Length; textIdx++)
				{
					byte nextChar = s_tokenChars[text[textIdx]];
					int nextType = s_tokenTypes[nextChar];

					// Flush the current ngram when the token type changes or the ngram is already full.
					if (type != nextType || numTokenBits + 8 > MaxBits)
					{
						addNgram(token << (MaxBits - numTokenBits));
						token = 0;
						numTokenBits = 0;
						type = nextType;
					}

					token = (token << 8) | nextChar;
					numTokenBits += 8;
				}

				// Flush the trailing ngram; there is always at least one byte pending here.
				addNgram(token << (MaxBits - numTokenBits));
			}
		}

		/// <summary>
		/// Decompose a span of text into a set of ngrams
		/// </summary>
		/// <param name="text">Text to scan</param>
		/// <returns>Set built from every ngram reported while decomposing the text</returns>
		public static NgramSet Decompose(ReadOnlySpan<byte> text)
		{
			NgramSetBuilder builder = new NgramSetBuilder();
			Decompose(text, builder.Add);
			return builder.ToNgramSet();
		}

		/// <summary>
		/// Gets the text of the token starting at the given position. The token extends over every
		/// following byte with the same token type as the byte at <paramref name="pos"/>; unlike
		/// ngrams, it is not limited to <see cref="MaxBytes"/> bytes.
		/// </summary>
		/// <param name="text">The text to search</param>
		/// <param name="pos">Start position for the search. Must be a valid index into <paramref name="text"/>.</param>
		/// <returns>Slice of the input text covering the token</returns>
		public static ReadOnlySpan<byte> GetText(ReadOnlySpan<byte> text, int pos)
		{
			int type = s_tokenTypes[text[pos]];
			for (int end = pos + 1; ; end++)
			{
				// Stop at the end of the text or at the first byte of a different type.
				if (end == text.Length || s_tokenTypes[text[end]] != type)
				{
					return text.Slice(pos, end - pos);
				}
			}
		}

		/// <summary>
		/// Gets the value of a window of <see cref="MaxBytes"/> bytes over the given text. Bytes are
		/// normalized through the token char table and packed with the first byte of the window in the
		/// most significant position. Window positions that fall before the start or past the end of
		/// the text contribute zero.
		/// </summary>
		/// <param name="text">The text to read from</param>
		/// <param name="offset">Offset of the start of the window within the text. May be negative.</param>
		/// <returns>Packed value of the window</returns>
		public static ulong GetWindowedValue(ReadOnlySpan<byte> text, int offset)
		{
			ulong token = 0;
			for (int idx = 0; idx < MaxBytes; idx++)
			{
				token <<= 8;
				if (offset >= 0 && offset < text.Length)
				{
					token |= s_tokenChars[text[offset]];
				}
				offset++;
			}
			return token;
		}

		/// <summary>
		/// Gets a mask for a window of <see cref="MaxBytes"/> bytes over the given text, laid out in the
		/// same way as the result of <see cref="GetWindowedValue"/>. Each window position contributes
		/// 0xff if it is included in the mask and zero otherwise. Positions before the start of the
		/// text are never included.
		/// </summary>
		/// <param name="text">The text to read from</param>
		/// <param name="offset">Offset of the start of the window within the text. May be negative.</param>
		/// <param name="allowPartialMatch">If true, window positions past the end of the text are left out of the mask. If false, they are included.</param>
		/// <returns>Mask for the window</returns>
		public static ulong GetWindowedMask(ReadOnlySpan<byte> text, int offset, bool allowPartialMatch)
		{
			ulong token = 0;
			for (int idx = 0; idx < MaxBytes; idx++)
			{
				token <<= 8;
				if (offset >= 0 && (offset < text.Length || !allowPartialMatch))
				{
					token |= 0xff;
				}
				offset++;
			}
			return token;
		}

		/// <summary>
		/// Build the lookup table for token types. ASCII letters map to 1, decimal digits to 2, space
		/// and tab to 3, and line feed to 4. Every other byte maps to 0.
		/// </summary>
		/// <returns>Array whose elements map from an input byte to token type</returns>
		static byte[] GetTokenTypes()
		{
			byte[] charTypes = new byte[256];
			for (int idx = 'a'; idx <= 'z'; idx++)
			{
				charTypes[idx] = 1;
			}
			for (int idx = 'A'; idx <= 'Z'; idx++)
			{
				charTypes[idx] = 1;
			}
			for (int idx = '0'; idx <= '9'; idx++)
			{
				charTypes[idx] = 2;
			}
			charTypes[' '] = 3;
			charTypes['\t'] = 3;
			charTypes['\n'] = 4;
			return charTypes;
		}

		/// <summary>
		/// Build the lookup table for token chars. Every byte maps to itself, except for upper-case
		/// ASCII letters, which map to their lower-case equivalents.
		/// </summary>
		/// <returns>Array whose elements map from an input byte to its normalized token char</returns>
		static byte[] GetTokenChars()
		{
			byte[] chars = new byte[256];
			for (int idx = 0; idx < 256; idx++)
			{
				chars[idx] = (byte)idx;
			}
			for (int idx = 'A'; idx <= 'Z'; idx++)
			{
				chars[idx] = (byte)('a' + idx - 'A');
			}
			return chars;
		}
	}
}