// Copyright Epic Games, Inc. All Rights Reserved.
using System;
namespace EpicGames.Horde.Logs
{
/// <summary>
/// Functionality for decomposing log text into ngrams. An ngram is a run of up to
/// <see cref="MaxBytes"/> consecutive bytes of the same token type, case-folded and packed into an integer.
/// </summary>
public static class Ngram
{
    /// <summary>
    /// Maximum number of bytes in each ngram
    /// </summary>
    public const int MaxBytes = 4;

    /// <summary>
    /// Number of bits in each ngram
    /// </summary>
    public const int MaxBits = MaxBytes * 8;

    /// <summary>
    /// Lookup from input byte to token type
    /// </summary>
    static readonly byte[] s_tokenTypes = GetTokenTypes();

    /// <summary>
    /// Lookup from input byte to token char (the byte itself, with 'A'-'Z' folded to 'a'-'z')
    /// </summary>
    static readonly byte[] s_tokenChars = GetTokenChars();

    /// <summary>
    /// Reads a single ngram from the given text. The text is passed through
    /// <see cref="Decompose(ReadOnlySpan{byte}, Action{ulong})"/>; if it produces more than one ngram,
    /// the last one produced is returned.
    /// </summary>
    /// <param name="text">The text to parse</param>
    /// <returns>The ngram value, or zero if the text is empty</returns>
    public static ulong Read(ReadOnlySpan<byte> text)
    {
        ulong token = 0;
        Decompose(text, x => token = x);
        return token;
    }

    /// <summary>
    /// Decompose a span of text into ngrams. A new ngram is started whenever the token type of the
    /// next byte differs from the current one, or the current ngram already holds <see cref="MaxBytes"/> bytes.
    /// Each ngram is left-aligned within <see cref="MaxBits"/> bits, so short ngrams have zero low-order bytes.
    /// </summary>
    /// <param name="text">Text to scan</param>
    /// <param name="addNgram">Callback invoked once for each ngram, in the order they occur in the text. Not invoked for empty text.</param>
    public static void Decompose(ReadOnlySpan<byte> text, Action<ulong> addNgram)
    {
        if (text.Length == 0)
        {
            return;
        }

        // Case folding only maps letters to letters, so looking up the type of the folded byte
        // gives the same result as looking up the type of the raw byte.
        byte firstChar = s_tokenChars[text[0]];
        int type = s_tokenTypes[firstChar];
        int numTokenBits = 8;
        ulong token = firstChar;

        for (int textIdx = 1; textIdx < text.Length; textIdx++)
        {
            byte nextChar = s_tokenChars[text[textIdx]];
            int nextType = s_tokenTypes[nextChar];

            // Flush the current ngram if the token type changes or there is no room for another byte
            if (type != nextType || numTokenBits + 8 > MaxBits)
            {
                addNgram(token << (MaxBits - numTokenBits));
                token = 0;
                numTokenBits = 0;
                type = nextType;
            }

            token = (token << 8) | nextChar;
            numTokenBits += 8;
        }

        // Flush the trailing ngram; it always contains at least one byte
        addNgram(token << (MaxBits - numTokenBits));
    }

    /// <summary>
    /// Decompose a span of text to a set of ngrams
    /// </summary>
    /// <param name="text">Text to scan</param>
    /// <returns>Set built from every ngram found in the text</returns>
    public static NgramSet Decompose(ReadOnlySpan<byte> text)
    {
        NgramSetBuilder builder = new NgramSetBuilder();
        Decompose(text, builder.Add);
        return builder.ToNgramSet();
    }

    /// <summary>
    /// Gets the text of the token starting at the given position: the longest run of bytes, beginning at
    /// <paramref name="pos"/>, which all share the token type of the byte at <paramref name="pos"/>.
    /// The run is not limited to <see cref="MaxBytes"/>.
    /// </summary>
    /// <param name="text">The text to search</param>
    /// <param name="pos">Start position for the search. Must be a valid index into <paramref name="text"/>.</param>
    /// <returns>Slice of <paramref name="text"/> covering the token</returns>
    /// <exception cref="IndexOutOfRangeException">Thrown by the span indexer if <paramref name="pos"/> is outside the text</exception>
    public static ReadOnlySpan<byte> GetText(ReadOnlySpan<byte> text, int pos)
    {
        int type = s_tokenTypes[text[pos]];

        int end = pos + 1;
        while (end < text.Length && s_tokenTypes[text[end]] == type)
        {
            end++;
        }

        return text.Slice(pos, end - pos);
    }

    /// <summary>
    /// Gets the value of a <see cref="MaxBytes"/>-wide window over the text. Bytes inside the text are
    /// case-folded and packed most-significant first; window positions that fall before the start or
    /// past the end of the text contribute zero bytes.
    /// </summary>
    /// <param name="text">The text to read from</param>
    /// <param name="offset">Offset of the first byte of the window, relative to the start of the text. May be negative.</param>
    /// <returns>The packed window value</returns>
    public static ulong GetWindowedValue(ReadOnlySpan<byte> text, int offset)
    {
        ulong token = 0;
        for (int idx = 0; idx < MaxBytes; idx++)
        {
            token <<= 8;
            if (offset >= 0 && offset < text.Length)
            {
                token |= s_tokenChars[text[offset]];
            }
            offset++;
        }
        return token;
    }

    /// <summary>
    /// Gets a byte mask for a <see cref="MaxBytes"/>-wide window over the text, laid out the same way as the
    /// value returned by <see cref="GetWindowedValue(ReadOnlySpan{byte}, int)"/>. Each window position is
    /// either 0xff or 0x00. Positions before the start of the text are always 0x00; positions inside the
    /// text are always 0xff.
    /// </summary>
    /// <param name="text">The text to read from</param>
    /// <param name="offset">Offset of the first byte of the window, relative to the start of the text. May be negative.</param>
    /// <param name="allowPartialMatch">If true, window positions past the end of the text are 0x00 in the mask; if false they are 0xff.</param>
    /// <returns>The mask for the window</returns>
    public static ulong GetWindowedMask(ReadOnlySpan<byte> text, int offset, bool allowPartialMatch)
    {
        ulong token = 0;
        for (int idx = 0; idx < MaxBytes; idx++)
        {
            token <<= 8;
            if (offset >= 0 && (offset < text.Length || !allowPartialMatch))
            {
                token |= 0xff;
            }
            offset++;
        }
        return token;
    }

    /// <summary>
    /// Build the lookup table for token types. Letters map to 1, decimal digits to 2, space and tab to 3,
    /// newline to 4, and every other byte to 0.
    /// </summary>
    /// <returns>Array whose elements map from an input byte to token type</returns>
    static byte[] GetTokenTypes()
    {
        byte[] charTypes = new byte[256];
        for (int idx = 'a'; idx <= 'z'; idx++)
        {
            charTypes[idx] = 1;
        }
        for (int idx = 'A'; idx <= 'Z'; idx++)
        {
            charTypes[idx] = 1;
        }
        for (int idx = '0'; idx <= '9'; idx++)
        {
            charTypes[idx] = 2;
        }
        charTypes[' '] = 3;
        charTypes['\t'] = 3;
        charTypes['\n'] = 4;
        return charTypes;
    }

    /// <summary>
    /// Build the lookup table for token chars. Every byte maps to itself, except 'A'-'Z' which map to 'a'-'z'.
    /// </summary>
    /// <returns>Array whose elements map from an input byte to its case-folded byte</returns>
    static byte[] GetTokenChars()
    {
        byte[] chars = new byte[256];
        for (int idx = 0; idx < 256; idx++)
        {
            chars[idx] = (byte)idx;
        }
        for (int idx = 'A'; idx <= 'Z'; idx++)
        {
            chars[idx] = (byte)('a' + idx - 'A');
        }
        return chars;
    }
}
}