From ed5a9d99a3b1bc1c64684005fdc29377932c9b76 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Apr 2025 22:11:22 +1200
Subject: [PATCH] Broken

---
 src/litecask.hpp | 4928 ++++++++++++++++++++++++++++++++++++++++++++++
 src/server.cpp   |  175 +-
 src/server.hpp   |   12 +-
 3 files changed, 5070 insertions(+), 45 deletions(-)
 create mode 100644 src/litecask.hpp

diff --git a/src/litecask.hpp b/src/litecask.hpp
new file mode 100644
index 0000000..bd22d9c
--- /dev/null
+++ b/src/litecask.hpp
@@ -0,0 +1,4928 @@
+// Litecask - High performance, persistent embedded Key-Value storage engine.
+// Single header file
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2023, Damien Feneyrou
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+
+// ==========================================================================================
+// Quick doc
+// ==========================================================================================
+
+// A simple example inserting and retrieving a value is shown below:
+/*
+// example.cpp: Place the file litecask.hpp in the same folder
+// Build with: 'c++ --std=c++17 example.cpp -o example' (Linux) or 'cl.exe /std:c++17 /EHsc example.cpp' (Windows)
+
+#include <cassert>
+
+#include "litecask.hpp"
+
+int main(int argc, char** argv)
+{
+    litecask::Datastore store;
+    store.open("/tmp/my_temp_db");
+
+    // Store an entry
+    std::vector<uint8_t> value{1, 2, 3, 4, 5, 6, 7, 8};
+    store.put("my key identifier", value);
+
+    // Retrieve the entry
+    std::vector<uint8_t> retrievedValue;
+    store.get("my key identifier", retrievedValue);
+    assert(retrievedValue == value);
+
+    store.close();
+}
+ */
+
+// ==========================================================================================
+// Version
+// ==========================================================================================
+
+#define LITECASK_VERSION_MAJOR 1
+#define LITECASK_VERSION_MINOR 0
+#define LITECASK_VERSION_PATCH 0
+#define LITECASK_VERSION (LITECASK_VERSION_MAJOR * 100 * 100 + LITECASK_VERSION_MINOR * 100 + LITECASK_VERSION_PATCH)
+
+// ==========================================================================================
+// Includes
+// ==========================================================================================
+
+#if defined(_MSC_VER)
+// Windows
+#define NOMINMAX
+#include <windows.h>
+#include <intrin.h>
+#pragma intrinsic(_umul128)  // For Wyhash
+
+#else
+
+// Linux
+#include <fcntl.h>     // OS open
+#include <sys/mman.h>  // mmap
+#include <unistd.h>    // process ID
+
+#endif
+
+// Standard
+#include <string.h>
+
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <cinttypes>
+#include <condition_variable>
+#include <cstdarg>  // va_args for logging
+#include <cstdint>
+#include <cstdio>
+#include <filesystem>
+#include <functional>  // for std::function
+#include <mutex>
+#include <thread>
+
+// The string and vector primitives can be overridden by a custom implementation with the same interface
+#ifndef lcString
+#include <string>
+#define lcString std::string
+#endif
+#ifndef lcVector
+#include <vector>
+#define lcVector std::vector
+#endif
+
+// Select the standard shared_mutex or the more performant custom one (default). Follow the definition for more information.
+// #define LITECASK_STANDARD_SHARED_MUTEX
+#ifdef LITECASK_STANDARD_SHARED_MUTEX
+#include <shared_mutex>
+#endif
+
+// Macros for likely and unlikely branching
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+#define LITECASK_LIKELY(x) __builtin_expect(!!(x), 1)
+#define LITECASK_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define LITECASK_LIKELY(x) (x)
+#define LITECASK_UNLIKELY(x) (x)
+#endif
+
+// Macro to check the printf-like API and detect formatting mismatches at compile time
+#if defined(__GNUC__)
+#define LITECASK_PRINTF_CHECK(formatStringIndex_, firstArgIndex_) \
+    __attribute__((__format__(__printf__, formatStringIndex_, firstArgIndex_)))
+#define LITECASK_PRINTF_FORMAT_STRING
+#elif _MSC_VER
+#define LITECASK_PRINTF_CHECK(formatStringIndex_, firstArgIndex_)
+#define LITECASK_PRINTF_FORMAT_STRING _Printf_format_string_
+#else
+#define LITECASK_PRINTF_CHECK(formatStringIndex_, firstArgIndex_)
+#define LITECASK_PRINTF_FORMAT_STRING
+#endif
+
+// Macro to disable thread sanitizing on a function
+// This is required for the code using optimistic locking, which basically detects data races
+// at runtime and retries until no more collision occurs.
+#if defined(__clang__) || defined(__GNUC__)
+#define LITECASK_ATTRIBUTE_NO_SANITIZE_THREAD __attribute__((no_sanitize_thread))
+#else
+#define LITECASK_ATTRIBUTE_NO_SANITIZE_THREAD
+#endif
+
+namespace litecask
+{
+
+namespace fs = std::filesystem;
+
+// ==========================================================================================
+// Definitions
+// ==========================================================================================
+
+enum class Status {
+    Ok = 0,
+    StoreNotOpen = 1,
+    StoreAlreadyOpen = 2,
+    BadDiskAccess = 3,
+    CannotOpenStore = 4,
+    StoreAlreadyInUse = 5,
+    BadKeySize = 6,
+    InconsistentKeyIndex = 7,
+    UnorderedKeyIndex = 8,
+    BadValueSize = 9,
+    EntryNotFound = 10,
+    EntryCorrupted = 11,
+    BadParameterValue = 12,
+    InconsistentParameterValues = 13,
+    OutOfMemory = 14,
+};
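+
+// Illustrative sketch (not part of the header; the store path and the handling below are
+// assumptions for the example): every API call returns a Status, so callers are expected
+// to branch on it.
+/*
+litecask::Datastore store;
+litecask::Status s = store.open("/var/data/my_db");
+if (s == litecask::Status::StoreAlreadyInUse) {
+    // Another process holds the lock file of this database directory
+} else if (s != litecask::Status::Ok) {
+    // Disk or parameter problem, see the Status enum above
+}
+*/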
+struct DatastoreCounters {
+    // API calls
+    std::atomic<uint64_t> openCallQty = 0;
+    std::atomic<uint64_t> openCallFailedQty = 0;
+    std::atomic<uint64_t> closeCallQty = 0;
+    std::atomic<uint64_t> closeCallFailedQty = 0;
+    std::atomic<uint64_t> putCallQty = 0;
+    std::atomic<uint64_t> putCallFailedQty = 0;
+    std::atomic<uint64_t> removeCallQty = 0;
+    std::atomic<uint64_t> removeCallNotFoundQty = 0;
+    std::atomic<uint64_t> removeCallFailedQty = 0;
+    std::atomic<uint64_t> getCallQty = 0;
+    std::atomic<uint64_t> getCallNotFoundQty = 0;
+    std::atomic<uint64_t> getCallCorruptedQty = 0;
+    std::atomic<uint64_t> getCallFailedQty = 0;
+    std::atomic<uint64_t> getWriteBufferHitQty = 0;
+    std::atomic<uint64_t> getCacheHitQty = 0;
+    std::atomic<uint64_t> queryCallQty = 0;
+    std::atomic<uint64_t> queryCallFailedQty = 0;
+    // Data files
+    std::atomic<uint64_t> dataFileCreationQty = 0;
+    std::atomic<uint64_t> dataFileMaxQty = 0;
+    std::atomic<uint64_t> activeDataFileSwitchQty = 0;
+    // Index
+    std::atomic<uint64_t> indexArrayCleaningQty = 0;
+    std::atomic<uint64_t> indexArrayCleanedEntries = 0;
+    // Maintenance (merge / compaction)
+    std::atomic<uint64_t> mergeCycleQty = 0;
+    std::atomic<uint64_t> mergeCycleWithMergeQty = 0;
+    std::atomic<uint64_t> mergeGainedDataFileQty = 0;
+    std::atomic<uint64_t> mergeGainedBytes = 0;
+    std::atomic<uint64_t> hintFileCreatedQty = 0;
+};
+
+struct ValueCacheCounters {
+    std::atomic<uint64_t> insertCallQty = 0;
+    std::atomic<uint64_t> getCallQty = 0;
+    std::atomic<uint64_t> removeCallQty = 0;
+    std::atomic<uint64_t> currentInCacheValueQty = 0;
+    std::atomic<uint64_t> hitQty = 0;
+    std::atomic<uint64_t> missQty = 0;
+    std::atomic<uint64_t> evictedQty = 0;
+};
+
+struct DataFileStats {
+    uint64_t fileQty = 0;
+    uint64_t entries = 0;
+    uint64_t entryBytes = 0;
+    uint64_t tombBytes = 0;
+    uint64_t tombEntries = 0;
+    uint64_t deadBytes = 0;
+    uint64_t deadEntries = 0;
+};
+
+struct Config {
+    // General store parameters
+    // ========================
+
+    // 'dataFileMaxBytes' defines the maximum byte size of a data file before switching to a new one.
+    // It implicitly limits the maximum size of the database, as there can be at most 65535 data files.
+    // Bigger data files make the total size bigger (up to 65535 * 4 GiB, i.e. about 281 TB).
+    // Smaller data files make the merge time shorter.
+    uint32_t dataFileMaxBytes = 100'000'000;
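+    // Worked example: with the default 100'000'000-byte files, the theoretical capacity is
+    // 65535 files * 100 MB ~= 6.5 TB; the 4 GiB per-file maximum gives the ~281 TB bound above.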
+    // 'mergeCyclePeriodMs' defines the merge period for the database, in milliseconds.
+    // This merge cycle first checks whether the 'merge' process is needed. If so, the eligible data files
+    // are selected and compacted into defragmented and smaller files which eventually replace the old ones.
+    uint32_t mergeCyclePeriodMs = 60'000;
+    // 'upkeepCyclePeriodMs' defines the upkeep period for the internal structures, in milliseconds.
+    // It mainly covers cache eviction and KeyDir resizing. The latter does not wait for the end of
+    // the cycle and starts working immediately.
+    uint32_t upkeepCyclePeriodMs = 1000;
+    // 'writeBufferFlushPeriodMs' defines the maximum time for the write buffer to be flushed on disk.
+    // This limits the amount of data that can be lost in case of a sudden interruption of the program, while
+    // avoiding a costly disk access at each write operation.
+    // Note that the effective period is the maximum of upkeepCyclePeriodMs and writeBufferFlushPeriodMs.
+    // Note also that the "put" API offers to force-flush directly on disk (with a performance cost).
+    uint32_t writeBufferFlushPeriodMs = 5000;
+    // 'upkeepKeyDirBatchSize' defines the quantity of KeyDir entries to update in a row.
+    // This covers both the KeyDir resizing and the data file compaction mechanisms.
+    // A higher quantity of entries makes the transition finish earlier, at the price of higher latency
+    // spikes on entry write or update. A too low value could paradoxically induce a forced resizing of the
+    // remaining part of the KeyDir if the next resize arrives before the end of the previous one.
+    uint32_t upkeepKeyDirBatchSize = 100'000;
+    // 'upkeepValueCacheBatchSize' defines the quantity of cached value entries to update in a row in the LRU.
+    // A higher quantity of entries makes the background task finish earlier, at the price of higher latency
+    // spikes on entry write or update. A too low value could paradoxically force cleaning and eviction of
+    // cached values at insertion time.
+    uint32_t upkeepValueCacheBatchSize = 10000;
+    // 'valueCacheTargetMemoryLoadPercentage' configures the target load for the cache, so that the remaining free space
+    // ensures a performant insertion in the cache. The eviction required to meet this target load is deferred to a
+    // background task. Too low a value wastes cache memory. Too high a value prevents the insertion of a new entry
+    // for lack of free space.
+    uint32_t valueCacheTargetMemoryLoadPercentage = 90;
+
+    // Merge Triggers
+    // ==============
+    // They determine the conditions under which merging will be invoked. They fall into two basic categories:
+
+    // 'mergeTriggerDataFileFragmentationPercentage' describes the percentage of dead keys to total keys in a file
+    // that triggers merging.
+    // Increasing this value will cause merging to occur less often.
+    uint32_t mergeTriggerDataFileFragmentationPercentage = 50;
+    // 'mergeTriggerDataFileDeadByteThreshold' describes how much data stored for dead keys in a single file will trigger merging.
+    // Increasing the value causes merging to occur less often, whereas decreasing the value causes merging to happen more often.
+    uint32_t mergeTriggerDataFileDeadByteThreshold = 50'000'000;
+
+    // Merge data file selection
+    // =========================
+    // These parameters determine which files will be selected for inclusion in a merge operation.
+
+    // 'mergeSelectDataFileFragmentationPercentage' describes which percentage of dead keys to total keys in a file causes
+    // it to be included in the merge.
+    // Note: this value shall be equal to or less than the corresponding trigger threshold.
+    uint32_t mergeSelectDataFileFragmentationPercentage = 30;
+    // 'mergeSelectDataFileDeadByteThreshold' describes the minimum amount of data occupied by dead keys in a file
+    // that causes it to be included in the merge.
+    // Note: this value shall be equal to or less than the corresponding trigger threshold.
+    uint32_t mergeSelectDataFileDeadByteThreshold = 10'000'000;
+    // 'mergeSelectDataFileSmallSizeTheshold' describes the size below which a file is included in the merge.
+    // The purpose is to reduce the quantity of small data files and keep the open file count low.
+    uint32_t mergeSelectDataFileSmallSizeTheshold = 10'000'000;
+};
+
+enum class LogLevel { Debug = 0, Info = 1, Warn = 2, Error = 3, Fatal = 4, None = 5 };
+
+// Defines the part of the key [start index; size[ that is used as an index/tag
+// Example: Consider the key "UJohn Doe/CUS/TTax document/0001" with indexes [ (0, 9), (10, 3), (14, 13) ]
+// This allows querying for any User "UJohn Doe", or Country "CUS", or Type "TTax document" entries.
+// In this pure text key example (binary would be more efficient but less readable as an example), the first byte
+// ('U', 'C' or 'T') prevents mixing "columns", and the separating "/" is purely for human readability.
+struct KeyIndex {
+    uint8_t startIdx;
+    uint8_t size;
+};
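+
+// Illustrative sketch (the exact put/query signatures appear later in this file; the variable
+// names below are assumptions): building the documented example key and its index descriptors.
+/*
+std::string key = "UJohn Doe/CUS/TTax document/0001";
+std::vector<litecask::KeyIndex> indexes{{0, 9}, {10, 3}, {14, 13}};
+// Storing the entry with these indexes later allows retrieving all keys whose indexed part
+// matches, e.g. all entries tagged with the country part "CUS".
+*/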
+
+// This structure defines a 'query result' by providing a memory span, when using an arena allocator query API.
+struct QueryResult {
+    uint8_t* ptr;
+    uint16_t size;
+};
+
+// ==========================================================================================
+// Arena allocator
+// ==========================================================================================
+
+// Thread safety shall be enforced externally
+class ArenaAllocator
+{
+   public:
+    // 'minAllocChunkBytes' is the performed allocation size if the requested amount is smaller than this value.
+    // For efficiency reasons, it should be several orders of magnitude larger than the typical allocation size.
+    ArenaAllocator(size_t minAllocChunkBytes = 1024 * 1024) : _minAllocChunkBytes(minAllocChunkBytes) {}
+
+    ~ArenaAllocator()
+    {
+        for (auto& m : _memChunks) { delete[] m.basePtr; }
+    }
+
+    uint8_t* allocate(size_t bytes)
+    {
+        // Ensure 8-bytes alignment
+        bytes = (bytes + 7) & (~((size_t)0x7));
+
+        // Ensure enough space
+        while (_currentIdx < _memChunks.size() && !_memChunks[_currentIdx].isEnoughSpace(bytes)) { ++_currentIdx; }
+        if (_currentIdx >= _memChunks.size()) {
+            size_t allocatedSize = std::max(_minAllocChunkBytes, bytes);
+            _memChunks.push_back({new uint8_t[allocatedSize], 0, allocatedSize});
+        }
+
+        MemChunk& m = _memChunks[_currentIdx];
+        uint8_t* ptr = m.basePtr + m.usedSize;
+        m.usedSize += bytes;
+        _allocatedBytes += bytes;
+
+        return ptr;
+    }
+
+    size_t getAllocatedBytes() const { return _allocatedBytes; }
+
+    void reset()
+    {
+        if (!_memChunks.empty()) {
+            for (size_t i = 0; i <= _currentIdx; ++i) { _memChunks[i].usedSize = 0; }
+        }
+        _allocatedBytes = 0;
+        _currentIdx = 0;
+    }
+
+   private:
+    struct MemChunk {
+        uint8_t* basePtr = nullptr;
+        size_t usedSize = 0;
+        size_t allocatedSize = 0;
+        bool isEnoughSpace(size_t bytes) const { return usedSize + bytes < allocatedSize; }
+    };
+    size_t _allocatedBytes = 0;
+    size_t _currentIdx = 0;
+    lcVector<MemChunk> _memChunks;
+    const size_t _minAllocChunkBytes = 0;
+};
+
+namespace  // Local functions namespace
+{
+
+// ==========================================================================================
+// Internal file helper functions
+// ==========================================================================================
+
+// OS common
+
+enum class OsOpenMode { READ, APPEND };
+
+struct DirEntry {
+    lcString name;
+    bool isDir;
+};
+
+inline bool
+osGetDirContent(const fs::path& path, lcVector<DirEntry>& entries)
+{
+    entries.clear();
+    std::error_code ec;
+    for (auto const&
dirEntryIt : fs::directory_iterator(path, ec)) { + if (dirEntryIt.is_regular_file() || dirEntryIt.is_directory()) { + entries.push_back(DirEntry{(--(dirEntryIt.path()).end())->string(), dirEntryIt.is_directory()}); + } + } + return (!ec); +} + +inline int64_t +osGetFileSize(const fs::path& path) +{ + std::error_code ec; + int64_t fileSize = (int64_t)fs::file_size(path, ec); + return ec ? (int64_t)-1 : fileSize; +} + +inline bool +osRenameFile(const fs::path& from, const fs::path& to) +{ + std::error_code ec; + fs::rename(from, to, ec); + return (!ec); +} + +inline bool +osRemoveFile(const fs::path& path) +{ + std::error_code ec; + return fs::remove(path, ec); +} + +#if defined(_MSC_VER) +// Windows +using lcOsFileHandle = HANDLE; +const lcOsFileHandle InvalidFileHandle = INVALID_HANDLE_VALUE; + +// UTF-8 -> UTF-16 conversion for interacting with Windows API +std::wstring +utf8ToUtf16(const lcString& s) +{ + constexpr uint32_t offsetPerTrailingByte[3] = {0x0, 0x3080, 0xE2080}; + std::wstring outUtf16; + outUtf16.reserve(s.size()); + const char* cursor = &s[0]; + const char* endInput = cursor + s.size(); + int trailingBytes; + + while (cursor < endInput) { + if (((*cursor) & 0x80) == 0x00) + trailingBytes = 0; + else if (((*cursor) & 0xE0) == 0xC0) + trailingBytes = 1; + else if (((*cursor) & 0xF0) == 0xE0) + trailingBytes = 2; + else { + break; + } // Failure, only 16 bits is supported, not 32 bits codepoints + if (cursor + trailingBytes >= endInput) { break; } // Failure due to corrupted input + + uint32_t output = 0; + switch (trailingBytes) { + case 2: + output += *cursor++; + output <<= 6; // fall through + case 1: + output += *cursor++; + output <<= 6; // fall through + case 0: + output += *cursor++; + } + outUtf16.push_back((char16_t)(output - offsetPerTrailingByte[trailingBytes])); + } + + return outUtf16; +} + +// UTF-16 -> UTF-8 conversion for interacting with Windows API +lcString +utf16ToUtf8(const std::wstring& s) +{ + constexpr uint8_t firstBytes[4] = {0x00, 0x00, 0xC0, 0xE0}; + lcString outUtf8; + outUtf8.reserve(s.size()); + + for (wchar_t codepoint : s) { + if ((codepoint >= 0xD800 && codepoint <= 0xDBFF)) break; // Failure, corrupted input + int outSize = (codepoint < 0x80) ? 1 : ((codepoint < 0x800) ? 
2 : 3); + + size_t curSize = outUtf8.size(); + outUtf8.resize(curSize + outSize); + switch (outSize) { + case 3: + outUtf8[curSize + 2] = (uint8_t)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; // fall through + case 2: + outUtf8[curSize + 1] = (uint8_t)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; // fall through + case 1: + outUtf8[curSize + 0] = (uint8_t)(codepoint | firstBytes[outSize]); + } + } + return outUtf8; +} + +// For "standard" file usage, with userland cache +inline FILE* +osFopen(const fs::path& path, const lcString& mode) +{ + return _wfopen(utf8ToUtf16(path.string()).c_str(), utf8ToUtf16(mode).c_str()); +} + +// For the data files live access which requires specific characteristics: no cache, random read location, always end of file write +inline lcOsFileHandle +osOsOpen(const fs::path& path, OsOpenMode mode) +{ + if (mode == OsOpenMode::READ) { + return CreateFileW((LPCWSTR)utf8ToUtf16(path.string()).c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL | FILE_FLAG_RANDOM_ACCESS, NULL); + } else if (mode == OsOpenMode::APPEND) { + return CreateFileW((LPCWSTR)utf8ToUtf16(path.string()).c_str(), FILE_APPEND_DATA | GENERIC_READ, FILE_SHARE_READ, NULL, + CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_RANDOM_ACCESS, NULL); + } + return InvalidFileHandle; +} + +inline bool +osOsRead(lcOsFileHandle handle, void* buffer, size_t bufferSize, uint32_t fileOffset) +{ + DWORD readBytes; + OVERLAPPED overlap = {0}; + overlap.Offset = fileOffset; // Using "overlap" to have a local read offset. Call should be synchronous. + bool status = ReadFile(handle, buffer, (DWORD)bufferSize, &readBytes, &overlap); + if (!status && GetLastError() == ERROR_IO_PENDING) { status = GetOverlappedResult(handle, &overlap, &readBytes, TRUE); } + return status; +} + +inline bool +osOsWrite(lcOsFileHandle handle, const void* buffer, size_t bufferSize) +{ + DWORD writtenBytes; + OVERLAPPED overlap = {0}; + overlap.Offset = 0xFFFFFFFF; // Using "overlap" to have a local write offset at the end. Call should be synchronous. 
+ overlap.OffsetHigh = 0xFFFFFFFF; + bool status = WriteFile(handle, buffer, (DWORD)bufferSize, &writtenBytes, &overlap); + if (!status && GetLastError() == ERROR_IO_PENDING) { status = GetOverlappedResult(handle, &overlap, &writtenBytes, TRUE); } + return status; +} + +inline void +osOsClose(lcOsFileHandle handle) +{ + CloseHandle(handle); +} + +inline bool +osIsValidHandle(lcOsFileHandle handle) +{ + return (handle != InvalidFileHandle); +} + +#else +// Linux +using lcOsFileHandle = int; +constexpr lcOsFileHandle InvalidFileHandle = -1; + +// For "standard" file usage, with userland cache +inline FILE* +osFopen(const fs::path& path, const lcString& mode) +{ + return fopen(path.c_str(), mode.c_str()); +} + +// For the data files live access which requires specific characteristics: no cache, random read location, always end of file write +inline lcOsFileHandle +osOsOpen(const lcString& path, OsOpenMode mode) +{ + if (mode == OsOpenMode::READ) { + return ::open(path.c_str(), O_RDONLY); + } else if (mode == OsOpenMode::APPEND) { + return ::open(path.c_str(), O_RDWR | O_APPEND | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + } + return InvalidFileHandle; +} + +inline bool +osOsRead(lcOsFileHandle handle, void* buffer, size_t bufferSize, uint32_t fileOffset) +{ + return (pread(handle, buffer, bufferSize, fileOffset) == (ssize_t)bufferSize); +} + +inline bool +osOsWrite(lcOsFileHandle handle, const void* buffer, size_t bufferSize) +{ + return (write(handle, buffer, bufferSize) == (ssize_t)bufferSize); +} + +inline void +osOsClose(lcOsFileHandle handle) +{ + ::close(handle); +} + +inline bool +osIsValidHandle(lcOsFileHandle handle) +{ + return (handle > InvalidFileHandle); +} + +#endif + +// ========================================================================================== +// Lock file (to ensure that at most 1 process accesses the same database) +// ========================================================================================== + +inline Status +lockDatabase(const fs::path& dbDirectory) +{ + fs::path lockFilename = dbDirectory / "litecask.lockfile"; + + int64_t ourPid = getpid(); + char pidString[32]; + + constexpr int MaxTryQty = 3; + int remainingTryQty = MaxTryQty; + while (remainingTryQty > 0) { + if (remainingTryQty != MaxTryQty) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); } + remainingTryQty--; + + // Check the current lock file, if exists + if (fs::exists(lockFilename)) { + // Read the content + FILE* fh = osFopen(lockFilename, "rb"); + if (!fh) { continue; } + size_t length = fread(pidString, 1, sizeof(pidString) - 1, fh); + fclose(fh); + if (length <= 0 || length >= sizeof(pidString)) { continue; } // Lock content may not be written yet + pidString[length] = 0; + int64_t readPid = strtoll(pidString, nullptr, 10); + if (readPid == 0) { continue; } // Weird case, the written content seems not a hex number, as it should... + + // Check against the current processes in the system +#if defined(_MSC_VER) + // Windows: The 'OpenProcess' API allows to check if the process with ID 'readPid' is still alive. 
+            // Note: it shall not exit with code "STILL_ACTIVE" (259) for this to work
+            bool isProcessStillRunning = true;
+            HANDLE hPrevProc = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, false, (DWORD)readPid);
+            if (hPrevProc == NULL) {
+                isProcessStillRunning = false;
+            } else {
+                DWORD cr;
+                if (GetExitCodeProcess(hPrevProc, &cr) == 0 || cr != STILL_ACTIVE) { isProcessStillRunning = false; }
+                CloseHandle(hPrevProc);
+            }
+            if (isProcessStillRunning) { return Status::StoreAlreadyInUse; }  // Database is locked by an existing process
+#else
+            // Linux: Check in the virtual filesystem of the kernel if a process with ID 'readPid' is listed
+            lcVector<DirEntry> entries;
+            if (!osGetDirContent("/proc/", entries)) { return Status::BadDiskAccess; }
+            for (const auto& e : entries) {
+                if (!e.isDir) continue;  // We search only for directories with the PID as name
+                if (!strcmp(e.name.c_str(), pidString)) { return Status::StoreAlreadyInUse; }  // Database is locked by an existing process
+            }
+#endif
+
+            // Not a valid lock: we remove it
+            if (!osRemoveFile(lockFilename)) { return Status::BadDiskAccess; }
+        }
+
+        FILE* fh = osFopen(lockFilename, "wbx");  // 'x' ensures that the file is created by this very call
+        if (!fh) { continue; }
+        snprintf(pidString, sizeof(pidString), "%" PRId64, ourPid);
+        if (fwrite(pidString, 1, strlen(pidString), fh) != (size_t)strlen(pidString)) {
+            fclose(fh);
+            return Status::BadDiskAccess;
+        }
+        fclose(fh);
+        return Status::Ok;
+
+    }  // End of retries
+
+    return Status::BadDiskAccess;
+}
+
+inline Status
+unlockDatabase(const fs::path& dbDirectory)
+{
+    fs::path lockFilename = dbDirectory / "litecask.lockfile";
+    if (!fs::exists(lockFilename)) { return Status::Ok; }  // Weird, but the lock is then assumed to be no longer valid
+
+    // Sanity: check that it is indeed our lock (the content shall be our PID)
+    // Read the content
+    int64_t ourPid = getpid();
+    char pidString[32];
+
+    FILE* fh = osFopen(lockFilename, "rb");
+    if (!fh) { return Status::Ok; }  // Weird case. Anyway, the lock is then assumed to be no longer valid
+    size_t length = fread(pidString, 1, sizeof(pidString) - 1, fh);
+    fclose(fh);
+
+    if (length <= 0 || length >= sizeof(pidString)) {
+        return Status::Ok;  // Weird case. Anyway, the lock is then assumed to be no longer valid
+    }
+    pidString[length] = 0;
+    int64_t readPid = strtoll(pidString, nullptr, 10);
+    if (readPid != ourPid) { return Status::Ok; }  // Weird case. Anyway, the lock is then assumed to be no longer valid
+
+    // Our lock: we remove it
+    if (!osRemoveFile(lockFilename)) { return Status::BadDiskAccess; }
+    return Status::Ok;
+}
+
+}  // namespace
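+
+// Illustrative usage sketch (not part of the header; the path below is an assumption): the lock
+// file above is what makes a concurrent open of the same database directory fail while the owning
+// process is alive.
+/*
+// Process A
+litecask::Datastore store;
+store.open("/tmp/db");  // Creates /tmp/db/litecask.lockfile containing A's PID
+
+// Process B, while A is still running
+litecask::Datastore store;
+assert(store.open("/tmp/db") == litecask::Status::StoreAlreadyInUse);
+*/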
+
+// ==========================================================================================
+// Internal
+// ==========================================================================================
+
+namespace detail
+{
+
+// Storage location, as a compressed pointer (32 bits)
+typedef uint32_t KeyLoc;
+typedef uint32_t ValueLoc;
+
+// Constants
+constexpr const char DataFileSuffix[] = ".litecask_data";
+constexpr const char HintFileSuffix[] = ".litecask_hint";
+constexpr const char TmpFileSuffix[] = ".tmp";
+constexpr const char LogFileSuffix[] = ".log";
+constexpr const char ToRemoveFileSuffix[] = ".litecask_to_remove";
+constexpr uint32_t DiskWorkBufferSize = 10'000'000;
+constexpr uint32_t MinDataFileMaxBytes = 1024;
+constexpr uint64_t CpuCacheLine = 2 * 64;  // Destructive interference spans 2 cache lines
+constexpr uint32_t DeletedEntry = 0xFFFFFFFF;
+constexpr ValueLoc NotStored = 0xFFFFFFFF;  // Sentinel for "not stored"
+constexpr size_t MaxValueSize = 0xFFFF0000;
+
+// KeyDir table associativity. 1 is classical (1-associative), 8 is the maximum fitting a cache line
+// (8-associative, so 8*8=64 bytes). A power of two is expected to keep the cache line alignment,
+// so valid values are 1, 2, 4 and 8.
+constexpr uint32_t KeyDirAssocQty = 8;
+
+// Arbitrary constant value. On a range of the first 256 bytes of a key, 64 indexes should be enough for everyone
+constexpr uint32_t MaxKeyIndexQty = 64;
+
+// Default write buffer byte size
+// In practice, its value does not matter much as long as it amortizes the calls to the kernel by a reasonable factor
+constexpr uint32_t DefaultWriteBufferBytes = 100'000;
+
+// Big allocation of virtual memory. Physical memory will be 'committed' by the OS depending on the real need.
+// Such an automatically extended memory chunk provides a common base address and enables 32-bit pointer compression on 64-bit arch.
+// A 32-bit compressed pointer is simply the delta between the memory pointer and the common base pointer, shifted by 3 bits as
+// an 8-byte alignment is enforced. As such, the total addressable memory range is 35 bits = 32 GB.
+constexpr uint64_t KeyStorageAllocBytes = (uint64_t)16384 * 1024 * 1024;  // Yes, a huge allocation, but mostly virtual memory
+
+constexpr uint32_t ValueFlagQueueTypeMask = 0x3;  // 2 bits for queue types
+constexpr uint32_t ValueFlagActive = 0x4;  // Bit set when the cache value is accessed. Used by the deferred bumping LRU mechanism
+constexpr uint32_t ValueMutexQty = 1024;   // "Bucketized" cache lock
+
+// On-file hint entry: 16 bytes + index size + key size
+// Structure of an entry in a hint file
+struct HintFileEntry {
+    uint32_t fileOffset;
+    uint32_t expTimeSec;
+    uint32_t valueSize;
+    uint16_t keySize;
+    uint8_t keyIndexSize;  // In bytes
+    uint8_t reserved;
+    // uint8_t data[0]  The key then the indexes are stored here
+};
+
+// On-file entry: 16 bytes + index size + key size + value size
+// Structure of an entry in a data file
+struct DataFileEntry {
+    uint32_t checksum;  // LITECASK_HASH_FUNC low 32 bits
+    uint32_t expTimeSec;
+    uint32_t valueSize;
+    uint16_t keySize;
+    uint8_t keyIndexSize;  // In bytes
+    uint8_t reserved;
+    // uint8_t data[0]  The key, the indexes, then the value are stored here
+};
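+
+// Worked layout example: a data file entry with a 16-byte key, two KeyIndex descriptors
+// (keyIndexSize = 4 bytes) and a 100-byte value occupies 16 (header) + 16 + 4 + 100 = 136 bytes
+// on disk; the matching hint file entry occupies 16 + 16 + 4 = 36 bytes.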
+
+// In-memory KeyDir entry: it is composed of 2 parts
+// 1) The first part is inside the hashtable and points to the key location and its metadata (8 bytes)
+struct MapEntry {
+    uint32_t hash;
+    KeyLoc loc;
+};
+
+// 2) The second part is the metadata pointed to by the first part: 22 bytes + key size
+struct KeyChunk {
+    uint32_t expTimeSec;
+    uint32_t valueSize;     // max key+value size is 4GB
+    ValueLoc cacheLocation;
+    uint32_t fileOffset;    // max data file size is 4 GB
+    uint16_t fileId;
+    uint16_t keySize;
+    uint8_t keyIndexSize;
+    uint8_t changeCounter;  // Incremented at each update, to prevent ABA problems
+    // uint8_t data[0]  The key then the indexes are stored here
+};
+
+// Value cache metadata
+struct ValueChunk {
+    uint64_t ownerId;  // Typically the 64-bit hash of the key
+    uint32_t expTimeSec;
+    uint32_t size;
+    uint16_t flags;
+    uint16_t unused;
+    ValueLoc prev;
+    ValueLoc next;
+};
+
+// This structure is used when loading a hint or data file
+struct LoadedKeyChunk {
+    KeyChunk metadata;
+    uint32_t keyHash;
+    uint8_t* key;
+    uint8_t* keyIndexes;
+};
+
+// In-memory KeyDirPatch entry: 16 bytes
+// Update to apply to a KeyDir entry when merging
+struct KeyDirPatch {
+    uint32_t keyHash;        // Identifier. Unique if combined with the old fileId and fileOffset
+    uint32_t oldFileOffset;  // The patch shall not be applied if this value differs
+    uint32_t fileOffset;     // max data file size is 4 GB
+    uint16_t oldFileId;      // The patch shall not be applied if this value differs
+    uint16_t newFileId;
+};
+
+struct MergeFileInfo {
+    uint16_t fileId;
+    lcVector<KeyDirPatch> patches;
+};
+
+// In-memory DataFile: information and statistics for a data file
+struct DataFile {
+    lcString filename;
+    lcOsFileHandle handle = InvalidFileHandle;
+    std::atomic<uint32_t> bytes = 0;
+    std::atomic<uint32_t> entries = 0;
+    std::atomic<uint32_t> tombBytes = 0;
+    std::atomic<uint32_t> tombEntries = 0;
+    std::atomic<uint32_t> deadBytes = 0;
+    std::atomic<uint32_t> deadEntries = 0;
+
+    void dump(int index, bool isActive = false) const
+    {
+        if (index < 0) {
+            if (!osIsValidHandle(handle)) return;
+            printf("  %s%s:\n", filename.c_str(), isActive ? " (ACTIVE)" : "");
+        } else {
+            printf("  %3d) %s%s:\n", index, filename.c_str(), isActive ?
" (ACTIVE)" : ""); + } + if (!osIsValidHandle(handle)) { + printf(" Not in use\n"); + } else { + printf(" Bytes : %8u\n", bytes.load()); + printf(" Entries : %8u\n", entries.load()); + printf(" Tomb bytes : %8u\n", tombBytes.load()); + printf(" Tomb entries: %8u\n", tombEntries.load()); + printf(" Dead bytes : %8u\n", deadBytes.load()); + printf(" Dead entries: %8u\n", deadEntries.load()); + } + } +}; + +// ========================================================================================== +// Wyhash https://github.com/wangyi-fudan/wyhash/tree/master (18a25157b modified) +// This is free and unencumbered software released into the public domain under The Unlicense +// (http://unlicense.org/) +// ========================================================================================== + +static inline void +_wymum(uint64_t* A, uint64_t* B) +{ +#if defined(_MSC_VER) + *A = _umul128(*A, *B, B); +#else + __uint128_t r = *A; + r *= *B; + *A = (uint64_t)r; + *B = (uint64_t)(r >> 64); +#endif +} + +static inline uint64_t +_wymix(uint64_t A, uint64_t B) +{ + _wymum(&A, &B); + return A ^ B; +} +static inline uint64_t +_wyr8(const uint8_t* p) +{ + uint64_t v; // NOLINT(cppcoreguidelines-init-variables) + memcpy(&v, p, 8); + return v; +} +static inline uint64_t +_wyr4(const uint8_t* p) +{ + uint32_t v; // NOLINT(cppcoreguidelines-init-variables) + memcpy(&v, p, 4); + return v; +} +static inline uint64_t +_wyr3(const uint8_t* p, size_t k) +{ + return (((uint64_t)p[0]) << 16) | (((uint64_t)p[k >> 1]) << 8) | p[k - 1]; +} + +static inline uint64_t +wyhash(const void* key, size_t len) +{ + constexpr uint64_t secret0 = 0x2d358dccaa6c78a5ull; + constexpr uint64_t secret1 = 0x8bb84b93962eacc9ull; + constexpr uint64_t secret2 = 0x4b33a62ed433d4a3ull; + constexpr uint64_t secret3 = 0x4d5a2da51de1aa47ull; + const uint8_t* p = (const uint8_t*)key; + uint64_t seed = 0xca813bf4c7abf0a9ull; // seed ^= _wymix(seed ^ secret0, secret1); with fixed seed = 0 + uint64_t a = 0, b = 0; + + if (LITECASK_LIKELY(len <= 16)) { + if (LITECASK_LIKELY(len >= 4)) { + a = (_wyr4(p) << 32) | _wyr4(p + ((len >> 3) << 2)); + b = (_wyr4(p + len - 4) << 32) | _wyr4(p + len - 4 - ((len >> 3) << 2)); + } else if (LITECASK_LIKELY(len > 0)) { + a = _wyr3(p, len); + b = 0; + } else { + a = b = 0; + } + } else { + size_t i = len; + if (LITECASK_UNLIKELY(i >= 48)) { + uint64_t see1 = seed, see2 = seed; + do { + seed = _wymix(_wyr8(p) ^ secret1, _wyr8(p + 8) ^ seed); + see1 = _wymix(_wyr8(p + 16) ^ secret2, _wyr8(p + 24) ^ see1); + see2 = _wymix(_wyr8(p + 32) ^ secret3, _wyr8(p + 40) ^ see2); + p += 48; + i -= 48; + } while (LITECASK_LIKELY(i >= 48)); + seed ^= see1 ^ see2; + } + while (LITECASK_UNLIKELY(i > 16)) { + seed = _wymix(_wyr8(p) ^ secret1, _wyr8(p + 8) ^ seed); + i -= 16; + p += 16; + } + a = _wyr8(p + i - 16); + b = _wyr8(p + i - 8); + } + a ^= secret1; + b ^= seed; + _wymum(&a, &b); + return _wymix(a ^ secret0 ^ len, b ^ secret1); +} + +#define LITECASK_HASH_FUNC(key, keySize) wyhash(key, keySize) + +// ========================================================================================== +// Readers-writer lock +// ========================================================================================== + +#ifdef LITECASK_STANDARD_SHARED_MUTEX + +// The standard shared mutex is a risk-less implementation but its performance does not scale well with increasing thread quantity (on Linux +// at least) probably due to its generic implementation not focussed enough on false sharing across cores and avoiding going to 
the kernel.
+// Measurements show that 100% reading scales poorly and that ~5% writing collapses the overall performance.
+// This RWLock is an encapsulation of the standard library one; its presence just serves validation and comparison purposes.
+class RWLock
+{
+   public:
+    RWLock() {}
+    ~RWLock() {}
+
+    void lockRead() { _mx.lock_shared(); }
+    void unlockRead() { _mx.unlock_shared(); }
+    void lockWrite() { _mx.lock(); }
+    void unlockWrite() { _mx.unlock(); }
+
+   private:
+    std::shared_mutex _mx;
+};
+#else
+
+// This custom implementation of a shared mutex avoids costly false sharing by having a dedicated cache line per thread to mark
+// its lock request (more memory is used compared to the standard shared mutex). The cost of the per-reader check is moved to the
+// exclusive lock side, which fits our system where writing operations are serialized and more expensive. Also, lock requests spin
+// before going to the kernel, improving the reactivity in most cases.
+class RWLock
+{
+    // Soft limit on the thread quantity before switching to an exclusive lock as a fallback
+    static constexpr uint64_t MaxThreads = 32;
+
+   public:
+    RWLock() : excLockReq(false), stateArraySptr(std::make_shared<StateArray>()), stateArray(*stateArraySptr)
+    {
+        static std::atomic<uint32_t> uniqueIdGenerator = 1;  // Global for all RWLock objects
+        uniqueLockId = uniqueIdGenerator++;
+        (void)reserved;  // For nitty compilers
+    }
+
+    ~RWLock()
+    {
+        for (auto& lock : stateArray) { lock.state = Invalid; }
+    }
+
+    void lockWrite()
+    {
+        // Mostly spin until the writer flag is acquired
+        uint64_t counter = 0;
+        bool oldValue = false;
+        while (!excLockReq.compare_exchange_weak(oldValue, true, std::memory_order_seq_cst)) {
+            oldValue = false;
+            if (((++counter) & 0x0FFFFF) == 0) { std::this_thread::yield(); }  // Back to the OS scheduler if spinning too long
+        }
+
+        // Wait for readers to stop using the shared lock
+        for (auto& i : stateArray) {
+            while (i.state.load(std::memory_order_seq_cst) >= Busy) {}
+        }
+    }
+
+    void unlockWrite()
+    {
+        excLockReq.store(false, std::memory_order_release);  // Simply release the writer flag
+    }
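+
+    // Usage sketch (illustrative): readers wrap lookups with lockRead()/unlockRead(), writers take
+    // lockWrite()/unlockWrite(). Note that lockWrite() is writer-preferring: it first raises
+    // excLockReq and then waits for the per-thread reader slots to drain, so incoming readers
+    // back off in lockRead() until the writer is done.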
+
+    void lockRead()
+    {
+        // Get the index of this thread in this lock
+        int threadIndex = accessIndex();
+
+        // Check if the thread has no index yet and there is some potentially free index
+        if (threadIndex < 0 && stateArraySptr.use_count() <= (int)stateArray.size()) {
+            for (int idx = 0; idx < (int)stateArray.size(); ++idx) {
+                int oldValue = Uninit;
+                if (stateArray[idx].state == Uninit && stateArray[idx].state.compare_exchange_strong(oldValue, Free)) {
+                    accessIndex(idx);
+                    threadIndex = idx;
+                    break;
+                }
+            }
+        }
+
+        if (threadIndex >= 0) {
+            // Notify the reader's access
+            stateArray[threadIndex].state.store(Busy, std::memory_order_seq_cst);
+
+            // If a writer has the exclusive lock, then roll back and wait until it finishes
+            while (excLockReq.load(std::memory_order_seq_cst)) {
+                // Roll back the reader's access
+                stateArray[threadIndex].state.store(Free, std::memory_order_seq_cst);
+                // Wait until the writer releases the lock
+                uint64_t counter = 0;
+                while (excLockReq.load(std::memory_order_seq_cst)) {
+                    if (((++counter) & 0x0FFFFF) == 0) { std::this_thread::yield(); }  // Back to the OS scheduler if spinning too long
+                }
+                // Notify the reader's access again. If no writer has the lock, this prevents them from taking it.
+                stateArray[threadIndex].state.store(Busy, std::memory_order_seq_cst);
+            }
+        } else {
+            // Case of more threads than the array size: fall back to the exclusive lock, but without waiting for other readers
+            uint64_t counter = 0;
+            bool oldValue = false;
+            while (!excLockReq.compare_exchange_weak(oldValue, true, std::memory_order_seq_cst)) {
+                oldValue = false;
+                if (((++counter) & 0x0FFFFF) == 0) { std::this_thread::yield(); }  // Back to the OS scheduler if spinning too long
+            }
+        }
+    }
+
+    void unlockRead()
+    {
+        int threadIndex = accessIndex();
+        if (threadIndex >= 0) {
+            stateArray[threadIndex].state.store(Free, std::memory_order_release);
+        } else {
+            excLockReq.store(false, std::memory_order_release);  // Case of more threads than the array size: exclusive lock fallback
+        }
+    }
+
+   private:
+    // Constants
+    static constexpr int Invalid = -1;
+    static constexpr int Uninit = 0;
+    static constexpr int Free = 1;
+    static constexpr int Busy = 2;
+
+    // Definitions
+    struct State {
+        alignas(CpuCacheLine) std::atomic<int> state{Uninit};
+    };
+    using StateArray = std::array<State, MaxThreads>;
+    using StateArraySptr = std::shared_ptr<StateArray>;
+
+    // Thread-local structure bridging between the current thread and all lock instances
+    struct LockContext {
+        LockContext(int index, const StateArraySptr& ptr) : threadIndex(index), stateArraySptr(ptr) {}
+        LockContext(LockContext&& src) noexcept : threadIndex(src.threadIndex), stateArraySptr(std::move(src.stateArraySptr)) {}
+        LockContext& operator=(LockContext&& src) noexcept
+        {
+            if (this == &src) return *this;
+            threadIndex = src.threadIndex;
+            stateArraySptr = std::move(src.stateArraySptr);
+            return *this;
+        }
+        ~LockContext()
+        {
+            if (stateArraySptr.use_count() > 0) { (*stateArraySptr)[threadIndex].state--; }
+        }
+
+        int threadIndex;
+        StateArraySptr stateArraySptr;
+    };
+
+    int accessIndex(int registrationIndex = -1)
+    {
+        // These per-thread lookups are shared by all RWLock instances
+        thread_local static std::vector<LockContext> perLockContext;
+        thread_local static std::vector<uint32_t> perLockId;  // Cache-friendly way to find the associated context
+
+        // Getter case: return the value from the lookup
+        if (LITECASK_LIKELY(registrationIndex < 0)) {
+            for (size_t i = 0; i < perLockId.size(); ++i) {
+                if (perLockId[i] == uniqueLockId) { return perLockContext[i].threadIndex; }
+            }
+            return -1;
+        }
+
+        // Setter case: create a new context in this thread-local lookup
+        perLockContext.emplace_back(registrationIndex, stateArraySptr);
+        perLockId.push_back(uniqueLockId);
+
+        // Take the opportunity to clean up all deleted locks known to this thread (they are accessible only from this thread)
+        for (size_t i = 0; i < perLockId.size();) {
+            if (perLockContext[i].stateArraySptr->at(perLockContext[i].threadIndex).state < Uninit) {
+                perLockContext[i] = std::move(perLockContext.back());
+                perLockContext.pop_back();
+                perLockId[i] = perLockId.back();
+                perLockId.pop_back();
+            } else {
+                ++i;
+            }
+        }
+        return registrationIndex;
+    }
+
+    // Fields
+    std::atomic<bool> excLockReq;
+    const StateArraySptr stateArraySptr;  // Shared with the thread-local contexts
+    uint8_t reserved[CpuCacheLine];       // Prevents false sharing between threads
+    StateArray& stateArray;               // Intra-object local access, protected by the internal shared ptr
+    uint32_t uniqueLockId;
+};
+
+#endif
+
+// ==========================================================================================
+// TLSF allocator
+// ==========================================================================================
+
+// This allocator possesses interesting properties for our usage: it is fast, the code is small, the response time is bounded,
+// internal fragmentation is bounded and external fragmentation is low in practice (good coalescing).
+// See http://www.gii.upv.es/tlsf/ for details. The code is inspired by https://github.com/jserv/tlsf-bsd .
+// It is not optimized for threading (like tcmalloc or jemalloc) and is less efficient than ptmalloc (derived from dlmalloc,
+// described at https://gee.cs.oswego.edu/dl/html/malloc.html; like TLSF, it is a "heap" allocator) which has specific
+// handling of small and big sizes.
+// This implementation does not rely on "userland managed pages" but on virtual memory, which simplifies the code.
+// Such a single-heap allocator enables the use of "compressed pointers" (32 bits instead of 64 bits), hence reducing overhead.
+
+constexpr uint32_t TlsfAlignShift = 3;  // 3 LSB cleared = 8 bytes alignment
+constexpr uint32_t TlsfSlShift = 4;     // 4 bits for the second layer, so 16 sub-lists
+constexpr uint32_t TlsfFlShift = TlsfSlShift + TlsfAlignShift;
+constexpr uint64_t TlsfSmallSize = (1 << TlsfFlShift);  // Sizes less than 128 bytes go into the first layer
+constexpr uint32_t TlsfSlQty = (1 << TlsfSlShift);
+constexpr uint32_t TlsfFlQty = 32;  // Allows a theoretical max allocation of 1 << (32 + 3 (align) + 4 (sl) - 2 (margin))
+
+constexpr uint64_t TlsfBlockOverhead = sizeof(uint64_t);  // Size of 'dataSizeAndFlags' placed just before payloads
+constexpr uint64_t TlsfFlagFree = 0x1;
+constexpr uint64_t TlsfFlagPrevFree = 0x2;
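+
+// Worked example of the two-level mapping implemented by findSizeFittingList() below (a sketch
+// of the arithmetic, not extra behavior):
+//  - size 80 (< TlsfSmallSize=128) lands in first layer 0, second layer 80/8 = 10;
+//  - size 10000 has its highest bit at index 13 (8192), so first layer = 1 + 13 - TlsfFlShift = 7
+//    and second layer = (10000 >> (13 - TlsfSlShift)) & 15 = 3, i.e. the free list covering [9728; 10240).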
+
+struct tlsfBlock {
+    // The structure of a free block is:
+    //   [offset - 8]  pointer to the previous block in memory. Valid only if the previous block is free
+    //   [offset + 0]  [bit 31 <- bit 2] block size  [bit 1] flag: prev block is free  [bit 0] flag: block is free
+    //   [offset + 8]  Next free block in the free list
+    //   [offset + 16] Previous free block in the free list
+    //
+    // The structure of a used block is:
+    //   [offset - 8]  pointer to the previous block in memory. Valid only if the previous block is free
+    //   [offset + 0]  [bit 31 <- bit 2] block size  [bit 1] flag: prev block is free  [bit 0] flag: block is free
+    //   [offset + 8]  payload (seen by the user)
+
+    tlsfBlock* prevBlockIfFree;  // Valid only if the previous block is free. That is why we need the 'PrevBlockIsFree' flag
+    uint64_t dataSizeAndFlags;   // Real block info = overhead
+    tlsfBlock* nextFreeBlock;    // Valid only if the block is free. Else it is the start of the payload
+    tlsfBlock* prevFreeBlock;    // Valid only if the block is free.
Else it is part the payload + + char* header() { return (char*)this + offsetof(tlsfBlock, dataSizeAndFlags); } + + char* payload() { return header() + sizeof(dataSizeAndFlags); } + + static tlsfBlock* fromPayload(void* mem) { return (tlsfBlock*)((char*)mem - offsetof(tlsfBlock, nextFreeBlock)); } + + tlsfBlock* getNext() { return (tlsfBlock*)(header() + getPayloadSize()); } + + void setPayloadSize(uint64_t size) { dataSizeAndFlags = size | (dataSizeAndFlags & (TlsfFlagFree | TlsfFlagPrevFree)); } + + uint64_t getPayloadSize() const { return (dataSizeAndFlags & ~(TlsfFlagFree | TlsfFlagPrevFree)); } + + bool isFree() const { return (dataSizeAndFlags & TlsfFlagFree); } + + bool isPrevFree() const { return (dataSizeAndFlags & TlsfFlagPrevFree); } + + void setFree(bool state) + { + assert(isFree() != state && "block free bit unchanged"); + dataSizeAndFlags = state ? (dataSizeAndFlags | TlsfFlagFree) : (dataSizeAndFlags & ~TlsfFlagFree); + tlsfBlock* next = getNext(); + next->prevBlockIfFree = this; + next->setPrevFree(state); + } + + void setPrevFree(bool state) + { + dataSizeAndFlags = state ? (dataSizeAndFlags | TlsfFlagPrevFree) : (dataSizeAndFlags & ~TlsfFlagPrevFree); + } +}; + +class TlsfAllocator +{ + public: + TlsfAllocator(uint64_t maxAllocatableBytes) + { + if (maxAllocatableBytes) { +#if defined(_MSC_VER) + // On Windows, virtual allocation is better done in two phases: first reserve the total, then commit by chunks + SYSTEM_INFO sSysInfo; + GetSystemInfo(&sSysInfo); + _allocGranularity = sSysInfo.dwAllocationGranularity; // Preferred to page size, which is very small for our usage + _arenaMaxAllocatableBytes = ((maxAllocatableBytes + _allocGranularity - 1) / _allocGranularity) * _allocGranularity; + _arenaBasePtr = (uint8_t*)VirtualAlloc(NULL, _arenaMaxAllocatableBytes, MEM_RESERVE, PAGE_NOACCESS); + assert(_arenaBasePtr); +#else + _arenaMaxAllocatableBytes = maxAllocatableBytes; + _arenaBasePtr = (uint8_t*)mmap(nullptr, _arenaMaxAllocatableBytes, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); + assert(_arenaBasePtr != MAP_FAILED); // NOLINT +#endif + assert((((uintptr_t)_arenaBasePtr) % (1 << TlsfAlignShift)) == 0); + } + } + + ~TlsfAllocator() + { + if (_arenaMaxAllocatableBytes) { +#if defined(_MSC_VER) + [[maybe_unused]] bool status = VirtualFree(_arenaBasePtr, 0, MEM_RELEASE); + assert(status); +#else + [[maybe_unused]] int status = munmap(_arenaBasePtr, _arenaMaxAllocatableBytes); + assert(status == 0); +#endif + } + } + + void reset() + { + // Reset the allocator. This invalidates all previous allocations + _arenaAllocatedBytes = 0; + _flBitmap = 0; + memset(_slBitmaps, 0, sizeof(_slBitmaps)); + memset(_freeBlocks, 0, sizeof(_freeBlocks)); + _statAllocatedBytes = 0; + } + + void* malloc(uint64_t size) + { + uint64_t adjustedSize = getAdjustedSize(size); + if (LITECASK_UNLIKELY(adjustedSize == 0)) return nullptr; + + // Compute the initial layer indexes + uint32_t firstLayerIdx = 0, secondLayerIdx = 0; + findSizeFittingList(adjustedSize, &firstLayerIdx, &secondLayerIdx); + + tlsfBlock* block = nullptr; + for (int pass = 0; !block && pass < 2; ++pass) { + // Second chance is after growing the arena + if (LITECASK_UNLIKELY(pass == 1 && !extendArena(adjustedSize))) { return nullptr; } + + // Is the second layer of the targeted first layer populated for this size? 
+ uint32_t slBitmap = _slBitmaps[firstLayerIdx] & ~((1U << secondLayerIdx) - 1); // Clear the too small sizes + if (!slBitmap) { + // Need to look up larger first level lists + uint32_t flBitmap = _flBitmap & (~((1U << (firstLayerIdx + 1)) - 1)); + if (LITECASK_UNLIKELY(!flBitmap)) { continue; } // No such free blocks available, go to next pass + + // Update the first layer to this larger one + firstLayerIdx = countTrailingZeros(flBitmap); // Take the lowest bit available, as the bitmap has been masked accordingly + slBitmap = _slBitmaps[firstLayerIdx]; // Update the second layer bitmap, any of these list would fit + assert(slBitmap && "second level bitmap is null in spite of the first level bitmap"); + } + + secondLayerIdx = countTrailingZeros(slBitmap); // Take the lowest bit available, as the bitmap has been masked accordingly + assert(secondLayerIdx < TlsfSlQty && "wrong second level"); + block = _freeBlocks[firstLayerIdx][secondLayerIdx]; + } + assert(block); + assert(block && block->getPayloadSize() >= size && "insufficient block size"); + + detachFreeBlockFromFreeList(block, firstLayerIdx, secondLayerIdx); + + // Split this free block to isolate the (left) part that will be used + if (block->getPayloadSize() >= sizeof(tlsfBlock) + size) { + // Split a block into two, the second of which is free + tlsfBlock* rightBlock = (tlsfBlock*)(block->header() + size); + uint64_t rightBlockSize = block->getPayloadSize() - (size + TlsfBlockOverhead); + rightBlock->dataSizeAndFlags = rightBlockSize | 0; // Initialize without flags + rightBlock->setFree(true); + + // Deduce the removed size from the block + block->setPayloadSize(size); + + // Link the two blocks as "consecutive" + block->getNext()->prevBlockIfFree = block; + rightBlock->setPrevFree(true); // 'block' is not yet in use + + // Insert the remaining free part of the block in the free list + insertBlockInFreeList(rightBlock); + } + + // Mark the block as used + block->setFree(false); + _statAllocatedBytes += TlsfBlockOverhead + block->getPayloadSize(); + return block->payload(); + } + + void free(void* mem) + { + if (LITECASK_UNLIKELY(!mem)) { return; } + + // Back to the block structure + tlsfBlock* block = tlsfBlock::fromPayload(mem); + + assert(!block->isFree() && "block already marked as free"); + block->setFree(true); + assert(_statAllocatedBytes >= TlsfBlockOverhead + block->getPayloadSize()); + _statAllocatedBytes -= TlsfBlockOverhead + block->getPayloadSize(); + + // Merge this free block with previous one if it is free + block = mergeBlockWithPreviousIfFree(block); + + // Merge this free block with next one if it is free + tlsfBlock* next = block->getNext(); + assert(next && "next block can't be null"); + if (next->isFree()) { + // Remove the next block from the free list + uint32_t firstLayerIdx = 0, secondLayerIdx = 0; + findSizeFittingList(next->getPayloadSize(), &firstLayerIdx, &secondLayerIdx); + detachFreeBlockFromFreeList(next, firstLayerIdx, secondLayerIdx); + + // Absorb the next block inside the current one + block->dataSizeAndFlags += next->getPayloadSize() + TlsfBlockOverhead; // Flags untouched + block->getNext()->prevBlockIfFree = block; + } + + // Back in one of the free lists + insertBlockInFreeList(block); + } + + uint32_t compress(uint8_t* ptr) const { return (uint32_t)((uint64_t)(ptr - _arenaBasePtr) >> 3); } + + uint8_t* uncompress(uint32_t compressedPtr) const { return _arenaBasePtr + (((uint64_t)compressedPtr) << 3); } + + uint64_t getRealAllocatedSize(void* mem) { return 
tlsfBlock::fromPayload(mem)->getPayloadSize(); } + + // Returns the in-used quantity (free + overhead), not the mmap-ed one + uint64_t getAllocatedBytes() const { return _statAllocatedBytes; } + + uint64_t getMaxAllocatableBytes() const { return _arenaMaxAllocatableBytes; } + + struct CheckContext { + uint32_t firstLayerIdx; + uint32_t secondLayerIdx; + uint32_t firstLayerMask; + uint32_t secondLayerMask; + }; + + static CheckContext getSizeCheckContext(uint64_t size) + { + CheckContext cc{}; + uint64_t adjustedSize = getAdjustedSize(size); + findSizeFittingList(adjustedSize, &cc.firstLayerIdx, &cc.secondLayerIdx); + cc.firstLayerMask = ~((1U << (cc.firstLayerIdx + 1)) - 1); // Any bit set in this FL mask is ok + cc.secondLayerMask = ~((1U << cc.secondLayerIdx) - 1); // Any bit set in this SL mask from FL firstLayerIdx is ok + return cc; + } + + bool isAllocatable(const CheckContext& cc) const + { + return (_slBitmaps[cc.firstLayerIdx] & cc.secondLayerMask) || (_flBitmap & cc.firstLayerMask); + } + +#ifndef LITECASK_BUILD_FOR_TEST // Allows looking inside the allocator internal, for testing purposes + private: +#endif + // Fix the provided size (minimum size + alignment) and return the corresponding size in the free list + static uint64_t getAdjustedSize(uint64_t& size) + { + // Adjust the allocated size (ceil-aligned with a minimum value) + constexpr uint64_t TlsfMaxSize = ((uint64_t)1) << (TlsfFlQty + TlsfFlShift - 2); // 64 GB should be enough for everyone + // Min size is 24 bytes in this implementation. It could be halved with compressed pointers and reduced max allocatable size. + // However, no benefit would be provided in the context of Litecask, as minimum key and value chunks are above this size. + constexpr uint64_t MinAllocatedSize = sizeof(tlsfBlock) - sizeof(tlsfBlock*); + constexpr uint64_t AlignmentMask = (1 << TlsfAlignShift) - 1; // 0x7 when decrypted, 8 bytes alignment + size = std::max((size + AlignmentMask) & (~AlignmentMask), MinAllocatedSize); + if (LITECASK_UNLIKELY(size > TlsfMaxSize)) { return 0; } + + // Find the list of free blocks with big enough size + uint64_t adjustedSize = size; + if (size >= TlsfSmallSize) { + uint32_t firstLayerBitIdx = (uint32_t)(63 - countLeadingZeros(size)); + uint64_t layersBitMask = (((uint64_t)1) << (firstLayerBitIdx - TlsfSlShift)) - 1; + // Keep only bits from first and second layers, with a ceiling + adjustedSize = (size + layersBitMask) & ~layersBitMask; + } + return adjustedSize; + } + + static void findSizeFittingList(uint64_t size, uint32_t* firstLayerIdx, uint32_t* secondLayerIdx) + { + // Due to the 8-bytes alignment constraint multiplied by the 16 second layers, the initial first layers + // are not fully populated. 
+ // We choose to populate only the layer 0 for "small" block size (128=2^7) then the natural first layers + if (size < TlsfSmallSize) { + *firstLayerIdx = 0; + *secondLayerIdx = (uint32_t)(size / (TlsfSmallSize / TlsfSlQty)); // Step is 128/16 = 8 bytes + } else { + uint32_t firstLayerBitIdx = (uint32_t)(63 - countLeadingZeros(size)); + *firstLayerIdx = 1 + firstLayerBitIdx - TlsfFlShift; + // Shift to keep the second layer bits starting at bit0, and clear the top bit (set to 1, first layer) + *secondLayerIdx = (uint32_t)(size >> (firstLayerBitIdx - TlsfSlShift)) & (TlsfSlQty - 1); + } + assert(*firstLayerIdx < TlsfFlQty && "wrong first level"); + assert(*secondLayerIdx < TlsfSlQty && "wrong second level"); + } + + void detachFreeBlockFromFreeList(tlsfBlock* block, uint32_t firstLayerIdx, uint32_t secondLayerIdx) + { + // Detach the block from list neighbors + tlsfBlock* prev = block->prevFreeBlock; + tlsfBlock* next = block->nextFreeBlock; + if (next) { next->prevFreeBlock = prev; } + if (prev) { prev->nextFreeBlock = next; } + + // Update list head if needed + if (_freeBlocks[firstLayerIdx][secondLayerIdx] == block) { + _freeBlocks[firstLayerIdx][secondLayerIdx] = next; + if (!next) { // Update bitmaps if the list is empty + _slBitmaps[firstLayerIdx] &= ~(1U << secondLayerIdx); // Second layer bitmap + if (!_slBitmaps[firstLayerIdx]) _flBitmap &= ~(1U << firstLayerIdx); // First layer bitmap if second layer is empty + } + } + } + + void insertBlockInFreeList(tlsfBlock* block) + { + // Find the fitting list + uint32_t firstLayerIdx = 0, secondLayerIdx = 0; + findSizeFittingList(block->getPayloadSize(), &firstLayerIdx, &secondLayerIdx); + + // Insert in the list + tlsfBlock* current = _freeBlocks[firstLayerIdx][secondLayerIdx]; + block->nextFreeBlock = current; + block->prevFreeBlock = nullptr; + if (current) current->prevFreeBlock = block; + _freeBlocks[firstLayerIdx][secondLayerIdx] = block; + + // Mark the bitmaps + _flBitmap |= 1U << firstLayerIdx; + _slBitmaps[firstLayerIdx] |= 1U << secondLayerIdx; + } + + tlsfBlock* mergeBlockWithPreviousIfFree(tlsfBlock* block) + { + if (!block->isPrevFree()) { return block; } + + // Remove the previous block from the free list + tlsfBlock* prev = block->prevBlockIfFree; + assert(prev && "prev block can't be null"); + assert(prev->isFree() && "prev block is not free though marked as such"); + uint32_t firstLayerIdx = 0, secondLayerIdx = 0; + findSizeFittingList(prev->getPayloadSize(), &firstLayerIdx, &secondLayerIdx); + detachFreeBlockFromFreeList(prev, firstLayerIdx, secondLayerIdx); + + // Absorb the block inside the previous one + prev->dataSizeAndFlags += block->getPayloadSize() + TlsfBlockOverhead; // Flags untouched + prev->getNext()->prevBlockIfFree = prev; + return prev; + } + + bool extendArena(uint64_t size) + { + // Note: first allocation is shifted by the field prevBlockIfFree + uint64_t firstAllocOverhead = (_arenaAllocatedBytes == 0) ? 
TlsfBlockOverhead : 0; + + // Compute the size to match the allocation granularity + // Mandatory on Windows, but generalized because we do not need fine grained allocation and can avoid its individual cost + assert((size & 0x7) == 0); + uint64_t requiredNewAllocatedSize = _arenaAllocatedBytes + size + TlsfBlockOverhead + firstAllocOverhead; + uint64_t granularNewAllocatedSize = ((requiredNewAllocatedSize + _allocGranularity - 1) / _allocGranularity) * _allocGranularity; + uint64_t granularSize = granularNewAllocatedSize - _arenaAllocatedBytes - TlsfBlockOverhead - firstAllocOverhead; + if (granularNewAllocatedSize > _arenaMaxAllocatableBytes) { return false; } + +#if defined(_MSC_VER) + // Windows: Memory shall be committed before use + [[maybe_unused]] void* committedPtr = + VirtualAlloc(_arenaBasePtr + _arenaAllocatedBytes, granularNewAllocatedSize - _arenaAllocatedBytes, MEM_COMMIT, PAGE_READWRITE); + assert(committedPtr); +#endif + tlsfBlock* addedBlock = (tlsfBlock*)((char*)_arenaBasePtr + _arenaAllocatedBytes - 2 * TlsfBlockOverhead + firstAllocOverhead); + + // 1st alloc requires the creation of the sentinel (null size & not free). Previous is not free either because below the base + // pointer + if (_arenaAllocatedBytes == 0) { + addedBlock->dataSizeAndFlags = /* Null size */ 0 | /* Not free and previous not free either because first block ever */ 0; + } + + // Transform the sentinel into a valid free block with the newly added size + assert(addedBlock->getPayloadSize() == 0 && !addedBlock->isFree() && "the old sentinel is corrupted"); + addedBlock->dataSizeAndFlags |= granularSize | /* Free to recent allocation, previous block flag is untouched */ TlsfFlagFree; + addedBlock = mergeBlockWithPreviousIfFree(addedBlock); + insertBlockInFreeList(addedBlock); + + // Add a new sentinel (dummy last block of null size) + tlsfBlock* sentinel = addedBlock->getNext(); + sentinel->prevBlockIfFree = addedBlock; + sentinel->dataSizeAndFlags = + /* Null size */ 0 | /* Not free (sentinel) and previous block free due to recent allocation */ TlsfFlagPrevFree; + assert(sentinel->getPayloadSize() == 0 && !sentinel->isFree() && "the new sentinel is corrupted"); + + // Commit the allocation + _arenaAllocatedBytes = granularNewAllocatedSize; + return true; + } + +#if defined(_MSC_VER) + static uint32_t countTrailingZeros(uint32_t v) { return _tzcnt_u32(v); } + static uint32_t countLeadingZeros(uint64_t v) { return (uint32_t)_lzcnt_u64(v); } +#else + static uint32_t countTrailingZeros(uint32_t v) { return __builtin_ctz(v); } + static uint32_t countLeadingZeros(uint64_t v) { return __builtin_clzll(v); } +#endif + + // Bitmaps of the first and second layers + uint32_t _flBitmap = 0; + uint32_t _slBitmaps[TlsfFlQty] = {}; + + // Free lists for each (first, second) layer + tlsfBlock* _freeBlocks[TlsfFlQty][TlsfSlQty] = {}; + uint64_t _statAllocatedBytes = 0; + + // Internal arena allocator + uint8_t* _arenaBasePtr = nullptr; + uint64_t _arenaAllocatedBytes = 0; + uint64_t _arenaMaxAllocatableBytes = 0; + uint64_t _allocGranularity = 65536; // Only really required on Windows, but generalized +}; + +// ========================================================================================== +// Value cache +// ========================================================================================== + +inline uint32_t +getValueLockIndex(ValueLoc loc) +{ + uint32_t x = ((uint32_t)loc) ^ 2463534242; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return x & (ValueMutexQty - 1); // Kind of xorshift output +} + 
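+
+// The xorshift-style mixing above implements lock striping: a value location is mapped to one of
+// ValueMutexQty (1024) mutexes, so two unrelated locations rarely contend on the same lock.
+// Illustrative sketch (the mutex array is owned by the ValueCache below):
+//
+//   std::array<std::mutex, ValueMutexQty> mutexes;
+//   lockValueLocation(loc, mutexes);    // Locks mutexes[getValueLockIndex(loc)]
+//   ... read or update the ValueChunk at 'loc' ...
+//   unlockValueLocation(loc, mutexes);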
+
+inline void
+lockValueLocation(ValueLoc loc, std::array<std::mutex, ValueMutexQty>& valueMutexes)
+{
+ valueMutexes[getValueLockIndex(loc)].lock();
+}
+
+inline void
+unlockValueLocation(ValueLoc loc, std::array<std::mutex, ValueMutexQty>& valueMutexes)
+{
+ valueMutexes[getValueLockIndex(loc)].unlock();
+}
+
+class ValueCache
+{
+ // Definitions
+ enum class LruType { None = 0, Hot = 1, Warm = 2, Cold = 3, Qty = 4 };
+
+ static constexpr int SmallBatchSize = 10;
+
+ public:
+ ValueCache(uint64_t valueMaxAllocBytes) : _tlsfAlloc(valueMaxAllocBytes) {}
+
+ ~ValueCache() {}
+
+ bool setTargetMemoryLoad(double load)
+ {
+ if (load < 0. || load > 1.0) { return false; }
+ _targetMemoryLoad = load;
+ return true;
+ }
+
+ // To call when all references to this memory are no longer used
+ void reset() { _tlsfAlloc.reset(); }
+
+ bool isEnabled() const { return (getMaxAllocatableBytes() > 0); }
+
+ uint64_t getAllocatedBytes() const { return _tlsfAlloc.getAllocatedBytes(); }
+
+ uint64_t getMaxAllocatableBytes() const { return _tlsfAlloc.getMaxAllocatableBytes(); }
+
+ ValueLoc insertValue(const void* data, uint32_t size, uint64_t ownerId, uint32_t expTimeSec)
+ {
+ ++_stats.insertCallQty;
+ ValueLoc loc = NotStored;
+ uint32_t targetSize = size + sizeof(ValueChunk);
+
+ _mxMalloc.lock();
+ uint8_t* ptr = (uint8_t*)_tlsfAlloc.malloc(targetSize);
+ _mxMalloc.unlock();
+
+ // If allocation failed, some forced evictions are needed
+ if (!ptr) {
+ TlsfAllocator::CheckContext cc = _tlsfAlloc.getSizeCheckContext(targetSize); // For optimized check of "allocatability"
+
+ int remainingTries = SmallBatchSize;
+ bool isAllocatable = false;
+
+ do {
+ _mxLrus.lock();
+ if (_queues[(uint32_t)LruType::Cold].tail == NotStored) {
+ updateLruHotAndWarm(SmallBatchSize);
+ if (_queues[(uint32_t)LruType::Cold].tail == NotStored) {
+ _mxLrus.unlock();
+ break;
+ }
+ }
+
+ ValueLoc locEvict = _queues[(uint32_t)LruType::Cold].tail;
+
+ if (locEvict != NotStored) {
+ lockValueLocation(locEvict, _valueMutexes);
+ ValueChunk* c = getValueChunk(locEvict);
+ // If entry is active, move it to warm
+ if (c->flags & ValueFlagActive) {
+ lruRemove(c);
+ lruInsertFront(LruType::Warm, locEvict, c);
+ }
+ // Else we found our eviction
+ else {
+ lruRemove(c);
+ c->ownerId = 0;
+ _mxMalloc.lock();
+ _tlsfAlloc.free(c);
+ _mxMalloc.unlock();
+ ++_stats.evictedQty;
+ --_stats.currentInCacheValueQty;
+ isAllocatable = _tlsfAlloc.isAllocatable(cc); // Cheap check
+ }
+ unlockValueLocation(locEvict, _valueMutexes);
+ }
+ _mxLrus.unlock();
+
+ } while (remainingTries-- && !isAllocatable);
+
+ if (isAllocatable) {
+ _mxMalloc.lock();
+ ptr = (uint8_t*)_tlsfAlloc.malloc(targetSize);
+ _mxMalloc.unlock();
+ }
+ }
+
+ if (!ptr) { return NotStored; }
+
+ // Install the metadata before the data
+ ValueChunk* c = (ValueChunk*)ptr;
+ c->ownerId = ownerId;
+ c->expTimeSec = expTimeSec;
+ c->size = size;
+ c->flags = 0;
+ memcpy(((uint8_t*)c) + sizeof(ValueChunk), data, size);
+
+ ++_stats.currentInCacheValueQty;
+
+ // Insert this new entry inside the Hot queue
+ loc = _tlsfAlloc.compress(ptr);
+ _mxLrus.lock();
+ lruInsertFront(LruType::Hot, loc, c);
+ _mxLrus.unlock();
+
+ return loc;
+ }
+
+ bool removeValue(ValueLoc loc, uint64_t ownerId)
+ {
+ ++_stats.removeCallQty;
+ if (loc == NotStored) { return false; }
+
+ _mxLrus.lock();
+ lockValueLocation(loc, _valueMutexes);
+ ValueChunk* c = getValueChunk(loc);
+
+ // Check the ownerId and some sanity fields
+ // By calling this function, the reference to `loc` is already removed in the above layers
+ if (c->ownerId != ownerId || c->size == NotStored) {
+ 
unlockValueLocation(loc, _valueMutexes);
+ _mxLrus.unlock();
+ return false;
+ }
+
+ // Invalidate
+ lruRemove(c);
+ c->ownerId = 0;
+
+ unlockValueLocation(loc, _valueMutexes);
+ _mxLrus.unlock();
+
+ // Free
+ _mxMalloc.lock();
+ _tlsfAlloc.free(c);
+ _mxMalloc.unlock();
+ --_stats.currentInCacheValueQty;
+
+ return true;
+ }
+
+ bool getValue(ValueLoc loc, uint64_t checkOwnerId, uint32_t checkValueSize, lcVector<uint8_t>& data)
+ {
+ ++_stats.getCallQty;
+ if (loc == NotStored) { return false; }
+
+ lockValueLocation(loc, _valueMutexes);
+ ValueChunk* c = getValueChunk(loc);
+
+ // Check the ownerId and size, as this 'loc' could have been meanwhile evicted
+ if (c->ownerId != checkOwnerId || c->size != checkValueSize) {
+ unlockValueLocation(loc, _valueMutexes);
+ ++_stats.missQty;
+ return false;
+ }
+
+ // Update the LRU
+ c->flags |= ValueFlagActive;
+ ++_stats.hitQty;
+
+ // Copy data to the output
+ data.resize(c->size);
+ memcpy(data.data(), ((uint8_t*)c) + sizeof(ValueChunk), c->size);
+
+ unlockValueLocation(loc, _valueMutexes);
+ return true;
+ }
+
+ // Upkeeping background task
+ void backgroundUpdateLru(uint32_t batchSize)
+ {
+ // Loop until no more work to do or batch exhausted
+ while (batchSize > 0) {
+ _mxLrus.lock();
+ uint32_t consumed = updateLruHotAndWarm(batchSize);
+ _mxLrus.unlock();
+ batchSize = (consumed == 0 || consumed > batchSize) ? 0 : batchSize - consumed;
+ }
+ }
+
+ // Upkeeping background task
+ void backgroundPreventiveEviction(uint32_t batchSize)
+ {
+ if (!isEnabled()) { return; }
+
+ uint64_t targetAllocatedBytes = (uint64_t)(_targetMemoryLoad * (double)_tlsfAlloc.getMaxAllocatableBytes());
+
+ // Loop until no more work to do or batch exhausted
+ while ((batchSize--) > 0 && _tlsfAlloc.getAllocatedBytes() > targetAllocatedBytes) {
+ _mxLrus.lock();
+ if (_queues[(uint32_t)LruType::Cold].tail == NotStored) {
+ updateLruHotAndWarm(SmallBatchSize);
+ if (_queues[(uint32_t)LruType::Cold].tail == NotStored) {
+ _mxLrus.unlock();
+ break;
+ }
+ }
+
+ ValueLoc loc = _queues[(uint32_t)LruType::Cold].tail;
+
+ if (loc != NotStored) {
+ lockValueLocation(loc, _valueMutexes);
+ ValueChunk* c = getValueChunk(loc);
+ // If entry is active, move it to warm
+ if (c->flags & ValueFlagActive) {
+ lruRemove(c);
+ lruInsertFront(LruType::Warm, loc, c);
+ }
+ // Else we found our eviction
+ else {
+ lruRemove(c);
+ c->ownerId = 0;
+ _mxMalloc.lock();
+ _tlsfAlloc.free(c);
+ _mxMalloc.unlock();
+ ++_stats.evictedQty;
+ --_stats.currentInCacheValueQty;
+ }
+ unlockValueLocation(loc, _valueMutexes);
+ }
+ _mxLrus.unlock();
+
+ } // End of batch processing
+ }
+
+ const ValueCacheCounters& getCounters() const { return _stats; }
+
+ void dump() const
+ {
+ printf("Cache:\n");
+ printf(" insert call: %" PRId64 "\n", _stats.insertCallQty.load());
+ printf(" get call: %" PRId64 "\n", _stats.getCallQty.load());
+ printf(" remove call: %" PRId64 "\n", _stats.removeCallQty.load());
+ printf(" hit qty: %" PRId64 "\n", _stats.hitQty.load());
+ printf(" miss qty: %" PRId64 "\n", _stats.missQty.load());
+ printf(" evicted qty: %" PRId64 "\n", _stats.evictedQty.load());
+ printf(" hit ratio: %.3f\n", (double)_stats.hitQty / (double)std::max((uint64_t)1, _stats.hitQty + _stats.missQty));
+ }
+
+ private:
+ ValueChunk* getValueChunk(KeyLoc loc) const { return (ValueChunk*)_tlsfAlloc.uncompress(loc); }
+
+ // Lock shall be taken beforehand
+ void lruRemove(ValueChunk* c)
+ {
+ assert((c->flags & ValueFlagQueueTypeMask) != (uint32_t)LruType::None);
+ LruQueue& queue = _queues[c->flags & 
ValueFlagQueueTypeMask]; + c->flags = (uint32_t)LruType::None; // No more present in any LRU + + if (c->prev != NotStored) { + getValueChunk(c->prev)->next = c->next; + } else { + queue.head = c->next; + } + if (c->next != NotStored) { + getValueChunk(c->next)->prev = c->prev; + } else { + queue.tail = c->prev; + } + assert(queue.bytes >= c->size); + queue.bytes -= c->size; + } + + // Lock shall be taken beforehand + void lruInsertFront(LruType lruType, ValueLoc loc, ValueChunk* c) + { + assert(lruType != LruType::None); + assert((c->flags & ValueFlagQueueTypeMask) == (uint32_t)LruType::None); + LruQueue& queue = _queues[(uint32_t)lruType]; + + c->flags = (uint16_t)lruType; // Active flag is cleared when bumped + c->prev = NotStored; + if (queue.head != NotStored) { + getValueChunk(queue.head)->prev = loc; + c->next = queue.head; + } else { + queue.tail = loc; + c->next = NotStored; + } + queue.head = loc; + queue.bytes += c->size; + } + + uint32_t updateLruHotAndWarm(uint32_t batchSize) + { + int64_t allBytes = (int64_t)_queues[(uint32_t)LruType::Hot].bytes + (int64_t)_queues[(uint32_t)LruType::Warm].bytes + + (int64_t)_queues[(uint32_t)LruType::Cold].bytes; + ValueLoc loc = NotStored; + uint32_t consumed = 0; + + // Move from Hot to Warm or Cold + int64_t moveQty = batchSize; + int64_t moveBytes = std::max((int64_t)0, (int64_t)_queues[(uint32_t)LruType::Hot].bytes - (allBytes * 20 / 100)); + while (moveQty-- && moveBytes > 0 && (loc = _queues[(uint32_t)LruType::Hot].tail) != NotStored) { + lockValueLocation(loc, _valueMutexes); + ValueChunk* c = getValueChunk(loc); + bool isActive = (c->flags & ValueFlagActive); + lruRemove(c); + // Move it to the Warm or Cold LRU, depending on its active state + lruInsertFront(isActive ? LruType::Warm : LruType::Cold, loc, c); + moveBytes -= c->size; + ++consumed; + unlockValueLocation(loc, _valueMutexes); + } + + // Move from Warm to Warm (bumped) or Cold + moveQty = batchSize; + moveBytes = std::max((int64_t)0, (int64_t)_queues[(uint32_t)LruType::Warm].bytes - (allBytes * 40 / 100)); + while (moveQty-- && moveBytes > 0 && (loc = _queues[(uint32_t)LruType::Warm].tail) != NotStored) { + lockValueLocation(loc, _valueMutexes); + ValueChunk* c = getValueChunk(loc); + bool isActive = (c->flags & ValueFlagActive); + lruRemove(c); + // Move it to the Warm or Cold LRU, depending on its access state + lruInsertFront(isActive ? LruType::Warm : LruType::Cold, loc, c); + moveBytes -= c->size; + ++consumed; + unlockValueLocation(loc, _valueMutexes); + } + + // Note: Move from Cold to Warm is performed independently. It depends on the cache filled ratio. 
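+ // Illustrative figures (assumed, not a specification): with allBytes = 100 MB, the two
+ // loops above trim the Hot queue towards 20 MB (the 20/100 factor) and the Warm queue
+ // towards 40 MB (the 40/100 factor). Inactive entries pushed out land in Cold, while
+ // entries touched since the last pass are re-inserted at the front of Warm, so Cold
+ // converges towards the remaining ~40 MB from which evictions are taken.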
+
+ return consumed;
+ }
+
+ struct LruQueue {
+ ValueLoc head = NotStored;
+ ValueLoc tail = NotStored;
+ uint32_t bytes = 0;
+ };
+ std::mutex _mxLrus;
+ std::mutex _mxMalloc;
+ LruQueue _queues[(uint32_t)LruType::Qty];
+ double _targetMemoryLoad = 0.90;
+
+ TlsfAllocator _tlsfAlloc;
+ ValueCacheCounters _stats;
+ std::array<std::mutex, ValueMutexQty> _valueMutexes;
+};
+
+// ==========================================================================================
+// Index Hashmap
+// ==========================================================================================
+
+struct IndexChunk {
+ uint32_t keyPartSize; // Length in bytes of the tag (= part of the key)
+ uint32_t entries; // Quantity of used entries
+ // uint8_t keyPart[0] the part of the key is stored here (padded to 4 bytes), followed by the list of hash32 of related entries
+ uint32_t* getHashArrayStart() const
+ {
+ constexpr uint32_t AlignMask = (uint32_t)(sizeof(uint32_t) - 1);
+ return (uint32_t*)((uint8_t*)this + sizeof(IndexChunk) + ((keyPartSize + AlignMask) & (~AlignMask)));
+ }
+};
+
+class IndexMap
+{
+ public:
+ IndexMap(uint64_t indexMaxAllocBytes, uint32_t initMapSize) : _tlsfAlloc(indexMaxAllocBytes)
+ {
+ // Sanity: Check that the initial size is a power of 2
+ uint32_t checkSize = initMapSize;
+ while ((checkSize & 1) == 0) checkSize >>= 1;
+ assert(checkSize == 1);
+
+ // Allocate the initial map and key storage
+ resize(initMapSize);
+ }
+
+ ~IndexMap()
+ {
+ delete[] _table0.allocPtr;
+ delete[] _table1.allocPtr;
+ }
+
+ void clear()
+ {
+ for (uint32_t i = 0; i < _table0.maxSize; ++i) { _table0.nodes[i].hash = Empty; }
+ _table0.size = 0;
+ for (uint32_t i = 0; i < _table1.maxSize; ++i) { _table1.nodes[i].hash = Empty; }
+ _table1.size = 0;
+ }
+
+ uint32_t size() const { return _table0.size + _table1.size; }
+
+ uint32_t capacity() const { return std::max(_table0.maxSize, _table1.maxSize); }
+
+ bool empty() const { return (size() == 0); }
+
+ static constexpr uint64_t Empty = 0;
+ static constexpr uint64_t FirstValid = 1;
+
+ // An external writer RW-lock shall ensure 1 writer at a time
+ Status insertIndex(const void* keyPart, uint32_t keyPartSize, uint32_t entryKeyHash)
+ {
+ uint32_t keyHash = (uint32_t)LITECASK_HASH_FUNC(keyPart, keyPartSize);
+ if (keyHash < FirstValid) keyHash += FirstValid;
+ if (entryKeyHash < FirstValid) entryKeyHash += FirstValid;
+
+ Table* currentTable = _currentTable.load();
+ uint32_t mask = (currentTable->maxSize - 1) & (~(KeyDirAssocQty - 1));
+ int idx = keyHash & mask;
+ uint32_t probeIncr = 1;
+ uint32_t cellId = 0;
+
+ while (true) {
+ cellId = 0;
+ while (cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash >= FirstValid) {
+ if (currentTable->nodes[idx + cellId].hash == keyHash) {
+ IndexChunk* indexChunk = (IndexChunk*)_tlsfAlloc.uncompress(currentTable->nodes[idx + cellId].loc);
+ if (indexChunk->keyPartSize == keyPartSize &&
+ !memcmp(((uint8_t*)indexChunk) + sizeof(IndexChunk), keyPart, keyPartSize)) {
+ // Match found, entry update case
+ _optimisticsCounters[keyHash & OptCounterMask]++;
+
+ const uint32_t arrayStartOffset = (uint32_t)((uint8_t*)indexChunk->getHashArrayStart() - (uint8_t*)indexChunk);
+ const uint32_t accessibleEntrySize = (uint32_t)(_tlsfAlloc.getRealAllocatedSize(indexChunk) - arrayStartOffset);
+ if ((indexChunk->entries + 1) * sizeof(uint32_t) > accessibleEntrySize) {
+ // New allocation required because the current one is too small
+ const uint32_t newMaxEntries = 2 * indexChunk->entries;
+ uint32_t targetSize = arrayStartOffset + newMaxEntries 
* (uint32_t)sizeof(uint32_t);
+ uint8_t* ptr = (uint8_t*)_tlsfAlloc.malloc(targetSize);
+ if (ptr == nullptr) { return Status::OutOfMemory; }
+ memcpy(ptr, indexChunk, arrayStartOffset + indexChunk->entries * sizeof(uint32_t));
+ currentTable->nodes[idx + cellId].loc = _tlsfAlloc.compress(ptr);
+ _tlsfAlloc.free(indexChunk);
+ indexChunk = (IndexChunk*)ptr;
+ }
+
+ // Store the new entry
+ assert((indexChunk->entries + 1) * sizeof(uint32_t) <=
+ (uint32_t)(_tlsfAlloc.getRealAllocatedSize(indexChunk) - arrayStartOffset));
+ indexChunk->getHashArrayStart()[indexChunk->entries++] = entryKeyHash;
+
+ _optimisticsCounters[keyHash & OptCounterMask]++;
+ return Status::Ok;
+ }
+ }
+ ++cellId;
+ }
+
+ if (cellId < KeyDirAssocQty) { break; } // Empty space spotted on this cache line, so key has not been found
+ idx = (idx + (probeIncr * KeyDirAssocQty)) & mask;
+ ++probeIncr; // Between linear and quadratic probing
+ }
+
+ const uint32_t newMaxEntries = 2; // Starts with 2 elements max
+ constexpr uint32_t AlignMask = (uint32_t)(sizeof(uint32_t) - 1);
+ uint32_t targetSize = sizeof(IndexChunk) + ((keyPartSize + AlignMask) & (~AlignMask)) + newMaxEntries * sizeof(uint32_t);
+ uint8_t* ptr = (uint8_t*)_tlsfAlloc.malloc(targetSize);
+ if (ptr == nullptr) { return Status::OutOfMemory; }
+ IndexChunk* indexChunk = (IndexChunk*)ptr;
+ *indexChunk = {keyPartSize, 0};
+ memcpy(ptr + sizeof(IndexChunk), keyPart, keyPartSize);
+ indexChunk->getHashArrayStart()[indexChunk->entries++] = entryKeyHash;
+ currentTable->nodes[idx + cellId] = {keyHash, _tlsfAlloc.compress(ptr)};
+
+ currentTable->size += 1;
+ if ((uint64_t)128 * (_table0.size + _table1.size) > _maxLoadFactor128th * currentTable->maxSize) {
+ resize(2 * currentTable->maxSize);
+ }
+ return Status::Ok;
+ }
+
+ // An external reader RW-lock shall ensure that there is no edit at the same time
+ uint32_t getEntryHashes(const void* keyPart, uint16_t keyPartSize, lcVector<uint32_t>* entryHashes = nullptr)
+ {
+ uint32_t keyHash = (uint32_t)LITECASK_HASH_FUNC(keyPart, keyPartSize);
+ if (keyHash < FirstValid) keyHash += FirstValid;
+
+ Table* currentTable = _currentTable.load();
+ uint32_t mask = (currentTable->maxSize - 1) & (~(KeyDirAssocQty - 1));
+ int idx = keyHash & mask;
+ uint32_t probeIncr = 1;
+
+ while (true) {
+ uint32_t cellId = 0;
+ for (; cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash >= FirstValid; ++cellId) {
+ if (currentTable->nodes[idx + cellId].hash == keyHash) {
+ IndexChunk* indexChunk = (IndexChunk*)_tlsfAlloc.uncompress(currentTable->nodes[idx + cellId].loc);
+ if (indexChunk->keyPartSize == keyPartSize &&
+ !memcmp(((uint8_t*)indexChunk) + sizeof(IndexChunk), keyPart, keyPartSize)) {
+ if (entryHashes) {
+ entryHashes->resize(indexChunk->entries);
+ memcpy(entryHashes->data(), indexChunk->getHashArrayStart(), indexChunk->entries * sizeof(uint32_t));
+ }
+ return indexChunk->entries;
+ }
+ }
+ }
+
+ if (cellId < KeyDirAssocQty) { break; } // Empty space spotted on this cache line, so key has not been found
+ idx = (idx + (probeIncr * KeyDirAssocQty)) & mask;
+ ++probeIncr; // Between linear and quadratic probing
+ }
+
+ // Not found
+ return 0;
+ }
+
+ // An external writer RW-lock shall ensure 1 writer at a time and keep the lock as long as the returned array is used
+ bool getEntryHashesForUpdate(const void* keyPart, uint32_t keyPartSize, uint32_t** entryHashes, uint32_t** entries)
+ {
+ uint32_t keyHash = (uint32_t)LITECASK_HASH_FUNC(keyPart, keyPartSize);
+ if (keyHash < FirstValid) keyHash += FirstValid;
+
+ Table* 
currentTable = _currentTable.load(); + uint32_t mask = (currentTable->maxSize - 1) & (~(KeyDirAssocQty - 1)); + int idx = keyHash & mask; + uint32_t probeIncr = 1; + + while (true) { + uint32_t cellId = 0; + for (; cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash >= FirstValid; ++cellId) { + if (currentTable->nodes[idx + cellId].hash == keyHash) { + IndexChunk* indexChunk = (IndexChunk*)_tlsfAlloc.uncompress(currentTable->nodes[idx + cellId].loc); + if (indexChunk->keyPartSize == keyPartSize && + !memcmp(((uint8_t*)indexChunk) + sizeof(IndexChunk), keyPart, keyPartSize)) { + *entryHashes = indexChunk->getHashArrayStart(); + *entries = &(indexChunk->entries); + return true; + } + } + } + + if (cellId < KeyDirAssocQty) { break; } // Empty space spotted on this cache line, so key has not been found + idx = (idx + (probeIncr * KeyDirAssocQty)) & mask; + ++probeIncr; // Between linear and quadratic probing + } + + // Not found, which is not supposed to happen unless the index is removed in-between + return false; + } + + // Above this load factor, the KeyDir will get resized + bool setMaxLoadFactor(double f) + { + if (f <= 0. || f > 1.) return false; + _maxLoadFactor128th = (uint64_t)(128. * f); + return true; + } + + double getLoadFactor() const + { + return (double)(_table0.size + _table1.size) / (double)std::max(std::max(_table0.maxSize, _table1.maxSize), 1U); + } + + uint64_t getEstimatedUsedMemoryBytes() const + { + return sizeof(MapEntry) * (_table0.maxSize + _table1.maxSize) + _tlsfAlloc.getAllocatedBytes(); + } + + void resize(uint32_t newMaxSize) + { + // Allocate the new table + Table* newTable = (_currentTable.load() == &_table0) ? &_table1 : &_table0; + delete[] newTable->allocPtr; + newTable->allocPtr = new uint8_t[newMaxSize * sizeof(MapEntry) + detail::CpuCacheLine]; // For cache-line aligned base pointer + newTable->nodes = (MapEntry*)((((uintptr_t)newTable->allocPtr) + detail::CpuCacheLine - 1) & // NOLINT(performance-no-int-to-ptr) + (~(detail::CpuCacheLine - 1))); + memset(newTable->nodes, 0, sizeof(MapEntry) * newMaxSize); + newTable->maxSize = newMaxSize; + newTable->size = 0; + + Table* oldTable = (newTable == &_table0) ? 
&_table1 : &_table0;
+ uint32_t newMask = (newTable->maxSize - 1) & (~(KeyDirAssocQty - 1));
+
+ // Transfer the data
+ for (uint32_t oldIdx = 0; oldIdx < oldTable->maxSize; ++oldIdx) {
+ if (oldTable->nodes[oldIdx].hash < FirstValid) continue;
+
+ uint32_t newIdx = oldTable->nodes[oldIdx].hash & newMask;
+ uint32_t probeIncr = 1;
+ uint32_t cellId = 0;
+
+ while (true) {
+ cellId = 0;
+ while (cellId < KeyDirAssocQty && newTable->nodes[newIdx + cellId].hash >= FirstValid) ++cellId;
+ if (cellId < KeyDirAssocQty) { break; } // Empty space spotted on this cache line
+ newIdx = (newIdx + (probeIncr * KeyDirAssocQty)) & newMask;
+ ++probeIncr;
+ }
+
+ assert(cellId < KeyDirAssocQty && newTable->nodes[newIdx + cellId].hash < FirstValid);
+ newTable->nodes[newIdx + cellId] = oldTable->nodes[oldIdx];
+ newTable->size += 1;
+ oldTable->size -= 1; // So that the sum of the sizes is accurate
+ }
+
+ oldTable->size = 0; // Cleared table
+
+ // Swap
+ _currentTable.store(newTable);
+ }
+
+ private:
+ // Definitions
+ struct Table {
+ MapEntry* nodes = nullptr; // Cache line aligned
+ uint32_t size = 0;
+ uint32_t maxSize = 0;
+ uint8_t* allocPtr = nullptr; // May not be aligned on a cache line
+ };
+
+ // Constants
+ static constexpr int OptCounterQty = 8192;
+ static constexpr int OptCounterMask = OptCounterQty - 1;
+ static constexpr int CurrentTableNbr = (1 << 0);
+
+ // Fields
+ Table _table0;
+ Table _table1;
+ std::atomic<Table*> _currentTable = &_table1;
+ uint64_t _maxLoadFactor128th = (uint64_t)(0.90 * 128); // 90% load factor with 8-associativity is ok
+ TlsfAllocator _tlsfAlloc;
+ std::array<std::atomic<uint32_t>, OptCounterQty> _optimisticsCounters = {0};
+};
+
+// ==========================================================================================
+// KeyDir Hashmap
+// ==========================================================================================
+
+struct OldKeyChunk {
+ bool isValid = false;
+ uint32_t valueSize = 0;
+ ValueLoc cacheLocation = NotStored;
+ uint16_t fileId = 0;
+ uint16_t keyIndexQty = 0; // Index qty, and not bytes
+ KeyIndex keyIndexes[MaxKeyIndexQty];
+};
+
+class KeyDirMap
+{
+ public:
+ KeyDirMap(uint64_t keyMaxAllocBytes, uint32_t initMapSize, const std::function<void(uint32_t, bool, bool)>& notifyResizing)
+ : _notifyResizing(notifyResizing), _tlsfAlloc(keyMaxAllocBytes)
+ {
+ assert(notifyResizing);
+
+ // Sanity: Check that the initial size is a power of 2
+ uint32_t checkSize = initMapSize;
+ while ((checkSize & 1) == 0) checkSize >>= 1;
+ assert(checkSize == 1);
+
+ // Allocate the initial map and key storage
+ resize(initMapSize);
+ }
+
+ ~KeyDirMap()
+ {
+ delete[] _table0.allocPtr;
+ delete[] _table1.allocPtr;
+ }
+
+ void reset()
+ {
+ for (uint32_t i = 0; i < _table0.maxSize; ++i) { _table0.nodes[i].hash = Empty; }
+ _table0.size = 0;
+ for (uint32_t i = 0; i < _table1.maxSize; ++i) { _table1.nodes[i].hash = Empty; }
+ _table1.size = 0;
+ }
+
+ uint32_t size() const { return _table0.size + _table1.size; }
+
+ uint32_t capacity() const { return std::max(_table0.maxSize, _table1.maxSize); }
+
+ bool empty() const { return (size() == 0); }
+
+ static constexpr uint64_t Empty = 0;
+ static constexpr uint64_t FirstValid = 1;
+
+#define LITECASK_FIND_KEY_LOOP(parameterActionCode) \
+ mask = (currentTable->maxSize - 1) & (~(KeyDirAssocQty - 1)); \
+ idx = keyHash & mask; \
+ probeIncr = 1; \
+ \
+ while (true) { \
+ uint32_t cellId = 0; \
+ for (; cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash >= FirstValid; ++cellId) { \
+ if (currentTable->nodes[idx + cellId].hash == keyHash) 
{ \ + KeyChunk* keyChunk = getKey(currentTable->nodes[idx + cellId].loc); \ + if (keyChunk->keySize == keySize && !memcmp(((uint8_t*)keyChunk) + sizeof(KeyChunk), key, keySize)) { \ + if (keyChunk->expTimeSec > 0 && keyChunk->expTimeSec <= _nowTimeSec) { break; } \ + parameterActionCode; \ + } \ + } \ + } \ + if (cellId < KeyDirAssocQty) { break; } /* Empty space spotted on this cache line, so key has not been found */ \ + idx = (idx + (probeIncr * KeyDirAssocQty)) & mask; \ + ++probeIncr; /* Between linear and quadratic probing */ \ + } + +#define LITECASK_FIND_KEY_AND_DO_ACTION(parameterActionCode) \ + uint32_t mask; \ + int idx; \ + uint32_t probeIncr; \ + if (keyHash < FirstValid) keyHash += FirstValid; \ + Table* currentTable = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? &_table0 : &_table1; \ + LITECASK_FIND_KEY_LOOP(parameterActionCode); \ + if (_signalBitmap.load() & UnderResizing) { /* Try the other table */ \ + currentTable = (currentTable == &_table0) ? &_table1 : &_table0; \ + LITECASK_FIND_KEY_LOOP(parameterActionCode); \ + } + +#define LITECASK_FIND_HASH_LOOP(parameterActionCode) \ + mask = (currentTable->maxSize - 1) & (~(KeyDirAssocQty - 1)); \ + idx = keyHash & mask; \ + probeIncr = 1; \ + \ + while (true) { \ + uint32_t cellId = 0; \ + for (; cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash >= FirstValid; ++cellId) { \ + if (currentTable->nodes[idx + cellId].hash == keyHash) { \ + KeyChunk* keyChunk = getKey(currentTable->nodes[idx + cellId].loc); \ + parameterActionCode; \ + } \ + } \ + if (cellId < KeyDirAssocQty) { break; } /* Empty space spotted on this cache line, so key has not been found */ \ + idx = (idx + (probeIncr * KeyDirAssocQty)) & mask; \ + ++probeIncr; /* Between linear and quadratic probing */ \ + } + +#define LITECASK_FIND_HASH_AND_DO_ACTION(parameterActionCode) \ + uint32_t mask; \ + int idx; \ + uint32_t probeIncr; \ + if (keyHash < FirstValid) keyHash += FirstValid; \ + Table* currentTable = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? &_table0 : &_table1; \ + LITECASK_FIND_HASH_LOOP(parameterActionCode); \ + if (_signalBitmap.load() & UnderResizing) { /* Try the other table */ \ + currentTable = (currentTable == &_table0) ? 
&_table1 : &_table0; \
+ LITECASK_FIND_HASH_LOOP(parameterActionCode); \
+ }
+
+ void updateMergedValueLocation(uint32_t keyHash, uint16_t oldFileId, uint32_t oldFileOffset, uint16_t newFileId, uint32_t newFileOffset)
+ {
+ LITECASK_FIND_HASH_AND_DO_ACTION({
+ if (keyChunk->fileId == oldFileId && keyChunk->fileOffset == oldFileOffset) {
+ keyChunk->fileId = newFileId;
+ keyChunk->fileOffset = newFileOffset;
+ return;
+ }
+ // Continue looking for another entry with the same hash in the keydir
+ });
+ }
+
+ void updateCachedValueLocation(uint32_t keyHash, const void* key, uint16_t keySize, uint32_t checkValueSize, uint8_t checkChangeCounter,
+ ValueLoc newCacheLocation)
+ {
+ LITECASK_FIND_KEY_AND_DO_ACTION({
+ if (keyChunk->valueSize == checkValueSize && keyChunk->changeCounter == checkChangeCounter) {
+ keyChunk->cacheLocation = newCacheLocation;
+ }
+ return;
+ });
+ }
+
+ LITECASK_ATTRIBUTE_NO_SANITIZE_THREAD
+ bool getKeyAndIndexes(uint32_t keyHash, lcVector<uint8_t>& key, lcVector<uint8_t>& keyIndexes)
+ {
+ LITECASK_FIND_HASH_AND_DO_ACTION({
+ bool wasNotTheRightHash = false;
+ uint32_t lockCounterBefore;
+ do {
+ lockCounterBefore = _optimisticsCounters[keyHash & OptCounterMask]; // Optimistic locking
+ uint32_t valueSize = keyChunk->valueSize;
+ uint16_t keySize = keyChunk->keySize;
+ uint8_t keyIndexSize = keyChunk->keyIndexSize;
+
+ // Ensure that the sizes were read consistently (no concurrent write in-between)
+ if ((lockCounterBefore & 0x1) == 0 && (_optimisticsCounters[keyHash & OptCounterMask]) == lockCounterBefore) {
+ if (valueSize == DeletedEntry || (keyChunk->expTimeSec > 0 && keyChunk->expTimeSec <= _nowTimeSec)) {
+ wasNotTheRightHash = true; // The entry with matching hash is invalid. We continue looking for another entry.
+ continue;
+ }
+ key.resize(keySize);
+ memcpy(key.data(), ((uint8_t*)keyChunk) + sizeof(KeyChunk), keySize);
+ keyIndexes.resize(keyIndexSize);
+ if (keyIndexSize) { memcpy(keyIndexes.data(), ((uint8_t*)keyChunk) + sizeof(KeyChunk) + keySize, keyIndexSize); }
+ }
+ } while ((lockCounterBefore & 0x1) ||
+ (_optimisticsCounters[keyHash & OptCounterMask]) != lockCounterBefore); // Odd means under write
+ if (wasNotTheRightHash) { continue; } // Continue looking for another entry in the keydir
+ return true;
+ });
+ return false;
+ }
+
+ LITECASK_ATTRIBUTE_NO_SANITIZE_THREAD
+ bool cleanIndex(uint32_t keyHash, const void* keyPart, uint16_t keyPartSize)
+ {
+ LITECASK_FIND_HASH_AND_DO_ACTION({
+ if (keyChunk->valueSize != DeletedEntry && (keyChunk->expTimeSec == 0 || keyChunk->expTimeSec > _nowTimeSec)) {
+ continue; // The entry is valid, so no cleaning needed. We continue looking for an entry to clean.
+ }
+
+ // If the entry is not deleted and the key part is found, then it shall be kept (also in the index).
+ // Else, it shall be removed (if needed) and the index updated.
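+ // Chunk layout reminder (illustrative key and values): for a key "user:42:profile"
+ // indexed on its first 4 bytes, the chunk stores the key bytes followed by one
+ // KeyIndex{startIdx=0, size=4}; the loop below walks this trailing KeyIndex array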
+ uint8_t* key = (uint8_t*)keyChunk + sizeof(KeyChunk);
+ uint8_t* kiArray = key + keyChunk->keySize;
+ for (int kIdx = 0; kIdx < (int)keyChunk->keyIndexSize; kIdx += sizeof(KeyIndex)) {
+ KeyIndex& ki = *(KeyIndex*)(kiArray + kIdx);
+ if (ki.size != keyPartSize || memcmp(&key[ki.startIdx], keyPart, keyPartSize)) { continue; }
+ // Key part is present
+ // Case obsolete entry: the key part is removed, shifting the following KeyIndex entries
+ // down by one slot to keep the order ((&ki) + 1 is the next entry)
+ if (kIdx + sizeof(KeyIndex) < (int)keyChunk->keyIndexSize) {
+ memmove(&ki, (&ki) + 1, ((int)keyChunk->keyIndexSize - (kIdx + sizeof(KeyIndex))));
+ }
+ assert(keyChunk->keyIndexSize >= (uint8_t)sizeof(KeyIndex));
+ keyChunk->keyIndexSize -= (uint8_t)sizeof(KeyIndex);
+ return true; // Key part found and removed from the entry. The index definitely needs cleaning
+ }
+
+ continue; // Key part is not found in this entry. We continue looking for an entry to clean
+ });
+ return true; // No hash matching entry with such key part found. The index definitely needs cleaning
+ }
+
+ LITECASK_ATTRIBUTE_NO_SANITIZE_THREAD
+ bool find(uint32_t keyHash, const void* key, uint16_t keySize, KeyChunk& entry)
+ {
+ LITECASK_FIND_KEY_AND_DO_ACTION({
+ if (_isInstrumentationEnable) {
+ if (probeIncr > _instrumentedProbeMax) { _instrumentedProbeMax = probeIncr; }
+ _instrumentedProbeSum += probeIncr;
+ ++_instrumentedFindCount;
+ }
+ uint32_t lockCounterBefore;
+ do {
+ lockCounterBefore = _optimisticsCounters[keyHash & OptCounterMask]; // Optimistic locking
+ entry = *keyChunk;
+ } while ((lockCounterBefore & 0x1) ||
+ (_optimisticsCounters[keyHash & OptCounterMask]) != lockCounterBefore); // Odd means under write
+ return true;
+ });
+ return false;
+ }
+
+ bool invalidateExpiredTtl(uint32_t keyDirIndex, uint32_t& keyHash, uint32_t& keySize, uint32_t& oldValueSize, uint16_t& oldFileId,
+ ValueLoc& oldCacheLoc)
+ {
+ // Checks only in current table. An external lock shall ensure 1 writer at a time
+ Table* currentTable = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? &_table0 : &_table1;
+
+ // Ensure that the entry is expired (whatever the entry), as indicated by the previous probing
+ MapEntry& kde = currentTable->nodes[keyDirIndex];
+ if (kde.hash < FirstValid) { return false; }
+ KeyChunk* keyChunk = getKey(kde.loc);
+ if (keyChunk->valueSize == DeletedEntry) { return false; }
+
+ _optimisticsCounters[keyHash & OptCounterMask]++;
+
+ // Copy some old values
+ keyHash = kde.hash;
+ keySize = keyChunk->keySize;
+ oldValueSize = keyChunk->valueSize;
+ oldFileId = keyChunk->fileId;
+ oldCacheLoc = keyChunk->cacheLocation;
+
+ // Invalidate the metadata chunk
+ keyChunk->expTimeSec = 0;
+ keyChunk->valueSize = DeletedEntry;
+ keyChunk->cacheLocation = NotStored;
+
+ _optimisticsCounters[keyHash & OptCounterMask]++;
+ return true;
+ }
+
+ // Returns Status::Ok when the entry is successfully stored. May fail if OOM
+ LITECASK_ATTRIBUTE_NO_SANITIZE_THREAD
+ Status insertEntry(uint32_t keyHash, const void* key, const void* keyIndexes, const KeyChunk& entry, OldKeyChunk& oldEntry)
+ {
+ if (keyHash < FirstValid) keyHash += FirstValid;
+ oldEntry.isValid = false;
+
+ // Insert only in current table. An external lock shall ensure 1 writer at a time
+ Table* currentTable = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? 
&_table0 : &_table1; + uint32_t mask = (currentTable->maxSize - 1) & (~(KeyDirAssocQty - 1)); + int idx = keyHash & mask; + uint32_t probeIncr = 1; + uint32_t cellId = 0; + + while (true) { + cellId = 0; + while (cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash >= FirstValid) { + if (currentTable->nodes[idx + cellId].hash == keyHash) { + KeyChunk* keyChunk = getKey(currentTable->nodes[idx + cellId].loc); + if (keyChunk->keySize == entry.keySize && !memcmp(((uint8_t*)keyChunk) + sizeof(KeyChunk), key, entry.keySize)) { + // Match found, entry update case + _optimisticsCounters[keyHash & OptCounterMask]++; + + oldEntry.isValid = true; + oldEntry.valueSize = keyChunk->valueSize; + oldEntry.cacheLocation = keyChunk->cacheLocation; + oldEntry.fileId = keyChunk->fileId; + oldEntry.keyIndexQty = (uint16_t)((int)keyChunk->keyIndexSize / sizeof(KeyIndex)); + if (oldEntry.keyIndexQty) { + memcpy(&oldEntry.keyIndexes, ((uint8_t*)keyChunk) + sizeof(KeyChunk) + keyChunk->keySize, + keyChunk->keyIndexSize); + } + Status storageStatus = updateKey(key, keyIndexes, entry, currentTable->nodes[idx + cellId].loc); + + _optimisticsCounters[keyHash & OptCounterMask]++; + return storageStatus; + } + } + ++cellId; + } + if (cellId < KeyDirAssocQty) { break; } /* Empty space spotted on this cache line, so key has not been found */ + idx = (idx + (probeIncr * KeyDirAssocQty)) & mask; + ++probeIncr; // Between linear and quadratic probing + } + + KeyLoc keyLoc = NotStored; + Status storageStatus = insertKey(key, keyIndexes, entry, keyLoc); + if (storageStatus != Status::Ok) { return storageStatus; } // Failure to store the key due to OOM or too big key + // No need for protection vs "get" as there is no key removal API (tombstone instead) + assert(cellId < KeyDirAssocQty && currentTable->nodes[idx + cellId].hash < FirstValid); + currentTable->nodes[idx + cellId] = {keyHash, keyLoc}; + + currentTable->size += 1; + if ((uint64_t)128 * (_table0.size + _table1.size) > _maxLoadFactor128th * currentTable->maxSize) { + resize(2 * currentTable->maxSize); + } + return Status::Ok; + } + + void resize(uint32_t newMaxSize) + { + // Emergency case: if the next resize arrives and the previous is not finished, just force-finish it + constexpr uint32_t EmergencyBatchSize = 1'000'000; + if (isResizingOngoing()) { + while (isResizingOngoing()) { backgroundResizeWork(EmergencyBatchSize, true); } + } + + // Allocate the new table + Table* newTable = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? 
&_table1 : &_table0; + delete[] newTable->allocPtr; + newTable->allocPtr = new uint8_t[newMaxSize * sizeof(MapEntry) + detail::CpuCacheLine]; // For cache-line aligned base pointer + newTable->nodes = (MapEntry*)((((uintptr_t)newTable->allocPtr) + detail::CpuCacheLine - 1) & // NOLINT(performance-no-int-to-ptr) + (~(detail::CpuCacheLine - 1))); + memset(newTable->nodes, 0, sizeof(MapEntry) * newMaxSize); + newTable->maxSize = newMaxSize; + newTable->size = 0; + + // Start the background resizing process + _resizeNextIdx = 0; + if (_table0.size != 0 || _table1.size != 0) { + _signalBitmap.store(UnderResizing | (_signalBitmap.load() ^ CurrentTableNbr)); + _notifyResizing(newMaxSize, true, false); // Notify the start of the resizing job + } else { + // Construction time (empty tables) + _signalBitmap.store(_signalBitmap.load() ^ CurrentTableNbr); + } + } + + // Note: writer lock is expected to be taken + void backgroundResizeWork(uint32_t batchSize, bool wasForced = false) + { + assert(batchSize > 0); + if (!isResizingOngoing()) { return; } + + Table* oldTable = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? &_table1 : &_table0; + Table* newTable = (oldTable == &_table0) ? &_table1 : &_table0; + uint32_t newMask = (newTable->maxSize - 1) & (~(KeyDirAssocQty - 1)); + uint32_t lastOldIdx = std::min(_resizeNextIdx + batchSize, oldTable->maxSize); + + // Transfer a batch of data + for (uint32_t oldIdx = _resizeNextIdx; oldIdx < lastOldIdx; ++oldIdx) { + if (oldTable->nodes[oldIdx].hash < FirstValid) continue; + + uint32_t newIdx = oldTable->nodes[oldIdx].hash & newMask; + uint32_t probeIncr = 1; + uint32_t cellId = 0; + + while (true) { + cellId = 0; + while (cellId < KeyDirAssocQty && newTable->nodes[newIdx + cellId].hash >= FirstValid) ++cellId; + if (cellId < KeyDirAssocQty) { break; } // Empty space spotted on this cache line + newIdx = (newIdx + (probeIncr * KeyDirAssocQty)) & newMask; + ++probeIncr; + } + + assert(cellId < KeyDirAssocQty && newTable->nodes[newIdx + cellId].hash < FirstValid); + newTable->nodes[newIdx + cellId] = oldTable->nodes[oldIdx]; + newTable->size += 1; + oldTable->size -= 1; // So that the sum of the sizes is accurate + } + + _resizeNextIdx = lastOldIdx; + if (lastOldIdx == oldTable->maxSize) { + oldTable->size = 0; // Cleared table + _notifyResizing(newTable->maxSize, false, wasForced); // Notify the end of the resizing job + _signalBitmap.store((~UnderResizing) & _signalBitmap.load()); + } + } + + uint32_t backgroundExpiredKeyCleaning(uint32_t& batchSize) + { + assert(batchSize > 0); + + Table* table = ((_signalBitmap.load() & CurrentTableNbr) == 0) ? &_table0 : &_table1; + uint32_t lastIdx = std::min(_ttlNextIdx + batchSize, table->maxSize); + + // Analyze a batch of data + for (uint32_t idx = _ttlNextIdx; idx < lastIdx; ++idx, --batchSize) { + // Skip entries which are empty, without TTL, or not expired TTL + if (table->nodes[idx].hash < FirstValid) { continue; } + KeyChunk* keyChunk = getKey(table->nodes[idx].loc); + if (keyChunk->valueSize == DeletedEntry || keyChunk->expTimeSec == 0 || keyChunk->expTimeSec > _nowTimeSec) { continue; } + + // Return the entry to remove, after checking again (under lock) that the TTL is indeed expired + // The data race condition is harmless, just a little waste of CPU if this probing is defeated later + _ttlNextIdx = (idx + 1 >= table->maxSize) ? 0 : idx + 1; + return idx; + } + + _ttlNextIdx = (lastIdx >= table->maxSize) ? 
0 : lastIdx; + batchSize = 0; + return NotStored; + } + + bool isResizingOngoing() const { return (_signalBitmap.load() & UnderResizing); } + + // Above this load factor, the KeyDir will get resized + bool setMaxLoadFactor(double f) + { + if (f <= 0. || f > 1.) return false; + _maxLoadFactor128th = (uint64_t)(128. * f); + return true; + } + + double getLoadFactor() const + { + return (double)(_table0.size + _table1.size) / (double)std::max(std::max(_table0.maxSize, _table1.maxSize), 1U); + } + + uint64_t getEstimatedUsedMemoryBytes() const + { + return sizeof(MapEntry) * (_table0.maxSize + _table1.maxSize) + _tlsfAlloc.getAllocatedBytes(); + } + + // Only enabled in tests, to analyse the probing of the hash table when reading some entries + void setInstrumentationEnable(bool isEnable) { _isInstrumentationEnable = isEnable; } + + // It returns the monotonic counters of probe count and find API call count, and the max probe count per call since the last + // getProbeCount call + void getProbeCount(uint64_t& probeMax, uint64_t& probeSum, uint64_t& findCount) + { + probeMax = _instrumentedProbeMax; + _instrumentedProbeMax = 0; + probeSum = _instrumentedProbeSum; + findCount = _instrumentedFindCount; + } + + void setNow(uint32_t nowTimeSec) { _nowTimeSec = nowTimeSec; } + + private: + KeyChunk* getKey(KeyLoc loc) const { return (KeyChunk*)_tlsfAlloc.uncompress(loc); } + + Status insertKey(const void* key, const void* keyIndexes, const KeyChunk& entry, KeyLoc& loc) + { + uint32_t targetSize = (uint32_t)(sizeof(KeyChunk) + entry.keySize + entry.keyIndexSize); + uint8_t* ptr = (uint8_t*)_tlsfAlloc.malloc(targetSize); + if (ptr == nullptr) { return Status::OutOfMemory; } + loc = _tlsfAlloc.compress(ptr); + KeyChunk* c = (KeyChunk*)ptr; + *c = entry; + c->changeCounter += 1; + memcpy(ptr + sizeof(KeyChunk), key, entry.keySize); + if (entry.keyIndexSize) { memcpy(ptr + sizeof(KeyChunk) + entry.keySize, keyIndexes, entry.keyIndexSize); } + return Status::Ok; + } + + Status updateKey(const void* key, const void* keyIndexes, const KeyChunk& entry, KeyLoc& locToUpdate) + { + KeyChunk* keyChunk = getKey(locToUpdate); + + uint32_t accessibleKeyIndexSize = (uint32_t)(_tlsfAlloc.getRealAllocatedSize(keyChunk) - sizeof(KeyChunk) - entry.keySize); + if (entry.keyIndexSize > accessibleKeyIndexSize) { + // New allocation required because the current one is too small + KeyLoc newKeyLoc = NotStored; + Status storageStatus = insertKey(key, keyIndexes, entry, newKeyLoc); + if (storageStatus != Status::Ok) { return storageStatus; } // Failure to store the key due to OOM + KeyLoc oldKeyLoc = locToUpdate; + locToUpdate = newKeyLoc; + + // Invalidate and free the old chunk + memset(keyChunk, 0, sizeof(KeyChunk)); + keyChunk->cacheLocation = NotStored; + keyChunk->fileOffset = NotStored; + keyChunk->fileId = 0xFFFF; + _tlsfAlloc.free(_tlsfAlloc.uncompress(oldKeyLoc)); + } else if (entry.valueSize == DeletedEntry) { + // For deletion, we keep the previous key index. Setting back the size is enough. + // This is needed to avoid multiple entries in the index lookups if this entry is added back later + // with some matching key indexes. 
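+ // Hypothetical sequence: put(k, v, {ki}); remove(k); put(k, v2, {ki}). Because the
+ // tombstoned chunk keeps its KeyIndex array, the second put can detect that 'ki' is
+ // already registered for this key and avoid adding a duplicate entry in the index lookups.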
+ uint8_t oldKeyIndexSize = keyChunk->keyIndexSize;
+ *keyChunk = entry;
+ keyChunk->keyIndexSize = oldKeyIndexSize;
+ keyChunk->changeCounter += 1;
+ } else {
+ *keyChunk = entry;
+ keyChunk->changeCounter += 1;
+ if (entry.keyIndexSize) { memcpy(((uint8_t*)keyChunk) + sizeof(KeyChunk) + entry.keySize, keyIndexes, entry.keyIndexSize); }
+ }
+ return Status::Ok;
+ }
+
+ // Definitions
+ struct Table {
+ MapEntry* nodes = nullptr; // Cache line aligned
+ uint32_t size = 0;
+ uint32_t maxSize = 0;
+ uint8_t* allocPtr = nullptr; // May not be aligned on a cache line
+ };
+
+ // Constants
+ static constexpr int OptCounterQty = 8192;
+ static constexpr int OptCounterMask = OptCounterQty - 1;
+ static constexpr int CurrentTableNbr = (1 << 0);
+ static constexpr int UnderResizing = (1 << 1);
+
+ // Fields
+ alignas(CpuCacheLine) Table _table0;
+ Table _table1;
+ uint64_t _maxLoadFactor128th = (uint64_t)(0.90 * 128); // 90% load factor with 8-associativity is ok
+ std::atomic<uint32_t> _signalBitmap = 0; // Table 0 and not resizing
+ uint32_t _resizeNextIdx = 0;
+ uint32_t _ttlNextIdx = 0;
+ uint32_t _nowTimeSec = 0;
+
+ alignas(CpuCacheLine) std::array<std::atomic<uint32_t>, OptCounterQty> _optimisticsCounters = {0};
+ std::function<void(uint32_t, bool, bool)> _notifyResizing;
+
+ TlsfAllocator _tlsfAlloc;
+ bool _isInstrumentationEnable = false;
+ uint64_t _instrumentedProbeMax = 0;
+ uint64_t _instrumentedProbeSum = 0;
+ uint64_t _instrumentedFindCount = 0;
+};
+
+} // namespace detail
+
+// ==========================================================================================
+// Datastore
+// ==========================================================================================
+
+class Datastore // NOLINT(clang-analyzer-optin.performance.Padding) Padding is not optimal due to the alignas directives
+{
+ public:
+ Datastore(size_t cacheBytes = 256 * 1024 * 1024)
+ {
+ _upkeepLastActiveFlushedTimeMs =
+ std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
+ setLogHandler({}); // Install the default handler
+
+ constexpr uint32_t initialMapSize = 16 * 1024;
+ _writeBuffer.resize(detail::DefaultWriteBufferBytes);
+ _keyDir = new detail::KeyDirMap(detail::KeyStorageAllocBytes, initialMapSize, [&](uint32_t newSize, bool isStart, bool wasForced) {
+ notifyKeyDirResizing(newSize, isStart, wasForced);
+ });
+ _valueCache = new detail::ValueCache((uint64_t)cacheBytes);
+ _indexMap = new detail::IndexMap(detail::KeyStorageAllocBytes, initialMapSize);
+
+ updateNow();
+ }
+
+ ~Datastore()
+ {
+ if (_isInitialized) { close(); }
+ delete _keyDir;
+ delete _valueCache;
+ delete _indexMap;
+ }
+
+ // Observability
+ // ==========================================================================================
+
+ void dumpFd(bool withIndex = false)
+ {
+ using namespace litecask::detail;
+ _mxDataFiles.lockRead();
+ for (uint32_t fileId = 0; fileId < _dataFiles.size(); ++fileId) {
+ DataFile* dfd = _dataFiles[fileId];
+ dfd->dump(withIndex ? 
fileId : -1, fileId == _activeDataFileId);
+ }
+ _mxDataFiles.unlockRead();
+ }
+
+ DataFileStats getFileStats() const
+ {
+ using namespace litecask::detail;
+ DataFileStats stats;
+ _mxDataFiles.lockRead();
+
+ for (uint32_t fileId = 0; fileId < _dataFiles.size(); ++fileId) {
+ DataFile* dfd = _dataFiles[fileId];
+ if (!osIsValidHandle(dfd->handle)) continue;
+
+ stats.fileQty += 1;
+ stats.entries += dfd->entries;
+ stats.entryBytes += dfd->bytes;
+ stats.tombBytes += dfd->tombBytes;
+ stats.tombEntries += dfd->tombEntries;
+ stats.deadBytes += dfd->deadBytes;
+ stats.deadEntries += dfd->deadEntries;
+ }
+
+ _mxDataFiles.unlockRead();
+ return stats;
+ }
+
+ Config getConfig() const { return _config; }
+
+ const DatastoreCounters& getCounters() const { return _stats; }
+
+ const ValueCacheCounters& getValueCacheCounters() const { return _valueCache->getCounters(); }
+
+ uint64_t getValueCacheAllocatedBytes() const { return _valueCache->getAllocatedBytes(); }
+
+ uint64_t getValueCacheMaxAllocatableBytes() const { return _valueCache->getMaxAllocatableBytes(); }
+
+ // Returns an estimate of the memory usage
+ uint64_t getEstimatedUsedMemoryBytes(bool withCache = false) const
+ {
+ uint64_t usedMem = 0;
+ _mxDataFiles.lockRead();
+ usedMem += sizeof(detail::DataFile) * _dataFiles.size(); // Data files storage (small)
+ _mxDataFiles.unlockRead();
+ usedMem += _keyDir->getEstimatedUsedMemoryBytes(); // KeyDirMap (big)
+ usedMem += _indexMap->getEstimatedUsedMemoryBytes(); // Index Map (may be big, depends on index usage)
+ usedMem += _writeBuffer.size() * sizeof(uint8_t); // Write buffers (small)
+ if (withCache) {
+ usedMem += _valueCache->getAllocatedBytes(); // Value cache storage (depends on config)
+ }
+ return usedMem;
+ }
+
+ bool setLogLevel(LogLevel level)
+ {
+ if (level >= LogLevel::Debug && level <= LogLevel::None) {
+ _logLevel = level;
+ return true;
+ }
+ return false;
+ }
+
+ void setLogHandler(const std::function<void(LogLevel, const char*, bool)>& logHandler)
+ {
+ if (logHandler) {
+ _logHandler = logHandler;
+ } else {
+ _logHandler = [&](LogLevel level, const char* message, bool closeDbNotification) {
+ defaultLogHandler(level, message, closeDbNotification);
+ };
+ }
+ }
+
+ static const char* toString(Status status)
+ {
+ switch (status) {
+ case Status::Ok:
+ return "Ok";
+ case Status::StoreNotOpen:
+ return "datastore is not open";
+ case Status::StoreAlreadyOpen:
+ return "datastore is already open";
+ case Status::BadDiskAccess:
+ return "bad disk access";
+ case Status::CannotOpenStore:
+ return "cannot access the datastore directory path";
+ case Status::StoreAlreadyInUse:
+ return "datastore already open and locked by another process";
+ case Status::BadKeySize:
+ return "key size is out of bounds";
+ case Status::InconsistentKeyIndex:
+ return "key indexes are inconsistent";
+ case Status::UnorderedKeyIndex:
+ return "key indexes are not ordered";
+ case Status::BadValueSize:
+ return "value size is out of bounds";
+ case Status::EntryNotFound:
+ return "entry has not been found";
+ case Status::EntryCorrupted:
+ return "entry is corrupted";
+ case Status::BadParameterValue:
+ return "bad parameter value";
+ case Status::InconsistentParameterValues:
+ return "inconsistent parameter values";
+ case Status::OutOfMemory:
+ return "operation failed due to out of memory";
+ default:
+ return "UNKNOWN";
+ }
+ }
+
+ // Configuration
+ // ==========================================================================================
+
+ // Defines the buffer size to write entries and batch the costly disk 
access. + // This optimization is effective as each disk write system call has a fixed base cost. + // Too big a size may create spikes of latency when flushing the buffer, too small a size reduces write throughput performance. + // In practice small values are enough to amortize the system calls + Status setWriteBufferBytes(uint32_t writeBufferBytes) + { + _mxWriteBuffer.lockWrite(); + flushWriteBufferUnlocked(); + _writeBuffer.resize(writeBufferBytes); + _mxWriteBuffer.unlockWrite(); + return Status::Ok; + } + + Status setConfig(const Config& config) + { + if (config.dataFileMaxBytes < detail::MinDataFileMaxBytes) { + log(LogLevel::Warn, "setConfig: too small 'dataFileMaxBytes' parameter value. Shall be above %d", detail::MinDataFileMaxBytes); + return Status::BadParameterValue; + } + if (config.mergeCyclePeriodMs == 0) { + log(LogLevel::Warn, "setConfig: 'mergeCyclePeriodMs' shall be a positive integer."); + return Status::BadParameterValue; + } + if (config.upkeepCyclePeriodMs == 0) { + log(LogLevel::Warn, "setConfig: 'upkeepCyclePeriodMs' shall be a positive integer."); + return Status::BadParameterValue; + } + if (config.upkeepKeyDirBatchSize == 0) { + log(LogLevel::Warn, "setConfig: 'upkeepKeyDirBatchSize' shall be a positive integer."); + return Status::BadParameterValue; + } + if (config.upkeepValueCacheBatchSize == 0) { + log(LogLevel::Warn, "setConfig: 'upkeepValueCacheBatchSize' shall be a positive integer."); + return Status::BadParameterValue; + } + if (config.valueCacheTargetMemoryLoadPercentage > 100) { + log(LogLevel::Warn, "setConfig: 'valueCacheTargetMemoryLoadPercentage' shall be in the range [0; 100]"); + return Status::BadParameterValue; + } + if (config.mergeTriggerDataFileFragmentationPercentage < 1 || config.mergeTriggerDataFileFragmentationPercentage > 100) { + log(LogLevel::Warn, "setConfig: 'mergeTriggerDataFileFragmentationPercentage' shall be in the range ]0; 100]."); + return Status::BadParameterValue; + } + if (config.mergeTriggerDataFileDeadByteThreshold > config.dataFileMaxBytes) { + log(LogLevel::Warn, + "setConfig: too big 'mergeTriggerDataFileDeadByteThreshold' parameter value. Shall be below dataFileMaxBytes=%d", + config.dataFileMaxBytes); + return Status::InconsistentParameterValues; + } + if (config.mergeSelectDataFileFragmentationPercentage < 1 || config.mergeSelectDataFileFragmentationPercentage > 100) { + log(LogLevel::Warn, "setConfig: 'mergeSelectDataFileFragmentationPercentage' shall be in the range ]0; 100]."); + return Status::BadParameterValue; + } + if (config.mergeSelectDataFileFragmentationPercentage > config.mergeTriggerDataFileFragmentationPercentage) { + log(LogLevel::Warn, + "setConfig: too big 'mergeSelectDataFileFragmentationPercentage' parameter value. Shall be below " + "mergeTriggerDataFileFragmentationPercentage=%d", + config.mergeTriggerDataFileFragmentationPercentage); + return Status::InconsistentParameterValues; + } + if (config.mergeSelectDataFileDeadByteThreshold > config.mergeTriggerDataFileDeadByteThreshold) { + log(LogLevel::Warn, + "setConfig: too big 'mergeSelectDataFileDeadByteThreshold' parameter value. Shall be below " + "mergeTriggerDataFileDeadByteThreshold=%d", + config.mergeTriggerDataFileDeadByteThreshold); + return Status::InconsistentParameterValues; + } + if (config.mergeSelectDataFileSmallSizeTheshold < detail::MinDataFileMaxBytes) { + log(LogLevel::Warn, "setConfig: too small 'mergeSelectDataFileSmallSizeTheshold' parameter value. 
Shall be above %d",
+ detail::MinDataFileMaxBytes);
+ return Status::BadParameterValue;
+ }
+
+ // Accepted config
+ _mxConfig.lock();
+ _config = config;
+ _dataFileMaxBytes = (uint64_t)config.dataFileMaxBytes; // Harmless data race (integrity is ensured)
+ _valueCache->setTargetMemoryLoad(0.01 * config.valueCacheTargetMemoryLoadPercentage);
+ _mxConfig.unlock();
+ return Status::Ok;
+ }
+
+ // Open and close
+ // ==========================================================================================
+
+ Status open(fs::path dbDirectoryPath, bool doCreateIfNotExist = true)
+ {
+ using namespace litecask::detail;
+ if (_isInitialized) {
+ ++_stats.openCallFailedQty;
+ log(LogLevel::Error, "'open' failed: %s", toString(Status::StoreAlreadyOpen));
+ return Status::StoreAlreadyOpen;
+ }
+
+ // Normalize the input directory path (ensure a trailing separator)
+ std::error_code ec;
+ dbDirectoryPath /= "";
+ if (!fs::exists(dbDirectoryPath) && doCreateIfNotExist) { fs::create_directories(dbDirectoryPath, ec); }
+ if (!fs::exists(dbDirectoryPath) || !fs::is_directory(dbDirectoryPath, ec)) {
+ ++_stats.openCallFailedQty;
+ log(LogLevel::Error, "'open' failed: %s", toString(Status::CannotOpenStore));
+ return Status::CannotOpenStore;
+ }
+
+ // Lock the database via a lock file
+ Status s = lockDatabase(dbDirectoryPath);
+ if (s != Status::Ok) {
+ ++_stats.openCallFailedQty;
+ log(LogLevel::Error, "'open' failed: unable to lock the datastore, %s.", toString(s));
+ return s;
+ }
+
+ lcVector<lcString> baseDataFilenames;
+ s = sanitizeAndCollectDataFiles(dbDirectoryPath, _maxDataFileIndex, baseDataFilenames);
+ if (s != Status::Ok) {
+ ++_stats.openCallFailedQty;
+ log(LogLevel::Error, "'open' failed: unable to clean the datastore, %s.", toString(s));
+ return s;
+ }
+
+ if (!doCreateIfNotExist && baseDataFilenames.empty()) {
+ ++_stats.openCallFailedQty;
+ log(LogLevel::Error, "'open' failed because there is no datastore at the provided path.");
+ unlockDatabase(dbDirectoryPath);
+ return Status::CannotOpenStore;
+ }
+
+ // Reset all fields
+ _directory = dbDirectoryPath;
+ _keyDir->reset();
+ _valueCache->reset();
+ for (detail::DataFile* dfd : _dataFiles) delete dfd;
+ _dataFiles.clear();
+ _freeDataFileIds.clear();
+ _activeDataOffset = 0;
+ _activeFlushedDataOffset = 0;
+ _activeDataFileId = 0xFFFF;
+ _mergeWork.store(false);
+ _mergeExit.store(false);
+ _upkeepWork.store(false);
+ _upkeepExit.store(false);
+ _someHintFilesAreMissing = false;
+ _upkeepLastActiveFlushedDataOffset = NotStored;
+ _upkeepLastActiveDataFileId = 0xFFFF;
+ updateNow();
+
+ lcVector<LoadedKeyChunk> keyDirEntries;
+ keyDirEntries.reserve(16384);
+ ArenaAllocator loadArena;
+
+ // Loop on data files to load
+ for (const auto& baseDataFilename : baseDataFilenames) {
+ uint16_t fileId = getFreeDataFileIdUnlocked();
+ loadArena.reset();
+
+ // Check for hint file
+ if (!loadHintFile(baseDataFilename + HintFileSuffix, fileId, loadArena, keyDirEntries)) {
+ // Hint file failed or does not exist, let's load directly the data file
+ _someHintFilesAreMissing = true;
+ if (!loadDataFile(baseDataFilename + DataFileSuffix, fileId, loadArena, keyDirEntries)) {
+ ++_stats.openCallFailedQty;
+ log(LogLevel::Error, "'open' failed: unable to read the datastore.");
+ return Status::CannotOpenStore;
+ }
+ }
+
+ // Create the data file descriptor
+ DataFile* newFd = _dataFiles[fileId];
+ newFd->filename = baseDataFilename + DataFileSuffix;
+ newFd->handle = osOsOpen(newFd->filename, OsOpenMode::READ);
+ assert(osIsValidHandle(newFd->handle));
+ KeyChunk entryToErase;
+ OldKeyChunk oldEntry;
+
+ // 
Populate the key directory + for (uint32_t entryIdx = 0; entryIdx < keyDirEntries.size(); ++entryIdx) { + const LoadedKeyChunk& entry = keyDirEntries[entryIdx]; + uint16_t keySize = entry.metadata.keySize; + + if (entry.metadata.valueSize == DeletedEntry) { + // Deletion case + if (_keyDir->find(entry.keyHash, entry.key, entry.metadata.keySize, entryToErase)) { + // As there is an entry in the KeyDir, a tombstone is required instead + _dataFiles[entryToErase.fileId]->deadBytes += + sizeof(DataFileEntry) + + ((entryToErase.valueSize == DeletedEntry) ? keySize : (keySize + entryToErase.valueSize)); + _dataFiles[entryToErase.fileId]->deadEntries += 1; + _keyDir->insertEntry(entry.keyHash, entry.key, entry.keyIndexes, entry.metadata, oldEntry); + } else { + // No entry present in KeyDir, so no tombstone added there + newFd->deadBytes += sizeof(DataFileEntry) + keySize; + newFd->deadEntries += 1; + } + newFd->tombBytes += (uint32_t)sizeof(DataFileEntry) + keySize; + newFd->tombEntries += 1; + newFd->bytes += (uint32_t)sizeof(DataFileEntry) + keySize; + newFd->entries += 1; + } + + else if (entry.metadata.expTimeSec == 0 || entry.metadata.expTimeSec > _nowTimeSec) { + // Value case + if (_keyDir->insertEntry(entry.keyHash, entry.key, entry.keyIndexes, entry.metadata, oldEntry) == Status::Ok && + oldEntry.isValid) { + // Replace an entry: update file descriptors + _dataFiles[oldEntry.fileId]->deadBytes += + sizeof(DataFileEntry) + ((oldEntry.valueSize == DeletedEntry) ? keySize : (keySize + oldEntry.valueSize)); + _dataFiles[oldEntry.fileId]->deadEntries += 1; + } + newFd->bytes += (uint32_t)sizeof(DataFileEntry) + keySize + entry.metadata.valueSize; + newFd->entries += 1; + } + + else { + // Expired TTL case + newFd->deadBytes += (uint32_t)sizeof(DataFileEntry) + keySize + entry.metadata.valueSize; + newFd->deadEntries += 1; + newFd->bytes += (uint32_t)sizeof(DataFileEntry) + keySize + entry.metadata.valueSize; + newFd->entries += 1; + } + } + } + + // Finalize + createNewActiveDataFileUnlocked(); + _mergeThread = std::thread(&Datastore::mergeThreadEntry, this); + _upkeepThread = std::thread(&Datastore::upkeepThreadEntry, this); + _isInitialized = true; + ++_stats.openCallQty; + log(LogLevel::Info, "Datastore successfully opened"); + return Status::Ok; + } + + Status close() + { + using namespace litecask::detail; + + if (!_isInitialized) { + ++_stats.closeCallFailedQty; + log(LogLevel::Error, "'close' failed: %s", toString(Status::StoreNotOpen)); + return Status::StoreNotOpen; + } + log(LogLevel::Info, "Closing datastore"); + + // Stop the maintenance threads + { + std::unique_lock lk(_mergeMutex); + _mergeExit.store(true); + _mergeCv.notify_one(); + } + { + std::unique_lock lk(_upkeepMutex); + _upkeepExit.store(true); + _upkeepCv.notify_one(); + } + _mergeThread.join(); + _upkeepThread.join(); + _mergeExit.store(false); + _upkeepExit.store(false); + + // Lock the database and make it uninitialized + _mxActiveFile.lock(); + _mxDataFiles.lockWrite(); + _mxKeyDir.lock(); + _isInitialized = false; + + // Clean the data file + _mxWriteBuffer.lockWrite(); + flushWriteBufferUnlocked(); + _mxWriteBuffer.unlockWrite(); + for (auto* dfd : _dataFiles) { + if (osIsValidHandle(dfd->handle)) { + osOsClose(dfd->handle); + dfd->handle = InvalidFileHandle; + } + delete dfd; + } + _dataFiles.clear(); + _logHandler(LogLevel::Info, "closing", true); + + // Resetting all fields + _activeDataOffset = 0; + _activeDataFileId = 0; + _activeFlushedDataOffset = 0; + + unlockDatabase(_directory); + + // Reset the state + 
_directory.clear();
+
+ _mxKeyDir.unlock();
+ _mxDataFiles.unlockWrite();
+ _mxActiveFile.unlock();
+ ++_stats.closeCallQty;
+ return Status::Ok;
+ }
+
+ // Access API: put, remove, get
+ // ==========================================================================================
+
+ Status put(const void* key, size_t keySize, const void* value, size_t valueSize, const lcVector<KeyIndex>& keyIndexes = {},
+ uint32_t ttlSec = 0, bool forceDiskSync = false)
+ {
+ using namespace litecask::detail;
+
+ if (keySize == 0 || keySize >= USHRT_MAX) {
+ ++_stats.putCallFailedQty;
+ return Status::BadKeySize;
+ }
+ if (keyIndexes.size() > MaxKeyIndexQty) {
+ ++_stats.putCallFailedQty;
+ return Status::InconsistentKeyIndex;
+ }
+ KeyIndex lastIdx{0, 0};
+ for (const KeyIndex& ki : keyIndexes) {
+ if (ki.size == 0 || ki.startIdx + ki.size > keySize) {
+ ++_stats.putCallFailedQty;
+ return Status::InconsistentKeyIndex;
+ }
+ if (ki.startIdx < lastIdx.startIdx || (ki.startIdx == lastIdx.startIdx && ki.size <= lastIdx.size)) {
+ ++_stats.putCallFailedQty;
+ return Status::UnorderedKeyIndex;
+ }
+ lastIdx = ki;
+ }
+ if (valueSize >= detail::MaxValueSize) {
+ ++_stats.putCallFailedQty;
+ return Status::BadValueSize;
+ }
+
+ uint64_t keyHash = LITECASK_HASH_FUNC(key, keySize);
+ uint32_t checksum = (uint32_t)(keyHash ^ LITECASK_HASH_FUNC(value, valueSize));
+
+ _mxActiveFile.lock();
+ if (!_isInitialized) {
+ _mxActiveFile.unlock();
+ ++_stats.putCallFailedQty;
+ return Status::StoreNotOpen;
+ }
+
+ // Check that the data file size limit is not exceeded (the computation is done in 64 bits to avoid overflow)
+ // The only exception is at the beginning of a new file, so that an entry of any size can fit in a data file
+ if (_activeDataOffset > 0 &&
+ (uint64_t)_activeDataOffset + sizeof(DataFileEntry) + (uint64_t)keySize + (uint64_t)valueSize >= _dataFileMaxBytes) {
+ createNewActiveDataFileUnlocked(); // Now the entry can be written whatever its size (new file)
+ }
+
+ _mxDataFiles.lockRead();
+ lcOsFileHandle fh = _dataFiles[_activeDataFileId]->handle;
+ assert(osIsValidHandle(fh));
+
+ // Write the entry in the memory write buffer
+ _mxWriteBuffer.lockWrite();
+ size_t keyIndexSize = keyIndexes.size() * sizeof(KeyIndex);
+ if ((_activeDataOffset - _activeFlushedDataOffset) + (sizeof(DataFileEntry) + keySize + keyIndexSize + valueSize) >
+ _writeBuffer.size()) {
+ flushWriteBufferUnlocked();
+ }
+
+ uint32_t expTimeSec = (ttlSec == 0) ? 
0 : ttlSec + _nowTimeSec; + DataFileEntry dfe{checksum, expTimeSec, (uint32_t)valueSize, (uint16_t)keySize, (uint8_t)keyIndexSize, 0}; + uint32_t entryActiveDataOffset = _activeDataOffset; + uint16_t entryActiveDataFileId = _activeDataFileId; + assert(_activeDataOffset >= _activeFlushedDataOffset); + + if ((_activeDataOffset - _activeFlushedDataOffset) + (sizeof(DataFileEntry) + keySize + keyIndexSize + valueSize) <= + _writeBuffer.size()) { + // Store in the write buffer + uint32_t dataOffset = _activeDataOffset - _activeFlushedDataOffset; + memcpy(&_writeBuffer[dataOffset], &dfe, sizeof(DataFileEntry)); + memcpy(&_writeBuffer[dataOffset + sizeof(DataFileEntry)], key, keySize); + if (keyIndexSize > 0) { + memcpy(&_writeBuffer[dataOffset + sizeof(DataFileEntry) + keySize], (uint8_t*)keyIndexes.data(), keyIndexSize); + } + if (valueSize > 0) { memcpy(&_writeBuffer[dataOffset + sizeof(DataFileEntry) + keySize + keyIndexSize], value, valueSize); } + + // Update the active offset + _activeDataOffset += (uint32_t)(sizeof(DataFileEntry) + keySize + keyIndexSize + valueSize); + if (forceDiskSync) { flushWriteBufferUnlocked(); } + } + + else { + // Too big entry: the write buffer has already been synced-flushed, so the entry is directly written in the file + assert(_activeDataOffset == _activeFlushedDataOffset); + if (!osOsWrite(fh, &dfe, sizeof(DataFileEntry)) || !osOsWrite(fh, key, keySize)) { + fatalHandler("Put: Unable to write the header and key (size=%" PRId64 ") in the datafile", keySize); + } + if (keyIndexSize > 0 && !osOsWrite(fh, (uint8_t*)keyIndexes.data(), keyIndexSize)) { + fatalHandler("Put: Unable to write the key indexes (size=%" PRId64 ") in the datafile", keyIndexSize); + } + if (valueSize > 0 && !osOsWrite(fh, value, valueSize)) { + fatalHandler("Put: Unable to write the value (size=%" PRId64 ") in the datafile", valueSize); + } + + // Update the offsets (flush also) after this unoptimized write + _activeDataOffset += (uint32_t)(sizeof(DataFileEntry) + keySize + keyIndexSize + valueSize); + _activeFlushedDataOffset = _activeDataOffset; + } + + _mxWriteBuffer.unlockWrite(); + + // Update active data file stats + _dataFiles[entryActiveDataFileId]->bytes += (uint32_t)(sizeof(DataFileEntry) + keySize + keyIndexSize + valueSize); + _dataFiles[entryActiveDataFileId]->entries += 1; + + _mxDataFiles.unlockRead(); + _mxActiveFile.unlock(); + + // Push in cache + ValueLoc cacheLoc = NotStored; + if (_valueCache->isEnabled()) { cacheLoc = _valueCache->insertValue(value, (uint32_t)valueSize, keyHash, expTimeSec); } + + // Update the KeyDir + OldKeyChunk oldEntry; + _mxKeyDir.lock(); + Status storageStatus = _keyDir->insertEntry((uint32_t)keyHash, key, keyIndexes.data(), + {expTimeSec, (uint32_t)valueSize, cacheLoc, entryActiveDataOffset, + entryActiveDataFileId, (uint16_t)keySize, (uint8_t)keyIndexSize, (uint8_t)checksum}, + oldEntry); + _mxKeyDir.unlock(); + + if (storageStatus != Status::Ok) { // Can be too big a key (precise check done here) or out of memory + if (storageStatus == Status::OutOfMemory) { + // This error deserves a dedicated log message + // In this case, the run-time behavior of the database is compromised. + // The data files are however still correct and consistent, only the in-memory information is incomplete. + log(LogLevel::Error, + "Unable to store the new key due to out of memory, the run-time integrity of the datastore is compromised (data files " + "are ok). You should stop and relaunch the application to recover it. 
If that is not enough, performing a full "
+ "merge to compact the data could help.");
+ }
+ return storageStatus;
+ }
+
+ // Update the index map
+ int lastOldIdx = 0;
+ for (const KeyIndex& ki : keyIndexes) {
+ // Insert in the index map only if it is not present in the old list. The run time is O(N) as both lists are sorted.
+ bool doAdd = !oldEntry.isValid;
+ if (!doAdd) {
+ while (lastOldIdx < oldEntry.keyIndexQty &&
+ (oldEntry.keyIndexes[lastOldIdx].startIdx < ki.startIdx ||
+ (oldEntry.keyIndexes[lastOldIdx].startIdx == ki.startIdx && oldEntry.keyIndexes[lastOldIdx].size < ki.size))) {
+ ++lastOldIdx;
+ }
+ doAdd = (lastOldIdx >= oldEntry.keyIndexQty || oldEntry.keyIndexes[lastOldIdx].startIdx != ki.startIdx ||
+ oldEntry.keyIndexes[lastOldIdx].size != ki.size);
+ }
+ if (doAdd) {
+ _mxIndexMap.lockWrite();
+ _indexMap->insertIndex((uint8_t*)key + ki.startIdx, ki.size, (uint32_t)keyHash);
+ _mxIndexMap.unlockWrite();
+ }
+ }
+
+ // Update case: an older entry was replaced
+ if (oldEntry.isValid) {
+ // Remove the old entry from the cache
+ if (oldEntry.cacheLocation != NotStored && _valueCache->isEnabled()) {
+ _valueCache->removeValue(oldEntry.cacheLocation, keyHash);
+ }
+
+ // Update the "old" file descriptor statistics for proper maintenance
+ _mxDataFiles.lockRead();
+ _dataFiles[oldEntry.fileId]->deadBytes += (uint32_t)(sizeof(DataFileEntry) + keySize + keyIndexSize +
+ ((oldEntry.valueSize == DeletedEntry) ? 0 : oldEntry.valueSize));
+ _dataFiles[oldEntry.fileId]->deadEntries += 1;
+ _mxDataFiles.unlockRead();
+ }
+
+ ++_stats.putCallQty;
+ return Status::Ok;
+ }
+
+ // Variant 1: key as vector
+ Status put(const lcVector<uint8_t>& key, const void* value, size_t valueSize, const lcVector<KeyIndex>& keyIndexes = {},
+ uint32_t ttlSec = 0, bool forceDiskSync = false)
+ {
+ return put(key.data(), key.size(), value, valueSize, keyIndexes, ttlSec, forceDiskSync);
+ }
+
+ // Variant 2: key as string
+ Status put(const lcString& key, const void* value, size_t valueSize, const lcVector<KeyIndex>& keyIndexes = {}, uint32_t ttlSec = 0,
+ bool forceDiskSync = false)
+ {
+ return put(key.data(), key.size(), value, valueSize, keyIndexes, ttlSec, forceDiskSync);
+ }
+
+ // Variant 3: key as vector and value as vector
+ Status put(const lcVector<uint8_t>& key, const lcVector<uint8_t>& value, const lcVector<KeyIndex>& keyIndexes = {}, uint32_t ttlSec = 0,
+ bool forceDiskSync = false)
+ {
+ return put(key.data(), key.size(), value.data(), value.size(), keyIndexes, ttlSec, forceDiskSync);
+ }
+
+ // Variant 4: key as string and value as vector
+ Status put(const lcString& key, const lcVector<uint8_t>& value, const lcVector<KeyIndex>& keyIndexes = {}, uint32_t ttlSec = 0,
+ bool forceDiskSync = false)
+ {
+ return put(key.data(), key.size(), value.data(), value.size(), keyIndexes, ttlSec, forceDiskSync);
+ }
+
+ Status remove(const void* key, size_t keySize, bool forceDiskSync = false)
+ {
+ using namespace litecask::detail;
+ if (keySize == 0 || keySize >= USHRT_MAX) {
+ ++_stats.removeCallFailedQty;
+ return Status::BadKeySize;
+ }
+
+ uint64_t keyHash = LITECASK_HASH_FUNC(key, keySize);
+ uint32_t checksum = (uint32_t)keyHash;
+
+ _mxActiveFile.lock();
+ if (!_isInitialized) {
+ _mxActiveFile.unlock();
+ ++_stats.removeCallFailedQty;
+ return Status::StoreNotOpen;
+ }
+
+ // Optional check, but it keeps the database cleaner when removing a non-existent key
+ KeyChunk entry;
+ bool isFound = _keyDir->find((uint32_t)keyHash, key, (uint16_t)keySize, entry);
+ if (!isFound || entry.valueSize == DeletedEntry) {
+ _mxActiveFile.unlock();
+ ++_stats.removeCallNotFoundQty;
+ return Status::EntryNotFound;
+ }
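+
+ // On-disk shape of the removal below, for reference: a "tombstone" is a regular DataFileEntry header whose
+ // valueSize field carries the DeletedEntry marker, followed by the key bytes and no value (illustrative
+ // layout, fields as used below):
+ //   | DataFileEntry{checksum=(uint32_t)hash(key), expTimeSec=0, valueSize=DeletedEntry, keySize, keyIndexSize=0} | key |
+ // At loading time, such a record marks any older value of the same key as dead.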
+ + // Check that the limit of the data file size is not exceeded (taking into account the 64 overflow) + // The only exception is if we are at the beginning of a new file, so that any entry size can fit the data file + if (_activeDataOffset > 0 && (uint64_t)_activeDataOffset + sizeof(DataFileEntry) + (uint64_t)keySize >= _dataFileMaxBytes) { + createNewActiveDataFileUnlocked(); // Now the "removal" entry can be written + } + + _mxDataFiles.lockRead(); + lcOsFileHandle fh = _dataFiles[_activeDataFileId]->handle; + assert(osIsValidHandle(fh)); + + // Write entry in the memory write buffer + _mxWriteBuffer.lockWrite(); + if ((_activeDataOffset - _activeFlushedDataOffset) + (sizeof(DataFileEntry) + keySize) > _writeBuffer.size()) { + flushWriteBufferUnlocked(); // After that, the write must succeed by design (write buffer big enough for 1 entry) + } + + // Note: tombstone's keyIndexes are not stored on disk, but we need to keep the previous indexes in memory for the + // following use case: remove an entry with indexes, then add it again with some identical indexes: we do not want doubles inside + // index arrays + DataFileEntry dfe{checksum, 0, DeletedEntry, (uint16_t)keySize, 0, 0}; + uint32_t entryActiveDataOffset = _activeDataOffset; + uint16_t entryActiveDataFileId = _activeDataFileId; + assert(_activeDataOffset >= _activeFlushedDataOffset); + + if ((_activeDataOffset - _activeFlushedDataOffset) + (sizeof(DataFileEntry) + keySize) <= _writeBuffer.size()) { + // Store in the write buffer + uint32_t dataOffset = _activeDataOffset - _activeFlushedDataOffset; + memcpy(&_writeBuffer[dataOffset], &dfe, sizeof(DataFileEntry)); + memcpy(&_writeBuffer[dataOffset + sizeof(DataFileEntry)], key, keySize); + + // Update the active offset + _activeDataOffset += (uint32_t)(sizeof(DataFileEntry) + keySize); + if (forceDiskSync) { flushWriteBufferUnlocked(); } + } + + else { + // Too big entry: the write buffer has already been synced-flushed, and we directly write in the file + if (!osOsWrite(fh, &dfe, sizeof(DataFileEntry)) || !osOsWrite(fh, key, keySize)) { + fatalHandler("Remove: Unable to write the header and key (size=%" PRId64 ") in the datafile", keySize); + } + + // Update the offsets (flush also) after this unoptimized write + _activeDataOffset += (uint32_t)(sizeof(DataFileEntry) + keySize); + _activeFlushedDataOffset = _activeDataOffset; + } + + _mxWriteBuffer.unlockWrite(); + + _dataFiles[entryActiveDataFileId]->tombBytes += (uint32_t)(sizeof(DataFileEntry) + keySize); + _dataFiles[entryActiveDataFileId]->tombEntries += 1; + _dataFiles[entryActiveDataFileId]->bytes += (uint32_t)(sizeof(DataFileEntry) + keySize); + _dataFiles[entryActiveDataFileId]->entries += 1; + + _mxDataFiles.unlockRead(); + _mxActiveFile.unlock(); + + // Update the KeyDir with a tombstone + _mxKeyDir.lock(); + OldKeyChunk oldEntry; + Status storageStatus = _keyDir->insertEntry( + (uint32_t)keyHash, key, nullptr, + {0, DeletedEntry, NotStored, entryActiveDataOffset, entryActiveDataFileId, (uint16_t)keySize, 0, 0}, oldEntry); + _mxKeyDir.unlock(); + + if (storageStatus != Status::Ok) { + if (storageStatus == Status::OutOfMemory) { + // This error deserves a dedicated log message + // In this case, the run-time behavior of the database is compromised. + // The data files are however still correct and consistent, only the in-memory information is incomplete. + log(LogLevel::Error, + "Unable to store the new key due to out of memory, the run-time integrity of the datastore is compromised (data files " + "are ok). 
You should stop and relaunch the application to recover it. If that is not enough, performing a full "
+ "merge to compact the data could help.");
+ }
+ return storageStatus;
+ }
+
+ // Remove the (potential) old value from the value cache
+ if (oldEntry.isValid && oldEntry.cacheLocation != NotStored && _valueCache->isEnabled()) {
+ _valueCache->removeValue(oldEntry.cacheLocation, keyHash);
+ }
+
+ if (oldEntry.isValid) {
+ _mxDataFiles.lockRead();
+ _dataFiles[oldEntry.fileId]->deadBytes += (uint32_t)(sizeof(DataFileEntry) + oldEntry.valueSize + keySize);
+ _dataFiles[oldEntry.fileId]->deadEntries += 1;
+ _mxDataFiles.unlockRead();
+ }
+
+ ++_stats.removeCallQty;
+ return Status::Ok;
+ }
+
+ // Variant 1: key as vector
+ Status remove(const lcVector<uint8_t>& key, bool forceDiskSync = false) { return remove(key.data(), key.size(), forceDiskSync); }
+
+ // Variant 2: key as string
+ Status remove(const lcString& key, bool forceDiskSync = false) { return remove(key.data(), key.size(), forceDiskSync); }
+
+ Status get(const void* key, size_t keySize, lcVector<uint8_t>& value)
+ {
+ using namespace litecask::detail;
+
+ if (keySize == 0 || keySize >= USHRT_MAX) { // The key size is anyway limited to 16 bits minus some metadata overhead
+ ++_stats.getCallFailedQty;
+ return Status::BadKeySize;
+ }
+
+ // Look in the KeyDir
+ uint64_t keyHash = LITECASK_HASH_FUNC(key, keySize);
+
+ _mxDataFiles.lockRead();
+ if (!_isInitialized) {
+ _mxDataFiles.unlockRead();
+ ++_stats.getCallFailedQty;
+ return Status::StoreNotOpen;
+ }
+
+ KeyChunk entry{0, 0, 0, 0, 0, 0, 0, 0};
+ bool isFound = _keyDir->find((uint32_t)keyHash, key, (uint16_t)keySize, entry);
+
+ if (!isFound || entry.valueSize == DeletedEntry) {
+ _mxDataFiles.unlockRead();
+ ++_stats.getCallNotFoundQty;
+ return Status::EntryNotFound;
+ }
+ assert(entry.fileId < _dataFiles.size());
+
+ // Check the write buffer
+ if (entry.fileId == _activeDataFileId) { // If the file id differs now, it cannot become equal later, and a lock is avoided on the main path
+ _mxWriteBuffer.lockRead();
+ if (entry.fileId == _activeDataFileId && entry.fileOffset >= _activeFlushedDataOffset &&
+ entry.fileOffset - _activeFlushedDataOffset < _writeBuffer.size()) {
+ value.resize(entry.valueSize);
+ memcpy(value.data(),
+ &_writeBuffer[entry.fileOffset - _activeFlushedDataOffset + sizeof(DataFileEntry) + keySize + entry.keyIndexSize],
+ entry.valueSize);
+ _mxWriteBuffer.unlockRead();
+ _mxDataFiles.unlockRead();
+ ++_stats.getCallQty;
+ ++_stats.getWriteBufferHitQty;
+ return Status::Ok;
+ }
+ _mxWriteBuffer.unlockRead();
+ }
+
+ // Check the cache
+ if (_valueCache->isEnabled()) {
+ bool isInTheCache = _valueCache->getValue(entry.cacheLocation, keyHash, entry.valueSize, value);
+ if (isInTheCache) {
+ _mxDataFiles.unlockRead();
+ ++_stats.getCallQty;
+ ++_stats.getCacheHitQty;
+ return Status::Ok;
+ }
+ }
+
+ // Load the value
+ value.resize(sizeof(DataFileEntry) + keySize + entry.keyIndexSize + entry.valueSize);
+
+ DataFile* dfd = _dataFiles[entry.fileId];
+ lcOsFileHandle fh = dfd->handle;
+ assert(osIsValidHandle(fh));
+ bool isReadOk =
+ osOsRead(fh, value.data(), sizeof(DataFileEntry) + keySize + entry.keyIndexSize + entry.valueSize, entry.fileOffset);
+ _mxDataFiles.unlockRead();
+
+ size_t valueStartOffset = (uint32_t)sizeof(DataFileEntry) + keySize + entry.keyIndexSize;
+
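+ // For reference, the checksum scheme below matches the one used at write time: the checksum is
+ // (uint32_t)(hash(key) ^ hash(value)) for regular entries, and (uint32_t)hash(key) for tombstones.
+ // For example, a single corrupted byte in the stored value changes the recomputed hash and makes the
+ // comparison with the stored DataFileEntry::checksum fail, so the entry is reported as corrupted.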
+ // Check the value consistency. Read errors are caught here too
+ uint32_t checksum = (uint32_t)(keyHash ^ LITECASK_HASH_FUNC(&value[valueStartOffset], entry.valueSize));
+ if (!isReadOk || checksum != ((DataFileEntry*)value.data())->checksum) {
+ ++_stats.getCallCorruptedQty;
+ return Status::EntryCorrupted;
+ }
+
+ // Remove the offset due to the metadata (the memmove cannot be avoided with a generic container as output...)
+ memmove(value.data(), &value[valueStartOffset], entry.valueSize);
+ value.resize(entry.valueSize);
+
+ if (_valueCache->isEnabled()) {
+ // Store the value in the cache
+ ValueLoc cacheLoc = _valueCache->insertValue(value.data(), entry.valueSize, keyHash, entry.expTimeSec);
+
+ // The change counter avoids the ABA problem between the entry insertion above and the cache update here
+ // If valueSize or changeCounter do not match, the cache entry is wasted but it will be evicted later anyway
+ _mxKeyDir.lock();
+ _keyDir->updateCachedValueLocation((uint32_t)keyHash, key, (uint16_t)keySize, entry.valueSize, entry.changeCounter, cacheLoc);
+ _mxKeyDir.unlock();
+ }
+
+ ++_stats.getCallQty;
+ return Status::Ok;
+ }
+
+ // Get variant 1: key as vector
+ Status get(const lcVector<uint8_t>& key, lcVector<uint8_t>& value) { return get(key.data(), key.size(), value); }
+
+ // Get variant 2: key as string
+ Status get(const lcString& key, lcVector<uint8_t>& value) { return get(key.data(), key.size(), value); }
+
+ // Query variant 1: single key part as vector
+ Status query(const lcVector<uint8_t>& keyPart, lcVector<lcVector<uint8_t>>& matchingKeys)
+ {
+ return privateQuery<lcVector<uint8_t>, lcVector<uint8_t>>({keyPart}, matchingKeys);
+ }
+
+ // Query variant 2: single key part as string
+ Status query(const lcString& keyPart, lcVector<lcVector<uint8_t>>& matchingKeys)
+ {
+ return privateQuery<lcString, lcVector<uint8_t>>({keyPart}, matchingKeys);
+ }
+
+ // Query variant 3: multiple key parts as vectors
+ Status query(const lcVector<lcVector<uint8_t>>& keyParts, lcVector<lcVector<uint8_t>>& matchingKeys)
+ {
+ return privateQuery<lcVector<uint8_t>, lcVector<uint8_t>>(keyParts, matchingKeys);
+ }
+
+ // Query variant 4: multiple key parts as strings
+ Status query(const lcVector<lcString>& keyParts, lcVector<lcVector<uint8_t>>& matchingKeys)
+ {
+ return privateQuery<lcString, lcVector<uint8_t>>(keyParts, matchingKeys);
+ }
+
+ // Query variant 5: single key part as vector, with an arena allocator for the output array of keys
+ Status query(const lcVector<uint8_t>& keyPart, lcVector<QueryResult>& arenaMatchingKeys, ArenaAllocator& allocator)
+ {
+ return privateQuery<lcVector<uint8_t>, QueryResult>({keyPart}, arenaMatchingKeys, &allocator);
+ }
+
+ // Query variant 6: single key part as string, with an arena allocator for the output array of keys
+ Status query(const lcString& keyPart, lcVector<QueryResult>& arenaMatchingKeys, ArenaAllocator& allocator)
+ {
+ return privateQuery<lcString, QueryResult>({keyPart}, arenaMatchingKeys, &allocator);
+ }
+
+ // Query variant 7: multiple key parts as vectors, with an arena allocator for the output array of keys
+ Status query(const lcVector<lcVector<uint8_t>>& keyParts, lcVector<QueryResult>& arenaMatchingKeys, ArenaAllocator& allocator)
+ {
+ return privateQuery<lcVector<uint8_t>, QueryResult>(keyParts, arenaMatchingKeys, &allocator);
+ }
+
+ // Query variant 8: multiple key parts as strings, with an arena allocator for the output array of keys
+ Status query(const lcVector<lcString>& keyParts, lcVector<QueryResult>& arenaMatchingKeys, ArenaAllocator& allocator)
+ {
+ return privateQuery<lcString, QueryResult>(keyParts, arenaMatchingKeys, &allocator);
+ }
+
+ void sync()
+ {
+ _mxWriteBuffer.lockWrite();
+ flushWriteBufferUnlocked();
+ _mxWriteBuffer.unlockWrite();
+ }
+
+ bool requestMerge()
+ {
+ std::unique_lock lk(_mergeMutex);
+ if (_isInitialized && _mergeWork.load() == false) {
+ _mergeWork.store(true);
+ _mergeCv.notify_one();
+ return true;
+ }
+ return false;
+ }
+
+ bool isMergeOnGoing() 
const { return _mergeWork.load(); } + + // Use carefully... + static void erasePermanentlyAllContent_UseWithCaution(fs::path dbDirectoryPath) + { + using namespace litecask::detail; + dbDirectoryPath /= ""; + lcVector entries; + if (!osGetDirContent(dbDirectoryPath, entries)) return; + + for (const auto& e : entries) { + if (e.isDir || e.name.empty()) continue; + fs::path filename(e.name); + + if (filename.extension() == DataFileSuffix) { + osRemoveFile(dbDirectoryPath / e.name); + continue; + } + if (filename.extension() == HintFileSuffix) { + osRemoveFile(dbDirectoryPath / e.name); + continue; + } + if (filename.extension() == TmpFileSuffix) { + osRemoveFile(dbDirectoryPath / e.name); + continue; + } + if (filename.extension() == ToRemoveFileSuffix) { + osRemoveFile(dbDirectoryPath / e.name); + continue; + } + if (filename.extension() == LogFileSuffix) { + osRemoveFile(dbDirectoryPath / e.name); + continue; + } + } + } + +#ifndef LITECASK_BUILD_FOR_TEST // Allows looking inside the datastore internal, for testing purposes + private: +#endif + + // Internal data file management + // ========================================================================================== + + // Returns the basename of the last active data file (not the new one but the just closed one) + // The "active file" lock must be taken before the call + lcString createNewActiveDataFileUnlocked() + { + using namespace litecask::detail; + + // Set the name of the previous active data file (used as an in-order basename for merged data files) + char tmpFilename[256]; + snprintf(tmpFilename, 256, "%s%" PRId64 "", _directory.string().c_str(), _maxDataFileIndex); + lcString lastActiveBaseDataFilename = tmpFilename; + + // The data file structure is modified + _mxDataFiles.lockWrite(); + _mxWriteBuffer.lockWrite(); + + if (_activeDataFileId < _dataFiles.size()) { + // Close previous active file, which was writable + flushWriteBufferUnlocked(); + DataFile* dfd = _dataFiles[_activeDataFileId]; + assert(osIsValidHandle(dfd->handle)); + osOsClose(dfd->handle); + + // Reopen it in read-only mode + dfd->handle = osOsOpen(dfd->filename, OsOpenMode::READ); + assert(osIsValidHandle(dfd->handle)); + } + + // Open a new data file in append + read mode + _activeDataOffset = 0; + _activeFlushedDataOffset = 0; + _activeDataFileId = getFreeDataFileIdUnlocked(); + _mxWriteBuffer.unlockWrite(); + + DataFile* newFd = _dataFiles[_activeDataFileId]; + snprintf(tmpFilename, 256, "%s%" PRId64 "%s", _directory.string().c_str(), ++_maxDataFileIndex, DataFileSuffix); + newFd->filename = tmpFilename; + newFd->handle = osOsOpen(newFd->filename, OsOpenMode::APPEND); + assert(osIsValidHandle(newFd->handle)); + + _mxDataFiles.unlockWrite(); + + log(LogLevel::Debug, "Creating new active data file %s", tmpFilename); + ++_stats.activeDataFileSwitchQty; + return lastActiveBaseDataFilename; + } + + bool isItWorthMerging(int fragmentationPercentage, uint32_t deadByteThreshold) + { + using namespace litecask::detail; + assert(fragmentationPercentage >= 1 && fragmentationPercentage <= 100); + + _mxDataFiles.lockRead(); + for (const DataFile* dfd : _dataFiles) { + if (!osIsValidHandle(dfd->handle)) continue; // Descriptor not in use + + if ((uint64_t)dfd->deadBytes * 100L > (uint64_t)dfd->bytes * (uint64_t)fragmentationPercentage) { + _mxDataFiles.unlockRead(); + log(LogLevel::Debug, "Merge needed due to some data file having too high ratio of dead bytes"); + return true; + } + + if (dfd->deadBytes > deadByteThreshold) { + _mxDataFiles.unlockRead(); + 
log(LogLevel::Debug, "Merge needed due to some data file having more than %d dead bytes", deadByteThreshold);
+ return true;
+ }
+ }
+ _mxDataFiles.unlockRead();
+ return false;
+ }
+
+ Status selectDataFilesToMerge(int fragmentationPercentage, uint32_t deadByteThreshold, uint32_t smallFileSizeTheshold,
+ lcVector<MergeFileInfo>& mergeInfos)
+ {
+ using namespace litecask::detail;
+ assert(fragmentationPercentage >= 1 && fragmentationPercentage <= 100);
+ assert(smallFileSizeTheshold >= MinDataFileMaxBytes);
+ mergeInfos.clear();
+
+ // Check each data file
+ _mxDataFiles.lockRead();
+ for (uint32_t fileId = 0; fileId < _dataFiles.size(); ++fileId) {
+ const DataFile* dfd = _dataFiles[fileId];
+ if (!osIsValidHandle(dfd->handle)) continue; // Descriptor not in use
+ bool doIncludeFileInMerge = false;
+
+ if ((uint64_t)dfd->deadBytes * 100L > (uint64_t)dfd->bytes * (uint64_t)fragmentationPercentage) doIncludeFileInMerge = true;
+ if (dfd->deadBytes > deadByteThreshold) doIncludeFileInMerge = true;
+ if (dfd->bytes < smallFileSizeTheshold) doIncludeFileInMerge = true;
+
+ if (doIncludeFileInMerge) { mergeInfos.push_back({(uint16_t)fileId, {}}); }
+ log(LogLevel::Debug, "selectDataFilesToMerge: %s %s", dfd->filename.c_str(),
+ doIncludeFileInMerge ? "will be merged" : "is skipped");
+ }
+ _mxDataFiles.unlockRead();
+ return Status::Ok;
+ }
+
+ // 'Merging' is a data file "cleaning" process which:
+ // - removes obsolete entries whose value has been overridden or deleted in newer data files
+ // - compacts the remaining entries together in new data files, up to the allowed maximum size
+ bool createMergedDataFiles(lcVector<MergeFileInfo>& mergeInfos, const lcString& mergeBasename, const uint32_t dataFileMaxBytes)
+ {
+ using namespace litecask::detail;
+ DataFileEntry header;
+ lcVector<uint8_t> buf(1024);
+ uint16_t mergeFileCount = 0;
+ uint32_t readFileOffset = 0;
+ uint32_t writeFileOffset = 0;
+ uint32_t fileIncrement = 0;
+ uint16_t currentDataFileId = 0xFFFF;
+ DataFile* currentDataFile = nullptr;
+ FILE* fhw = nullptr;
+ FILE* fhhw = nullptr;
+
+ // Loop on the files to merge, the order does not matter
+ for (MergeFileInfo& mergeInfo : mergeInfos) {
+ // Get the data file descriptor
+ _mxDataFiles.lockRead();
+ const DataFile* dfd = _dataFiles[mergeInfo.fileId];
+ assert(osIsValidHandle(dfd->handle) && "This data file should have been in use");
+ _mxDataFiles.unlockRead();
+
+ // We use fopen/fread/fwrite here because of standard usage and the provided buffering
+ // By design, we never merge the active file
+ FILE* fhr = osFopen(dfd->filename, "rb");
+ assert(fhr);
+ readFileOffset = 0;
+
+ // Loop on the entries
+ while (fread(&header, sizeof(DataFileEntry), 1, fhr) == 1) {
+ uint32_t valueSize = header.valueSize;
+ uint32_t keyIndexSize = header.keyIndexSize;
+ uint32_t keySize = header.keySize;
+ if (keySize == 0) {
+ log(LogLevel::Error,
+ "Cannot read the data file %s for merging: a key has a null (=corrupted) size (value size is said to be %u) at "
+ "file offset %u",
+ dfd->filename.c_str(), valueSize, readFileOffset);
+ break;
+ }
+
+ if (keyIndexSize > MaxKeyIndexQty * sizeof(KeyIndex)) {
+ log(LogLevel::Error,
+ "Cannot read the data file %s for merging: a key has a too large index (byte size=%d > %" PRId64 ") at file offset %u",
+ dfd->filename.c_str(), keyIndexSize, MaxKeyIndexQty * sizeof(KeyIndex), readFileOffset);
+ break;
+ }
+
+ if (valueSize != DeletedEntry) {
+ uint32_t allSize = keySize + keyIndexSize + valueSize;
+ if (buf.size() < allSize) buf.resize(allSize);
+ if (fread(buf.data(), 1, allSize, fhr) != allSize) {
+ 
log(LogLevel::Warn, + "Cannot read the data file %s for merging: unable to read all the bytes (%u) of the entry at file offset %u", + dfd->filename.c_str(), allSize, readFileOffset); + break; + } + fileIncrement = (uint32_t)sizeof(DataFileEntry) + allSize; + } + + else { + // Tombstone case + uint32_t allSize = keySize + keyIndexSize; + if (buf.size() < allSize) buf.resize(allSize); + if (fread(buf.data(), 1, allSize, fhr) != allSize) { + log(LogLevel::Warn, + "Cannot read the data file %s for merging: unable to read all the bytes (%u) of the deleted entry at file " + "offset %u", + dfd->filename.c_str(), allSize, readFileOffset); + break; + } + keyIndexSize = 0; + fileIncrement = sizeof(DataFileEntry) + allSize; + } + + uint64_t keyHash = LITECASK_HASH_FUNC(buf.data(), keySize); + KeyChunk entry; + bool isFound = _keyDir->find((uint32_t)keyHash, buf.data(), (uint16_t)keySize, entry); + if (!isFound || entry.fileId != mergeInfo.fileId || entry.fileOffset != readFileOffset) { + readFileOffset += fileIncrement; + continue; // This entry is not the latest or expired + } + readFileOffset += fileIncrement; + + // Change the write file if the size exceeds the threshold + if (fhw == nullptr || (writeFileOffset > 0 && writeFileOffset + fileIncrement > dataFileMaxBytes)) { + // Move the complete compacted file as official data file + if (fhw != nullptr) { + // Close finished data and hint written files + fclose(fhw); + fclose(fhhw); + + // If a crash occurs before the move then the .tmp is simply removed at next launch. If a crash + // occurs after this (atomic) renaming, then the data will be taken into account and the old + // duplicate entries will be cleaned by next merge + assert(currentDataFile); + log(LogLevel::Debug, "Finished compacted file %s. Removing the '%s' suffix.", currentDataFile->filename.c_str(), + TmpFileSuffix); + [[maybe_unused]] bool isOk = osRenameFile(currentDataFile->filename + TmpFileSuffix, currentDataFile->filename); + assert(isOk); + currentDataFile->handle = osOsOpen(currentDataFile->filename, OsOpenMode::READ); + assert(osIsValidHandle(currentDataFile->handle)); + fs::path hintFilename = fs::path(currentDataFile->filename).replace_extension(HintFileSuffix); + isOk = osRenameFile(hintFilename.string() + TmpFileSuffix, hintFilename); + assert(isOk); + } + + // Create the next compacted file to write in + char dataFilename[512]; // Note: the fractional number shall not be zero + snprintf(dataFilename, sizeof(dataFilename), "%s.%05d%s", mergeBasename.c_str(), ++mergeFileCount, DataFileSuffix); + + // The data file structure is modified + _mxDataFiles.lockWrite(); + currentDataFileId = getFreeDataFileIdUnlocked(); + currentDataFile = _dataFiles[currentDataFileId]; + _mxDataFiles.unlockWrite(); + + currentDataFile->filename = dataFilename; + currentDataFile->handle = + InvalidFileHandle; // Will be opened for read only when the temporary file is complete and renamed + fhw = osFopen(lcString(dataFilename) + TmpFileSuffix, "wb"); + if (!fhw) { + fatalHandler("Unable to open temp data file for %s during merge file creation.", currentDataFile->filename.c_str()); + } + writeFileOffset = 0; + fs::path hintFilename = fs::path(currentDataFile->filename).replace_extension(HintFileSuffix); + fhhw = osFopen(hintFilename.string() + TmpFileSuffix, "wb"); + if (!fhhw) { + fatalHandler("Unable to open temp hint file for %s during merge file creation.", currentDataFile->filename.c_str()); + } + } + + // Write the entry both in the data file and its hint file + uint8_t* keyAndIndexes = 
buf.data();
+ DataFileEntry dfe{header.checksum, header.expTimeSec, valueSize, (uint16_t)keySize, (uint8_t)keyIndexSize, 0};
+ bool isMergeOk = (fwrite(&dfe, sizeof(DataFileEntry), 1, fhw) == 1);
+ isMergeOk = isMergeOk &&
+ (fwrite(keyAndIndexes, 1, fileIncrement - sizeof(DataFileEntry), fhw) == fileIncrement - sizeof(DataFileEntry));
+
+ HintFileEntry hfe{writeFileOffset, header.expTimeSec, valueSize, (uint16_t)keySize, (uint8_t)keyIndexSize, 0};
+ isMergeOk = isMergeOk && (fwrite(&hfe, sizeof(HintFileEntry), 1, fhhw) == 1);
+ isMergeOk = isMergeOk && (fwrite(keyAndIndexes, 1, keySize + keyIndexSize, fhhw) == keySize + keyIndexSize);
+ if (!isMergeOk) {
+ fatalHandler("Write error for file %s during merge file creation.", currentDataFile->filename.c_str());
+ }
+
+ mergeInfo.patches.push_back({(uint32_t)keyHash, entry.fileOffset, writeFileOffset, mergeInfo.fileId, currentDataFileId});
+ currentDataFile->bytes += fileIncrement;
+ currentDataFile->entries += 1;
+ if (valueSize == DeletedEntry) {
+ currentDataFile->tombBytes += fileIncrement;
+ currentDataFile->tombEntries += 1;
+ }
+ writeFileOffset += fileIncrement;
+ } // End of loop on entries
+
+ _stats.mergeGainedBytes += readFileOffset - writeFileOffset;
+ fclose(fhr);
+ } // End of loop on data files to merge
+
+ // Close the last merged data file
+ if (fhw != nullptr) {
+ // Close the finished data and hint written files
+ fclose(fhw);
+ fclose(fhhw);
+
+ assert(currentDataFile);
+ log(LogLevel::Debug, "Finished compacted file %s. Removing the '%s' suffix.", currentDataFile->filename.c_str(), TmpFileSuffix);
+ if (!osRenameFile(currentDataFile->filename + TmpFileSuffix, currentDataFile->filename)) {
+ fatalHandler("Unable to rename temp data file for %s during merge file creation.", currentDataFile->filename.c_str());
+ }
+ currentDataFile->handle = osOsOpen(currentDataFile->filename, OsOpenMode::READ);
+ assert(osIsValidHandle(currentDataFile->handle));
+ fs::path hintFilename = fs::path(currentDataFile->filename).replace_extension(HintFileSuffix);
+ if (!osRenameFile(hintFilename.string() + TmpFileSuffix, hintFilename)) {
+ fatalHandler("Unable to rename temp hint file for %s during merge file creation.", currentDataFile->filename.c_str());
+ }
+ }
+
+ // No problem so far: create the tag files to remove the old data files
+ // If a crash occurs before/while creating the "to_remove" tag files, the next merge will clean the old-and-now-duplicate entries
+ _mxDataFiles.lockRead();
+ for (MergeFileInfo& mergeInfo : mergeInfos) {
+ const DataFile* dfd = _dataFiles[mergeInfo.fileId];
+ log(LogLevel::Debug, "Creating tag file to request removal of old data file %s.", dfd->filename.c_str());
+ FILE* tagFile = osFopen(fs::path(dfd->filename).replace_extension(ToRemoveFileSuffix), "wb");
+ fclose(tagFile); // No content, just the file existence. Not really a problem if the tag file creation fails
+ }
+ _mxDataFiles.unlockRead();
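+
+ // Crash-safety recap of the merge sequence above (a consequence of the atomic create/rename operations used):
+ // - crash before the rename: only '.tmp' files exist, and they are removed at the next opening
+ // - crash after the rename but before the tag files: old and new records coexist, the KeyDir keeps the
+ //   freshest one and the next merge cleans the duplicates
+ // - crash after the tag files: the next opening removes the tagged data files (see sanitizeAndCollectDataFiles)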
+
+ // No problem so far.
+ // The next step is to apply the patches on the KeyDir, close the old data files, open the new ones, and remove the tagged data files
+ _stats.mergeGainedDataFileQty += mergeInfos.size() - mergeFileCount;
+ return true;
+ }
+
+ bool replaceDataFiles(const lcVector<MergeFileInfo>& mergeInfos)
+ {
+ using namespace litecask::detail;
+
+ // Apply the patches on the KeyDir, close the old data files, open the new ones, and remove the tagged data files
+ for (const MergeFileInfo& mergeInfo : mergeInfos) {
+ // The data file structure is modified
+ _mxDataFiles.lockWrite();
+ DataFile* dfd = _dataFiles[mergeInfo.fileId];
+
+ // Live-patch of the KeyDir
+ _mxKeyDir.lock();
+ for (const KeyDirPatch& kdPatch : mergeInfo.patches) {
+ _keyDir->updateMergedValueLocation(kdPatch.keyHash, kdPatch.oldFileId, kdPatch.oldFileOffset, kdPatch.newFileId,
+ kdPatch.fileOffset);
+ }
+ _mxKeyDir.unlock();
+
+ if (osIsValidHandle(dfd->handle)) {
+ osOsClose(dfd->handle);
+ dfd->handle = InvalidFileHandle;
+
+ // Remove the files associated with this old data file. The order below matters in case of hard interruption.
+ // First remove the old data file
+ osRemoveFile(dfd->filename);
+ // Then the hint file (if it exists)
+ osRemoveFile(fs::path(dfd->filename).replace_extension(HintFileSuffix));
+ // Then, the removal tag
+ osRemoveFile(fs::path(dfd->filename).replace_extension(ToRemoveFileSuffix));
+
+ // Free the fileId for reuse
+ _freeDataFileIds.push_back(mergeInfo.fileId);
+ }
+
+ _mxDataFiles.unlockWrite();
+ }
+
+ return true;
+ }
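+
+ // Illustrative use of the key-part query API implemented below (a sketch only: the key layout, the
+ // 'store' instance and the stored value are example assumptions, not part of the API):
+ //
+ //   litecask::Datastore store;                    // Assumed already open on some directory
+ //   lcVector<uint8_t> value{1, 2, 3};
+ //   // Key "user42/2023" with two indexed parts: "user42" (start 0, size 6) and "2023" (start 7, size 4)
+ //   store.put(lcString("user42/2023"), value, {{0, 6}, {7, 4}});
+ //   lcVector<lcVector<uint8_t>> matchingKeys;
+ //   store.query(lcString("user42"), matchingKeys);                      // All keys indexed with "user42"
+ //   store.query({lcString("user42"), lcString("2023")}, matchingKeys);  // "AND" of both key parts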
+
+ // Templatized helper function for the template privateQuery(...) below
+ void addQueryResult(lcVector<lcVector<uint8_t>>& matchingKeys, const lcVector<uint8_t>& key, ArenaAllocator* /*allocator*/)
+ {
+ matchingKeys.push_back(key);
+ }
+
+ // Templatized helper function for the template privateQuery(...) below
+ void addQueryResult(lcVector<QueryResult>& matchingKeys, const lcVector<uint8_t>& key, ArenaAllocator* allocator)
+ {
+ uint8_t* ptr = allocator->allocate(key.size());
+ assert(ptr);
+ memcpy(ptr, key.data(), key.size());
+ matchingKeys.push_back({ptr, (uint16_t)key.size()});
+ }
+
+ template<class KP, class MK>
+ Status privateQuery(const lcVector<KP>& keyParts, lcVector<MK>& matchingKeys, ArenaAllocator* allocator = nullptr)
+ {
+ matchingKeys.clear();
+ ++_stats.queryCallQty;
+
+ // Check the key parts validity
+ for (const KP& kp : keyParts) {
+ if (kp.size() >= USHRT_MAX) {
+ ++_stats.queryCallFailedQty;
+ return Status::BadKeySize;
+ }
+ }
+
+ _mxIndexMap.lockRead();
+
+ // Get the smallest entry list among the "key parts", to minimize the work
+ int sourceKeyPartIdx = -1;
+ if (keyParts.size() == 1) {
+ sourceKeyPartIdx = 0;
+ } else if (keyParts.size() > 1) {
+ uint32_t bestValue = 0;
+ for (int keyPartIdx = 0; keyPartIdx < (int)keyParts.size(); ++keyPartIdx) {
+ const KP& kp = keyParts[keyPartIdx];
+ uint32_t entries = _indexMap->getEntryHashes(kp.data(), (uint16_t)kp.size(), nullptr);
+ if (sourceKeyPartIdx == -1 || entries < bestValue) {
+ sourceKeyPartIdx = keyPartIdx;
+ bestValue = entries;
+ if (bestValue == 0) { // Empty match, so empty output ("and" between key parts)
+ sourceKeyPartIdx = -1;
+ break;
+ }
+ }
+ }
+ }
+
+ // Empty answer
+ if (sourceKeyPartIdx < 0) {
+ _mxIndexMap.unlockRead();
+ return Status::Ok;
+ }
+
+ // Snapshot the smallest list of key hashes
+ lcVector<uint32_t> entryHashes;
+ const KP& sourceKeyPart = keyParts[sourceKeyPartIdx];
+ _indexMap->getEntryHashes(sourceKeyPart.data(), (uint16_t)sourceKeyPart.size(), &entryHashes);
+ _mxIndexMap.unlockRead();
+
+ // Loop on the hashes:
+ // - check that the entry exists and really contains the keyPart
+ // - if yes, check that the other key parts are also present
+ // - if all are present, store the key in the result list
+ lcVector<uint8_t> key;
+ lcVector<KeyIndex> keyIndexes;
+ uint32_t hashNotPresentQty = 0;
+ for (uint32_t entryHashIdx = 0; entryHashIdx < (uint32_t)entryHashes.size(); ++entryHashIdx) {
+ uint32_t keyHash = entryHashes[entryHashIdx];
+
+ if (!_keyDir->getKeyAndIndexes(keyHash, key, keyIndexes)) {
+ // The entry hash array is reused to store the absent hashes.
+ // The collected data is used later if a "cleaning" is triggered
+ entryHashes[hashNotPresentQty++] = keyHash;
+ continue;
+ }
+
+ // Filter on the other key parts (which implements the "and" behavior if multiple key parts are provided)
+ bool allKeyPartsFound = true;
+ for (int i = 0; i < (int)keyParts.size(); ++i) {
+ // Swap sourceKeyPart and index 0. Indeed, we want to check first that the "main" key part is present in the key
+ int keyPartIdx = (i == 0) ? sourceKeyPartIdx : ((i == sourceKeyPartIdx) ? 0 : i);
+ const KP& kp = keyParts[keyPartIdx];
+
+ bool keyPartFound = false;
+ for (const auto& ki : keyIndexes) {
+ if (ki.size == kp.size() && !memcmp(&key[ki.startIdx], kp.data(), ki.size)) {
+ keyPartFound = true;
+ break;
+ }
+ }
+ if (!keyPartFound) {
+ if (keyPartIdx == sourceKeyPartIdx) {
+ // The collected data is used later if a "cleaning" is triggered, as the key part is unexpectedly not in the entry
+ entryHashes[hashNotPresentQty++] = keyHash;
+ }
+
+ allKeyPartsFound = false;
+ break;
+ }
+ }
+
+ // Store the key only if all key parts are present in it
+ if (allKeyPartsFound) { addQueryResult(matchingKeys, key, allocator); }
+ }
+
+ // A cleaning phase to remove the no-longer-matching keys from the index is done at query time.
+ // The mismatch is due to entry key index modification or entry removal, as the index arrays are not updated
+ // at that time for performance reasons.
+ // The cleaning implies both an update of the index lookup array and an update of the key part list inside the entries.
+ // It is triggered based on a minimum quantity and a minimum array ratio of entries to clean.
+ // Note: if multiple "AND" key parts are provided, only the one with the smallest array is processed
+ constexpr uint32_t MinimumMismatchEntries = 10;
+ constexpr uint64_t MinimumMismatchArrayPercent = 10;
+ if (hashNotPresentQty > MinimumMismatchEntries &&
+ (uint64_t)hashNotPresentQty * 100 > MinimumMismatchArrayPercent * entryHashes.size()) {
+ ++_stats.indexArrayCleaningQty;
+
+ // Get the writable array of entry hashes
+ uint32_t* storedEntryHashes = nullptr;
+ uint32_t* storedEntryHashQty = nullptr;
+ _mxIndexMap.lockWrite();
+ if (_indexMap->getEntryHashesForUpdate(sourceKeyPart.data(), (uint16_t)sourceKeyPart.size(), &storedEntryHashes,
+ &storedEntryHashQty)) {
+ // The collected invalid hashes to clean are processed in order.
+ // This ordering assumption is valid if no cleaning was done in-between. Otherwise, the array is already clean
+ // and the current cleaning process will be ineffective and harmless.
+ // In-between insertions in the array do not affect the cleaning as they are added at the end
+ uint32_t storedEntryHashIndex = 0;
+ uint32_t invalidEntryHashIndex = 0;
+ while (storedEntryHashIndex < *storedEntryHashQty && invalidEntryHashIndex < hashNotPresentQty) {
+ uint32_t keyHashToClean = entryHashes[invalidEntryHashIndex++];
+
+ // Find the invalid hash in the stored hash array
+ while (storedEntryHashIndex < *storedEntryHashQty && storedEntryHashes[storedEntryHashIndex] != keyHashToClean) {
+ ++storedEntryHashIndex;
+ }
+ if (storedEntryHashIndex >= *storedEntryHashQty) {
+ break;
+ } // Means that the invalid hash was not found inside the array
+
+ // Clean the index in the entry
+ _mxKeyDir.lock();
+ if (_keyDir->cleanIndex(keyHashToClean, sourceKeyPart.data(), (uint16_t)sourceKeyPart.size())) {
+ // The index was removed or not found in this entry, so it shall also be removed from the current hash array
+ storedEntryHashes[storedEntryHashIndex] = storedEntryHashes[--(*storedEntryHashQty)];
+ ++_stats.indexArrayCleanedEntries;
+ }
+ _mxKeyDir.unlock();
+
+ } // End of loop on current hashes
+
+ } else {
+ log(LogLevel::Warn,
+ "The key part to clean was not found in the index map, although such items are never removed from it...");
+ }
+
+ _mxIndexMap.unlockWrite();
+ }
+
+ return Status::Ok;
+ }
+
+ void notifyKeyDirResizing(uint32_t newSize, bool isStart, bool wasForced)
+ {
+ if (isStart) {
+ log(LogLevel::Debug, "KeyDir resizing to %u entries started", newSize);
+ std::unique_lock lk(_upkeepMutex);
+ _upkeepWork.store(true);
+ _upkeepCv.notify_one();
+ } else {
+ log(LogLevel::Debug, "KeyDir resizing to %u entries finished%s", newSize, wasForced ? " (forced)" : "");
+ }
+ }
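+
+ // Note: the KeyDir resizing is cooperative. The notification above only wakes the upkeep thread, which
+ // then transfers entries in batches (see backgroundResizeWork below) while holding the KeyDir lock briefly,
+ // so writers are never blocked for the duration of a full rehash.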
" (forced)" : ""); + } + } + + bool requestUpKeeping() + { + std::unique_lock lk(_upkeepMutex); + if (_isInitialized && _upkeepWork.load() == false) { + _upkeepWork.store(true); + _upkeepCv.notify_one(); + return true; + } + return false; + } + + bool isUpkeepingOnGoing() const { return _upkeepWork.load(); } + + void upkeepThreadEntry() + { + using namespace litecask::detail; + + // Upkeep service loop + while (!_upkeepExit.load()) { + // Wait for a upkeep period or explicit request + { + std::unique_lock lk(_upkeepMutex); + _upkeepCv.wait_for(lk, std::chrono::milliseconds(_config.upkeepCyclePeriodMs), + [this] { return _upkeepExit.load() || _upkeepWork.load(); }); + if (_upkeepExit.load()) continue; + _upkeepWork.store(false); + } + + // Update the current date + updateNow(); + + // Write buffer flushing + uint64_t timeMs = + std::chrono::duration_cast(std::chrono::steady_clock::now().time_since_epoch()).count(); + if (timeMs - _upkeepLastActiveFlushedTimeMs > _config.writeBufferFlushPeriodMs) { + _mxActiveFile.lock(); + _mxWriteBuffer.lockWrite(); + + // Flush only if no flush was performed since last check, and if there is something to flush + if (_activeFlushedDataOffset == _upkeepLastActiveFlushedDataOffset && _activeDataFileId == _upkeepLastActiveDataFileId && + _activeDataOffset - _activeFlushedDataOffset > 0) { + flushWriteBufferUnlocked(); + } + _upkeepLastActiveDataFileId = _activeDataFileId; + _upkeepLastActiveFlushedDataOffset = _activeFlushedDataOffset; + _upkeepLastActiveFlushedTimeMs = timeMs; + _mxWriteBuffer.unlockWrite(); + _mxActiveFile.unlock(); + } + + // First priority: resize the key directory + if (_keyDir->isResizingOngoing()) { + log(LogLevel::Debug, "Resizing KeyDir in upkeep thread under work"); + while (_keyDir->isResizingOngoing()) { + // Transfer one batch of entries at a time + _mxKeyDir.lock(); + _keyDir->backgroundResizeWork(_config.upkeepKeyDirBatchSize); + _mxKeyDir.unlock(); + // Give air to writer threads + std::this_thread::yield(); + } + log(LogLevel::Debug, "Resizing KeyDir in upkeep thread finished"); + } + + // Second priority: Value cache upkeeping + _valueCache->backgroundUpdateLru(_config.upkeepValueCacheBatchSize); + _valueCache->backgroundPreventiveEviction(_config.upkeepValueCacheBatchSize); // Ensures a free margin + + uint32_t keyDirIndex = 0; + uint32_t keyHash = 0; + uint32_t keySize = 0; + uint32_t oldValueSize = 0; + uint16_t oldFileId = 0; + ValueLoc oldCacheLoc = NotStored; + + // Third priority: cleaning of entries with expired TTL + uint32_t batchSize = _config.upkeepValueCacheBatchSize; + while (batchSize > 0) { + // Probe a KeyDir entry with expired TTL and remove the entry after checking it again under lock + if ((keyDirIndex = _keyDir->backgroundExpiredKeyCleaning(batchSize)) == NotStored) { continue; } + + // Invalidate the entry iff valid and with an expired TTL. This time, under write lock. 
+
+ void mergeThreadEntry()
+ {
+ using namespace litecask::detail;
+ log(LogLevel::Debug, "Merge thread started");
+
+ // Merge service loop
+ while (!_mergeExit.load()) {
+ {
+ // Wait for a merge period or an explicit request
+ std::unique_lock lk(_mergeMutex);
+ _mergeCv.wait_for(lk, std::chrono::milliseconds(_config.mergeCyclePeriodMs),
+ [this] { return _mergeExit.load() || _mergeWork.load(); });
+ if (_mergeExit.load()) continue;
+ }
+
+ log(LogLevel::Debug, "Merge process started");
+
+ // Snapshot the protected configuration
+ _mxConfig.lock();
+ Config c = _config;
+ _mxConfig.unlock();
+ ++_stats.mergeCycleQty;
+
+ // Analyze the data files to select which ones to merge, if any
+ lcVector<MergeFileInfo> mergeInfos;
+ if (isItWorthMerging(c.mergeTriggerDataFileFragmentationPercentage, c.mergeTriggerDataFileDeadByteThreshold)) {
+ selectDataFilesToMerge(c.mergeSelectDataFileFragmentationPercentage, c.mergeSelectDataFileDeadByteThreshold,
+ c.mergeSelectDataFileSmallSizeTheshold, mergeInfos);
+ }
+
+ if (!mergeInfos.empty()) {
+ // Mandatory switch of the active file for the following reasons:
+ // - it ensures that the previous active file can be merged, as it was potentially selected
+ // - it ensures that the naming of the new compacted data files is unique
+ _mxActiveFile.lock();
+ lcString mergeBasename = createNewActiveDataFileUnlocked();
+ _mxActiveFile.unlock();
+
+ // Create the new compacted data files from the selected files
+ createMergedDataFiles(mergeInfos, mergeBasename, c.dataFileMaxBytes);
+
+ // Add the new data files, remove the old ones, and update the in-memory KeyDir
+ replaceDataFiles(mergeInfos);
+
+ ++_stats.mergeCycleWithMergeQty;
+ }
+
+ if (_someHintFilesAreMissing) {
+ // This flag is set once at database opening, so this section will be run once too
+ _someHintFilesAreMissing = false;
+ for (uint32_t fileId = 0; fileId < _dataFiles.size(); ++fileId) {
+ if (fileId == _activeDataFileId) continue;
+ const DataFile* dfd = _dataFiles[fileId];
+ fs::path hintFilename = fs::path(dfd->filename).replace_extension(HintFileSuffix);
+ if (osIsValidHandle(dfd->handle) && !fs::exists(hintFilename)) {
+ log(LogLevel::Debug, "Creating the missing hint file for data file %s", dfd->filename.c_str());
+ createHintFile(dfd->filename, hintFilename);
+ ++_stats.hintFileCreatedQty;
+ }
+ }
+ }
+
+ _mergeWork.store(false); // Work finished
+ } // End of service loop
+
+ log(LogLevel::Debug, "Merge thread stopped");
+ }
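+
+ // Reminder of the file suffixes handled below (the constants are defined earlier in this header):
+ //   DataFileSuffix:     append-only record files, the persistent content of the store
+ //   HintFileSuffix:     per-data-file summary (keys and metadata, no values) used for fast loading
+ //   TmpFileSuffix:      merge output not yet atomically renamed, discarded at the next opening
+ //   ToRemoveFileSuffix: tag requesting the deletion of an obsolete data file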
+
+ // File cleaning before opening the data store. The cleaning instructions come from the file extensions.
+ // Robustness comes from the atomic nature of some file operations (creation and renaming)
+ Status sanitizeAndCollectDataFiles(const fs::path& dbDirectory, uint64_t& maxDataFileIndex, lcVector<lcString>& baseDataFilenames)
+ {
+ using namespace litecask::detail;
+ fs::path dbDirectoryPath = dbDirectory / "";
+
+ baseDataFilenames.clear();
+ maxDataFileIndex = 1;
+ lcVector entries;
+ if (!osGetDirContent(dbDirectoryPath, entries)) { return Status::CannotOpenStore; }
+
+ struct OrderedDataFiles {
+ lcString name;
+ double index;
+ };
+ lcVector<OrderedDataFiles> orderedDataFiles;
+
+ for (const auto& e : entries) {
+ if (e.isDir) continue;
+ fs::path entryFilename = dbDirectoryPath / e.name;
+
+ // Remove ".tmp" files which correspond to an unfinished merge due to a crash
+ if (entryFilename.extension() == TmpFileSuffix) {
+ log(LogLevel::Info, "Removing unfinished merge file %s", entryFilename.c_str());
+ osRemoveFile(entryFilename);
+ }
+
+ // Remove ".to_remove" tag files and the associated data files: a cleanup of redundant files not removed due to a crash
+ else if (entryFilename.extension() == ToRemoveFileSuffix) {
+ log(LogLevel::Info, "Removing old data file %s", entryFilename.replace_extension(DataFileSuffix).string().c_str());
+ // First the data file (if it exists)
+ osRemoveFile(entryFilename.replace_extension(DataFileSuffix));
+ // then the hint file (if it exists)
+ osRemoveFile(entryFilename.replace_extension(HintFileSuffix));
+ // and last the removal instruction
+ osRemoveFile(entryFilename.replace_extension(ToRemoveFileSuffix));
+ }
+
+ // Remove standalone hint files
+ else if (entryFilename.extension() == HintFileSuffix &&
+ osGetFileSize(fs::path(entryFilename).replace_extension(DataFileSuffix)) <= 0) {
+ osRemoveFile(entryFilename);
+ }
+
+ // Remove zero sized data files
+ else if (entryFilename.extension() == DataFileSuffix && osGetFileSize(entryFilename) == 0) {
+ log(LogLevel::Info, "Removing zero size data file %s", entryFilename.c_str());
+ osRemoveFile(entryFilename);
+ }
+
+ // List the data files
+ else if (entryFilename.extension() == DataFileSuffix) {
+ // Get the data file number as a "double" (decimals are created by the merge process), as their order matters
+ double fileNumber = strtod(entryFilename.stem().string().c_str(), nullptr);
+ if (fileNumber > 0.) {
+ // Store the data file basename in the list
+ orderedDataFiles.push_back({entryFilename.replace_extension("").string(), fileNumber});
+ // Get the highest file number
+ if ((uint64_t)fileNumber > maxDataFileIndex) { maxDataFileIndex = (uint64_t)fileNumber; }
+ }
+ }
+ }
+
+ // Fill the output list of ordered data files (oldest data files first)
+ std::sort(orderedDataFiles.begin(), orderedDataFiles.end(),
+ [](const OrderedDataFiles& a, const OrderedDataFiles& b) { return a.index < b.index; });
+ baseDataFilenames.reserve(orderedDataFiles.size());
+ for (const auto& e : orderedDataFiles) baseDataFilenames.push_back(e.name);
+
+ return Status::Ok;
+ }
+
+ bool createHintFile(const lcString& readDataFilename, const fs::path& writeHintFilename)
+ {
+ using namespace litecask::detail;
+ log(LogLevel::Info, "Creating hint file for %s", readDataFilename.c_str());
+
+ // We use fopen/fread/fwrite here because of standard usage and the provided buffering
+ FILE* fhr = osFopen(readDataFilename, "rb");
+ assert(fhr);
+ FILE* fhw = osFopen(writeHintFilename.string() + TmpFileSuffix, "wb");
+ assert(fhw);
+
+ DataFileEntry header;
+ lcVector<uint8_t> buf(1024);
+ bool isOk = true;
+ uint32_t fileOffset = 0;
+ uint32_t fileIncrement = 0;
+
+ while (fread(&header, sizeof(DataFileEntry), 1, fhr) == 1) {
+ uint32_t keySize = header.keySize;
+ uint32_t keyIndexSize = header.keyIndexSize;
+ uint32_t valueSize = header.valueSize;
+
+ if (keySize == 0) {
+ log(LogLevel::Error, "Cannot create the hint file for %s: a key in the data file has a null (=corrupted) size",
+ readDataFilename.c_str());
+ isOk = false;
+ break;
+ }
+
+ if (keyIndexSize > MaxKeyIndexQty * sizeof(KeyIndex)) {
+ log(LogLevel::Error, "Cannot create the hint file for %s: a key has a too large index (byte size=%d > %" PRId64 ")",
+ readDataFilename.c_str(), keyIndexSize, MaxKeyIndexQty * sizeof(KeyIndex));
+ isOk = false;
+ break;
+ }
+
+ if (valueSize == DeletedEntry) {
+ // Tombstone case (no key index stored)
+ if (buf.size() < keySize) buf.resize(keySize);
+ if (fread(buf.data(), 1, keySize, fhr) != keySize) {
+ log(LogLevel::Error,
+ "Cannot create the hint file for %s: unable to read all the bytes (%d) of the deleted key at file offset %u",
+ readDataFilename.c_str(), keySize, fileOffset);
+ isOk = false;
+ break;
+ }
+ keyIndexSize = 0;
+ fileIncrement = sizeof(DataFileEntry) + keySize;
+ }
+
+ else {
+ uint32_t allSize = keySize + keyIndexSize + valueSize;
+ if (buf.size() < allSize) buf.resize(allSize);
+ if (fread(buf.data(), 1, allSize, fhr) != allSize) {
+ log(LogLevel::Error,
+ "Cannot create the hint file for %s: unable to read all the bytes (%d) of the key and value at file offset %u",
+ readDataFilename.c_str(), allSize, fileOffset);
+ isOk = false;
+ break;
+ }
+ fileIncrement = (uint32_t)sizeof(DataFileEntry) + allSize;
+ }
+
+ HintFileEntry hfe{fileOffset, header.expTimeSec, valueSize, (uint16_t)keySize, (uint8_t)keyIndexSize, 0};
+ if (fwrite(&hfe, sizeof(HintFileEntry), 1, fhw) != 1 ||
+ fwrite(buf.data(), 1, keySize + keyIndexSize, fhw) != keySize + keyIndexSize) {
+ log(LogLevel::Error, "Cannot create the hint file for %s: unable to write the hint entry (size=%" PRId64 ")",
+ readDataFilename.c_str(), sizeof(HintFileEntry) + keySize + keyIndexSize);
+ isOk = false;
+ break;
+ }
+
+ fileOffset += fileIncrement;
+ }
+
+ fclose(fhw);
+ fclose(fhr);
+
+ if (isOk) {
+ [[maybe_unused]] bool isRenamingOk = osRenameFile(writeHintFilename.string() + TmpFileSuffix, writeHintFilename);
+ assert(isRenamingOk);
+ }
+ return isOk;
+ }
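+
+ // A hint file mirrors its data file record by record, minus the values. Illustrative record layout,
+ // as written by createHintFile above:
+ //   | HintFileEntry{fileOffset, expTimeSec, valueSize, keySize, keyIndexSize} | key | keyIndexes |
+ // The fileOffset field points back to the full record in the data file, so loading the hint file is
+ // enough to rebuild the KeyDir without reading any value.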
+ bool loadHintFile(const lcString& hintFilename, uint16_t fileId, ArenaAllocator& loadArena,
+ lcVector<LoadedKeyChunk>& keyEntries)
+ {
+ using namespace litecask::detail;
+ log(LogLevel::Debug, "Loading hint file %s", hintFilename.c_str());
+
+ keyEntries.clear();
+ loadArena.reset();
+
+ // We use fopen/fread/fwrite here because of standard usage and the provided buffering
+ int64_t fileSize = osGetFileSize(hintFilename);
+ if (fileSize <= 0) { return false; }
+ FILE* fh = osFopen(hintFilename, "rb");
+ if (!fh) { return false; }
+
+ // The hint file is fully loaded in one call, as the keys and key indexes are all needed anyway
+ uint8_t* buf = loadArena.allocate(fileSize);
+ assert(buf && "The hint file is too big");
+ size_t readSize = fread(buf, 1, fileSize, fh);
+ fclose(fh);
+
+ HintFileEntry header;
+ bool isOk = true;
+ size_t readOffset = 0;
+
+ while (isOk && readOffset + sizeof(HintFileEntry) < readSize) {
+ // Copy due to uncontrolled alignment
+ memcpy(&header, &buf[readOffset], sizeof(HintFileEntry));
+
+ // Parse the header
+ if (header.keySize == 0) {
+ log(LogLevel::Error, "Cannot load the hint file %s: a key has a null (=corrupted) size", hintFilename.c_str());
+ isOk = false;
+ break;
+ }
+
+ if (header.keyIndexSize > MaxKeyIndexQty * sizeof(KeyIndex)) {
+ log(LogLevel::Error, "Cannot read the hint file %s: an index has a too large value (byte size=%d > %" PRId64 ")",
+ hintFilename.c_str(), header.keyIndexSize, MaxKeyIndexQty * sizeof(KeyIndex));
+ isOk = false;
+ break;
+ }
+
+ if (header.keyIndexSize & 0x1) {
+ log(LogLevel::Error, "Cannot read the hint file %s: an index has an odd size (byte size=%d)", hintFilename.c_str(),
+ header.keyIndexSize);
+ isOk = false;
+ break;
+ }
+
+ size_t entrySize = sizeof(HintFileEntry) + (int)header.keySize + (int)header.keyIndexSize;
+ if (readOffset + entrySize > readSize) {
+ log(LogLevel::Warn, "The hint file %s is corrupted", hintFilename.c_str());
+ break; // End of file reached (and corrupted last entry)
+ }
+ uint8_t* key = &buf[readOffset + sizeof(HintFileEntry)];
+ uint64_t keyHash = LITECASK_HASH_FUNC(key, header.keySize);
+ uint8_t* keyIndexes = key + header.keySize;
+
+ // Note: the key and keyIndexes pointers are persistent in the memory arena (until it is reset)
+ // The changeCounter initialized with the readOffset provides some spreading of the initial value
+ keyEntries.push_back({{header.expTimeSec, header.valueSize, NotStored, header.fileOffset, fileId, header.keySize,
+ header.keyIndexSize, (uint8_t)readOffset},
+ (uint32_t)keyHash,
+ key,
+ keyIndexes});
+ readOffset += entrySize;
+ }
+
+ return isOk;
+ }
+
+ bool loadDataFile(const lcString& dataFilename, uint16_t fileId, ArenaAllocator& loadArena,
+ lcVector<LoadedKeyChunk>& keyEntries)
+ {
+ using namespace litecask::detail;
+ log(LogLevel::Debug, "Loading data file %s", dataFilename.c_str());
+
+ keyEntries.clear();
+
+ // We use fopen/fread/fwrite here because of the provided buffering and sequential reading
+ FILE* fh = osFopen(dataFilename, "rb");
+ if (!fh) return false;
+
+ DataFileEntry header;
+ lcVector<uint8_t> buf;
+ bool isOk = true;
+ uint32_t fileOffset = 0;
+ uint64_t valueHash = 0;
+ uint32_t fileIncrement = 0;
+
+ while (isOk && fread(&header, sizeof(DataFileEntry), 1, fh) == 1) {
+ uint32_t keySize = header.keySize;
+ uint32_t keyIndexSize = header.keyIndexSize;
+ uint32_t valueSize = header.valueSize;
+
+ if (keySize == 0) {
+ log(LogLevel::Error,
+ "Cannot load the data file %s: a key has a null (=corrupted) size (value size is said to be %u) at file offset %u",
+ dataFilename.c_str(), valueSize, fileOffset);
+
+    bool loadDataFile(const lcString& dataFilename, uint16_t fileId, ArenaAllocator& loadArena,
+                      lcVector& keyEntries)
+    {
+        using namespace litecask::detail;
+        log(LogLevel::Debug, "Loading data file %s", dataFilename.c_str());
+
+        keyEntries.clear();
+
+        // We use fopen/fread here for the buffering they provide and the sequential reading
+        FILE* fh = osFopen(dataFilename, "rb");
+        if (!fh) return false;
+
+        DataFileEntry header;
+        lcVector<uint8_t> buf;
+        bool     isOk          = true;
+        uint32_t fileOffset    = 0;
+        uint64_t valueHash     = 0;
+        uint32_t fileIncrement = 0;
+
+        while (isOk && fread(&header, sizeof(DataFileEntry), 1, fh) == 1) {
+            uint32_t keySize      = header.keySize;
+            uint32_t keyIndexSize = header.keyIndexSize;
+            uint32_t valueSize    = header.valueSize;
+
+            if (keySize == 0) {
+                log(LogLevel::Error,
+                    "Cannot load the data file %s: a key has a null (=corrupted) size (value size is said to be %u) at file offset %u",
+                    dataFilename.c_str(), valueSize, fileOffset);
+                isOk = false;
+                break;
+            }
+
+            if (keyIndexSize > MaxKeyIndexQty * sizeof(KeyIndex)) {
+                log(LogLevel::Error, "Cannot load the data file %s: a key index is too big (byte size=%u > %" PRId64 ")",
+                    dataFilename.c_str(), keyIndexSize, (int64_t)(MaxKeyIndexQty * sizeof(KeyIndex)));
+                isOk = false;
+                break;
+            }
+
+            if (keyIndexSize & 0x1) {
+                log(LogLevel::Error, "Cannot load the data file %s: a key index has an odd size (byte size=%u)", dataFilename.c_str(),
+                    keyIndexSize);
+                isOk = false;
+                break;
+            }
+
+            if (valueSize != DeletedEntry) {
+                if (buf.size() < keySize + keyIndexSize + valueSize) buf.resize(keySize + keyIndexSize + valueSize);
+                if (fread(buf.data(), 1, keySize + keyIndexSize + valueSize, fh) != keySize + keyIndexSize + valueSize) {
+                    log(LogLevel::Warn,
+                        "Cannot load the data file %s: unable to read all the bytes (%u) of the entry at file offset %u. Maybe the "
+                        "last entry was not fully written...",
+                        dataFilename.c_str(), keySize + keyIndexSize + valueSize, fileOffset);
+                    break;
+                }
+                valueHash     = LITECASK_HASH_FUNC(&buf[keySize], valueSize);
+                fileIncrement = (uint32_t)sizeof(DataFileEntry) + keySize + keyIndexSize + valueSize;
+            }
+
+            else {
+                if (buf.size() < keySize) buf.resize(keySize);
+                if (fread(buf.data(), 1, keySize, fh) != keySize) {
+                    log(LogLevel::Warn,
+                        "Cannot load the data file %s: unable to read all the bytes (%u) of the deleted entry at file offset %u. Maybe "
+                        "the last entry was not fully written...",
+                        dataFilename.c_str(), keySize, fileOffset);
+                    break;
+                }
+                valueHash     = 0;
+                keyIndexSize  = 0;  // No key index shall be stored for tombstones
+                fileIncrement = sizeof(DataFileEntry) + keySize;
+            }
+
+            // Copy the key and index into the memory arena
+            uint8_t* persistentPtr = loadArena.allocate(keySize + keyIndexSize);
+            memcpy(persistentPtr, buf.data(), keySize + keyIndexSize);
+            uint8_t* key        = persistentPtr;
+            uint64_t keyHash    = LITECASK_HASH_FUNC(key, keySize);
+            uint8_t* keyIndexes = key + keySize;
+            uint32_t checksum   = (uint32_t)(keyHash ^ valueHash);
+            if (header.checksum != checksum) {
+                log(LogLevel::Warn, "Cannot load the data file %s: the entry is corrupted (bad checksum) at file offset %u",
+                    dataFilename.c_str(), fileOffset);
+                isOk = false;
+                break;
+            }
+            // The changeCounter is initialized with the checksum to spread the initial values
+            keyEntries.push_back(
+                {{header.expTimeSec, valueSize, NotStored, fileOffset, fileId, (uint16_t)keySize, (uint8_t)keyIndexSize, (uint8_t)checksum},
+                 (uint32_t)keyHash,
+                 key,
+                 keyIndexes});
+
+            fileOffset += fileIncrement;
+        }
+
+        fclose(fh);
+        return isOk;
+    }
+
+    void flushWriteBufferUnlocked()
+    {
+        assert(_activeDataOffset >= _activeFlushedDataOffset);
+        if (_activeDataOffset - _activeFlushedDataOffset > 0) {
+            lcOsFileHandle fh = _dataFiles[_activeDataFileId]->handle;
+            assert(osIsValidHandle(fh));
+            if (!osOsWrite(fh, _writeBuffer.data(), _activeDataOffset - _activeFlushedDataOffset)) {
+                fatalHandler("flushWriteBufferUnlocked: Unable to flush the write buffer (size=%u)",
+                             _activeDataOffset - _activeFlushedDataOffset);
+            }
+            _activeFlushedDataOffset = _activeDataOffset;
+        }
+    }
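+
+    // Bookkeeping sketch for the flush above (names are the members actually used):
+    //
+    //     pending = _activeDataOffset - _activeFlushedDataOffset;   // bytes still in _writeBuffer
+    //     osOsWrite(handle, _writeBuffer.data(), pending);          // hand them to the OS
+    //     _activeFlushedDataOffset = _activeDataOffset;             // nothing pending anymore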
+
+    // The "data file" *write* lock must be taken before this call
+    uint16_t getFreeDataFileIdUnlocked()
+    {
+        uint16_t fileId = 0xFFFF;
+        if (!_freeDataFileIds.empty()) {
+            fileId = _freeDataFileIds.back();
+            _freeDataFileIds.pop_back();
+        } else {
+            fileId = (uint16_t)_dataFiles.size();
+            _dataFiles.push_back(new detail::DataFile());
+            ++_stats.dataFileMaxQty;
+        }
+        ++_stats.dataFileCreationQty;
+        return fileId;
+    }
+
+    // The default handler appends to a rotating log file ("litecask.log") in the datastore directory
+    void defaultLogHandler(LogLevel level, const char* message, bool closeDbNotification) const
+    {
+        constexpr const char* levelStr[5] = {"[debug]", "[info ]", "[warn ]", "[error]", "[FATAL]"};
+        static FILE*          fileHandle  = nullptr;
+
+        // Log file management
+        if (closeDbNotification) {
+            if (fileHandle) {
+                fclose(fileHandle);
+                fileHandle = nullptr;
+            }
+            return;
+        }
+        if (!fileHandle) {
+            if (!_directory.empty()) {
+                // Rotate big log files (checked at open time only)
+                if (osGetFileSize(_directory / "litecask.log") > _maxLogFileBytes) {
+                    // Return status is ignored on purpose, as the files may not exist
+                    osRenameFile(_directory / "litecask4.log", _directory / "litecask5.log");
+                    osRenameFile(_directory / "litecask3.log", _directory / "litecask4.log");
+                    osRenameFile(_directory / "litecask2.log", _directory / "litecask3.log");
+                    osRenameFile(_directory / "litecask1.log", _directory / "litecask2.log");
+                    osRenameFile(_directory / "litecask.log", _directory / "litecask1.log");
+                }
+                // Create or append to the current log file
+                fileHandle = osFopen(_directory / "litecask.log", "a");
+            }
+            if (!fileHandle) { return; }
+        }
+
+        auto        now     = std::chrono::system_clock::now();
+        std::time_t nowDate = std::chrono::system_clock::to_time_t(now);
+        std::string dateStr(32, '\0');
+        std::strftime(dateStr.data(), dateStr.size(), "%Y-%m-%d %H:%M:%S", std::localtime(&nowDate));
+        uint64_t dateMs = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
+
+        if (fprintf(fileHandle, "[%s.%03d] %s %s\n", dateStr.c_str(), (int)(dateMs % 1000), levelStr[(int)level], message) < 0) {
+            // Write issue: close the file pointer
+            fclose(fileHandle);
+            fileHandle = nullptr;
+        }
+    }
+
+    void LITECASK_PRINTF_CHECK(3, 4) log(LogLevel level, LITECASK_PRINTF_FORMAT_STRING const char* format, ...)
+    {
+        // Filter the log based on the level
+        if (level < _logLevel) { return; }
+
+        // Format the message
+        char    message[512];
+        va_list args;
+        va_start(args, format);
+        vsnprintf(message, sizeof(message), format, args);
+        va_end(args);
+
+        // Thread-safe forwarding to the handler
+        std::lock_guard lk(_logMx);
+        _logHandler(level, message, false);
+    }
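+
+    // Thanks to LITECASK_PRINTF_CHECK, format mismatches in log() and fatalHandler() are caught
+    // at compile time on GCC/Clang (no such check on other compilers). Hypothetical calls:
+    //
+    //     log(LogLevel::Warn, "offset=%u", (uint32_t)offset);   // OK
+    //     log(LogLevel::Warn, "offset=%u", "oops");             // rejected with -Wformat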
+
+    void LITECASK_PRINTF_CHECK(2, 3) fatalHandler(LITECASK_PRINTF_FORMAT_STRING const char* format, ...)
+    {
+        // Format the message
+        char    message[512];
+        va_list args;
+        va_start(args, format);
+        vsnprintf(message, sizeof(message), format, args);
+        va_end(args);
+
+        // Thread-safe forwarding to the handler
+        {
+            std::lock_guard lk(_logMx);
+            _logHandler(LogLevel::Fatal, message, true);
+        }
+
+        // Exit in error
+        exit(1);
+    }
+
+    // Only for use in tests (with "private:" disabled)
+    void setTestTimeFunction(const std::function<uint32_t()>& testTimeFunc) { _getTestTime = testTimeFunc; }
+
+    void setTestLogMaxFileBytes(int64_t maxLogFileBytes) { _maxLogFileBytes = maxLogFileBytes; }
+
+    void updateNow()
+    {
+        if (LITECASK_UNLIKELY(_getTestTime)) {
+            // This optional time function is provided in the context of testing only
+            _nowTimeSec = _getTestTime();
+        } else {
+            _nowTimeSec =
+                (uint32_t)std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+        }
+        _keyDir->setNow(_nowTimeSec);
+    }
+
+    bool     _isInitialized = false;
+    uint32_t _nowTimeSec    = 0;  // Unix timestamp in seconds
+    lcVector<detail::DataFile*> _dataFiles;
+    lcVector<uint16_t>          _freeDataFileIds;
+    detail::KeyDirMap*  _keyDir     = nullptr;
+    detail::ValueCache* _valueCache = nullptr;
+    detail::IndexMap*   _indexMap   = nullptr;
+    lcVector<uint8_t>   _writeBuffer;
+    uint64_t _maxDataFileIndex        = 1;
+    uint32_t _activeDataOffset        = 0;
+    uint32_t _activeFlushedDataOffset = 0;       // Last "write buffer" write on disk
+    uint16_t _activeDataFileId        = 0xFFFF;
+    uint64_t _dataFileMaxBytes = 100'000'000;  // Copied from the config for efficiency in a multithreaded environment
+    int64_t  _maxLogFileBytes  = 10'000'000;
+
+    alignas(detail::CpuCacheLine) mutable detail::RWLock _mxDataFiles;    // lockRead: using _dataFiles, lockWrite: data file changes
+    alignas(detail::CpuCacheLine) mutable detail::RWLock _mxWriteBuffer;  // Lock for using the current write buffer
+    alignas(detail::CpuCacheLine) mutable detail::RWLock _mxIndexMap;     // Lock for using the index lookup
+    alignas(detail::CpuCacheLine) mutable std::mutex _mxActiveFile;       // Lock for using the active file (write entry)
+    alignas(detail::CpuCacheLine) mutable std::mutex _mxKeyDir;           // Lock for writing the hashmap
+    alignas(detail::CpuCacheLine) mutable std::mutex _mxConfig;           // Lock for reading or writing the config
+
+    // Control of the merge operations thread. Operations may be long
+    std::thread             _mergeThread;
+    std::mutex              _mergeMutex;
+    std::condition_variable _mergeCv;
+    std::atomic<bool>       _mergeWork = false;
+    std::atomic<bool>       _mergeExit = false;
+    bool _someHintFilesAreMissing = false;
+
+    // Control of the upkeep operations thread (KeyDir resizing, cache queues, ...). Fine-grained operations
+    std::thread             _upkeepThread;
+    std::mutex              _upkeepMutex;
+    std::condition_variable _upkeepCv;
+    std::atomic<bool>       _upkeepWork = false;
+    std::atomic<bool>       _upkeepExit = false;
+    uint64_t _upkeepLastActiveFlushedTimeMs     = 0;
+    uint32_t _upkeepLastActiveFlushedDataOffset = detail::NotStored;
+    uint16_t _upkeepLastActiveDataFileId        = 0xFFFF;
+
+    // Logging
+    std::mutex _logMx;
+    LogLevel   _logLevel = LogLevel::Info;
+
+    std::function<void(LogLevel, const char*, bool)> _logHandler;
+    std::function<uint32_t()>                        _getTestTime;
+
+    Config            _config;
+    fs::path          _directory;
+    DatastoreCounters _stats;
+};
+
+}  // namespace litecask
diff --git a/src/server.cpp b/src/server.cpp
index b7ac0cb..39c649b 100644
--- a/src/server.cpp
+++ b/src/server.cpp
@@ -6,6 +6,8 @@
 #include
 #include  // For random number generation
 #include  // For seeding random number generator
+#include <vector>       // For getAllKeys
+#include <string_view>  // For litecask values

 namespace dropshell {

@@ -30,26 +32,61 @@ private:
 Server::Server(const ServerConfig& config) : config_(config), running_(false) {
-    // Create object store directory if it doesn't exist
-    std::filesystem::create_directories(config_.object_store_path);
+    // Ensure the object store directory exists
+    try {
+        std::filesystem::create_directories(config_.object_store_path);
+    } catch (const std::filesystem::filesystem_error& e) {
+        std::cerr << "Failed to create object store directory: " << config_.object_store_path << " - " << e.what() << std::endl;
+        // Consider throwing an exception or exiting
+        return;
+    }
+
+    // Set up and open the litecask datastore
+    datastore_path_ = config_.object_store_path / "index";
+    try {
+        std::filesystem::create_directories(datastore_path_);
+    } catch (const std::filesystem::filesystem_error& e) {
+        std::cerr << "Failed to create datastore directory: " << datastore_path_ << " - " << e.what() << std::endl;
+        // Consider throwing an exception or exiting
+        return;
+    }
+
+    if (datastore_.open(datastore_path_.string()) != litecask::Status::Ok) {
+        std::cerr << "Failed to open litecask datastore at " << datastore_path_ << std::endl;
+        // Consider throwing an exception or exiting
+    }
 }

 Server::~Server() {
     stop();
+    datastore_.close();
 }

 bool Server::start() {
+    if (!datastore_.isOpen()) {
+        std::cerr << "Datastore is not open. Cannot start server." << std::endl;
+        return false;
+    }
     setup_routes();
     std::cout << "Server starting on " << config_.host << ":" << config_.port << std::endl;
     running_ = true;
-    return server_.listen(config_.host.c_str(), config_.port);
+    // listen() blocks until stop() is called; run it on a worker thread later if a
+    // non-blocking start() is needed. Kept blocking for simplicity.
+    if (!server_.listen(config_.host.c_str(), config_.port)) {
+        running_ = false;
+        std::cerr << "Failed to listen on " << config_.host << ":" << config_.port << std::endl;
+        return false;
+    }
+    return true;  // Reached on clean shutdown, once stop() unblocks listen()
 }

 void Server::stop() {
     if (running_) {
-        server_.stop();
+        server_.stop();  // httplib's stop() is non-blocking
         running_ = false;
+        std::cout << "Server stopped." << std::endl;
     }
 }
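+// Note: server_.listen() blocks until stop() is called. If a non-blocking start() is needed
+// later, a sketch (listen_thread_ would be a new std::thread member; wait_until_ready() exists
+// in recent cpp-httplib versions):
+//
+//     listen_thread_ = std::thread([this] { server_.listen(config_.host.c_str(), config_.port); });
+//     server_.wait_until_ready();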

@@ -75,62 +112,108 @@ void Server::setup_routes() {
     });

     // Upload object
-    server_.Put("/([^/]+)/([^/]+)", [this](const httplib::Request& req, httplib::Response& res) {
+    server_.Put("/([^/]+)/(.*)", [this](const httplib::Request& req, httplib::Response& res) {  // Regex adjusted for label:tag
         handle_put_object(req, res);
     });
 }

 void Server::handle_get_object(const httplib::Request& req, httplib::Response& res) {
-    const auto& path = req.matches[1];
-    uint64_t hash = 0;
+    const auto& key = req.matches[1].str();  // Use .str() to get a std::string
+    std::string hash_str;

-    try {
-        hash = std::stoull(path);
-    } catch (...) {
-        // Try to find hash by label:tag
-        auto it = label_tag_index_.find(path);
-        if (it == label_tag_index_.end()) {
-            res.status = 404;
-            res.set_content("Object not found", "text/plain");
-            return;
+    // Check whether the key looks like a hash (purely numeric)
+    bool is_hash_lookup = !key.empty();
+    for (char c : key) {
+        if (!std::isdigit(static_cast<unsigned char>(c))) {
+            is_hash_lookup = false;
+            break;
         }
-        hash = it->second;
     }

-    std::filesystem::path file_path = config_.object_store_path / std::to_string(hash);
-    if (!std::filesystem::exists(file_path)) {
+    if (!is_hash_lookup) {
+        // Look up by label:tag in the datastore
+        std::string_view value_sv;
+        auto rc = datastore_.get(key, value_sv);
+        if (rc == litecask::Status::Ok) {
+            hash_str = std::string(value_sv);
+        } else if (rc == litecask::Status::EntryNotFound) {
+            res.status = 404;
+            res.set_content("Object not found (label:tag)", "text/plain");
+            return;
+        } else {
+            std::cerr << "Datastore get error: " << static_cast<int>(rc) << std::endl;
+            res.status = 500;
+            res.set_content("Datastore error on get", "text/plain");
+            return;
+        }
+    } else {
+        // Look up directly by hash
+        hash_str = key;
+    }
+
+    // Construct the file path from the hash string
+    std::filesystem::path file_path = config_.object_store_path / hash_str;
+    if (!std::filesystem::exists(file_path) || !std::filesystem::is_regular_file(file_path)) {
         res.status = 404;
         res.set_content("Object file not found", "text/plain");
         return;
     }

     // Send file using Response::set_file_content
-    res.set_file_content(file_path.string());
+    // The content type is unknown here, so default to octet-stream
+    std::string content_type = "application/octet-stream";
+    res.set_file_content(file_path.string(), content_type);
+    // httplib sets the status to 200 automatically when set_file_content succeeds
 }

 void Server::handle_get_hash(const httplib::Request& req, httplib::Response& res) {
-    const auto& path = req.matches[1];
-    auto it = label_tag_index_.find(path);
-    if (it == label_tag_index_.end()) {
+    const auto& label_tag = req.matches[1].str();
+    std::string_view value_sv;
+    auto rc = datastore_.get(label_tag, value_sv);
+
+    if (rc == litecask::Status::Ok) {
+        res.set_content(std::string(value_sv), "text/plain");
+    } else if (rc == litecask::Status::EntryNotFound) {
         res.status = 404;
         res.set_content("Label:tag not found", "text/plain");
-        return;
+    } else {
+        std::cerr << "Datastore get error: " << static_cast<int>(rc) << std::endl;
+        res.status = 500;
+        res.set_content("Datastore error on get", "text/plain");
     }
-
-    res.set_content(std::to_string(it->second), "text/plain");
 }

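+// The directory listing below is plain CSV, one "label:tag,hash" pair per line, e.g.
+// (illustrative values only):
+//
+//     myapp:latest,1234567890
+//     myapp:v1.2,9876543210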
-void Server::handle_get_directory(const httplib::Request& req, httplib::Response& res) {
+void Server::handle_get_directory(const httplib::Request& /*req*/, httplib::Response& res) {
+    std::vector<std::string> keys;
+    // Assuming getAllKeys exists and is efficient enough -- check the litecask docs.
+    // This might be slow for very large datastores.
+    if (datastore_.getAllKeys(keys) != litecask::Status::Ok) {
+        std::cerr << "Failed to get all keys from datastore." << std::endl;
+        res.status = 500;
+        res.set_content("Failed to retrieve directory listing", "text/plain");
+        return;
+    }
+
     std::stringstream ss;
-    for (const auto& [labeltag, hash] : label_tag_index_) {
-        ss << labeltag << "," << hash << "\n";
+    std::string_view value_sv;
+    for (const auto& key : keys) {
+        auto rc = datastore_.get(key, value_sv);
+        if (rc == litecask::Status::Ok) {
+            ss << key << "," << std::string(value_sv) << "\n";
+        } else if (rc == litecask::Status::EntryNotFound) {
+            std::cerr << "Key found by getAllKeys but not found by get: " << key << std::endl;
+            // Skip this key
+        } else {
+            std::cerr << "Datastore get error for key " << key << ": " << static_cast<int>(rc) << std::endl;
+            // Skip this key
+        }
     }
     res.set_content(ss.str(), "text/plain");
 }

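+// Upload flow for reference (illustrative; <token> must be one of config_.write_tokens):
+//
+//     curl -X PUT --data-binary @object.bin http://<host>:<port>/<token>/<label>:<tag>
+//
+// On success the response body is the decimal hash under which the object was stored.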
 void Server::handle_put_object(const httplib::Request& req, httplib::Response& res) {
-    const auto& token = req.matches[1];
-    const auto& label_tag = req.matches[2];
+    const auto& token     = req.matches[1].str();
+    const auto& label_tag = req.matches[2].str();

     if (!validate_write_token(token)) {
         res.status = 403;
@@ -176,7 +259,8 @@ void Server::handle_put_object(const httplib::Request& req, httplib::Response& r
     }

     // Move file to final location
-    std::filesystem::path final_path = config_.object_store_path / std::to_string(hash);
+    std::string hash_str = std::to_string(hash);
+    std::filesystem::path final_path = config_.object_store_path / hash_str;
     if (!std::filesystem::exists(final_path)) {
         try {
             std::filesystem::rename(temp_path, final_path);
@@ -192,20 +276,37 @@ void Server::handle_put_object(const httplib::Request& req, httplib::Response& r
         // temp_file_deleter will automatically remove the temp file
     }

-    // Update indices
-    label_tag_index_[label + ":" + tag] = hash;
+    // Update the datastore index
+    auto rc = datastore_.put(label_tag, hash_str);
+    if (rc != litecask::Status::Ok) {
+        std::cerr << "Datastore put error: " << static_cast<int>(rc) << std::endl;
+        // The object is stored but the index update failed: report the error and remove the
+        // freshly stored object file. Note that if the object already existed (deduplicated
+        // store), other label:tags may still reference it, so removal is only safe for new files.
+        res.status = 500;
+        res.set_content("Failed to update datastore index", "text/plain");
+        try {
+            if (!std::filesystem::remove(final_path)) {
+                std::cerr << "Failed to remove object file after index failure: " << final_path << std::endl;
+            }
+        } catch (const std::filesystem::filesystem_error& e) {
+            std::cerr << "Error removing object file after index failure: " << e.what() << std::endl;
+        }
+        return;
+    }

-    res.set_content(std::to_string(hash), "text/plain");
+    res.set_content(hash_str, "text/plain");
 }

 bool Server::validate_write_token(const std::string& token) const {
+    // A token is valid if it appears in the configured write token list
     return std::find(config_.write_tokens.begin(), config_.write_tokens.end(), token) != config_.write_tokens.end();
 }

 std::pair<std::string, std::string> Server::parse_label_tag(const std::string& label_tag) const {
     size_t colon_pos = label_tag.find(':');
-    if (colon_pos == std::string::npos) {
-        return {"", ""};
+    if (colon_pos == std::string::npos || colon_pos == 0 || colon_pos == label_tag.length() - 1) {
+        return {"", ""};  // Ensure neither label nor tag is empty
     }
     return {label_tag.substr(0, colon_pos), label_tag.substr(colon_pos + 1)};
 }
diff --git a/src/server.hpp b/src/server.hpp
index 04711e4..e209867 100644
--- a/src/server.hpp
+++ b/src/server.hpp
@@ -3,11 +3,12 @@

 #include "config.hpp"
 #include "httplib.hpp"
+#include "litecask.hpp"

 #include
-#include <unordered_map>
 #include
 #include
 #include
+#include <filesystem>

 namespace dropshell {

@@ -20,12 +21,6 @@ public:
     void stop();

 private:
-    struct ObjectInfo {
-        std::string label;
-        std::string tag;
-        uint64_t hash;
-    };
-
     void setup_routes();
     void handle_get_object(const httplib::Request& req, httplib::Response& res);
     void handle_get_hash(const httplib::Request& req, httplib::Response& res);
@@ -36,8 +31,9 @@ private:

     const ServerConfig& config_;
     httplib::Server server_;
+    litecask::Datastore datastore_;
+    std::filesystem::path datastore_path_;
     std::atomic<bool> running_;
-    std::unordered_map<std::string, uint64_t> label_tag_index_;
 };

 } // namespace dropshell