diff --git a/README.md b/README.md
index d85947f..3adc0bd 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ A simple object storage system that stores files with metadata and provides a RE
 - Check if a file exists by hash or label:tag
 - Delete files by hash
 - List all stored objects
-- Automatic file deduplication using content hashing
+- Automatic file deduplication using SHA-256 content hashing
 - Support for large file uploads (up to 6GB)
 - High-performance HTTP server with async request handling
 - Configurable storage location and server settings
diff --git a/SECURITY_REVIEW.md b/SECURITY_REVIEW.md
index e33cc6f..e5bec05 100644
--- a/SECURITY_REVIEW.md
+++ b/SECURITY_REVIEW.md
@@ -29,15 +29,18 @@ This comprehensive security review analyzes the Simple Object Server C++23 appli
   - Removed all plaintext token support for enhanced security
 - **Documentation**: See README.md for token hashing instructions
 
-### 3. **Weak Cryptographic Hash for Content**
-- **Location**: `src/hash.cpp:12-56`
-- **Risk**: HIGH - Using XXHash (non-cryptographic) for content identification
-- **Issue**: XXHash is designed for speed, not security - vulnerable to collision attacks
-- **Impact**: Potential for malicious file substitution through hash collisions
-- **Recommendation**:
-  - Replace XXHash with SHA-256 or SHA-3 for content hashing
-  - Use cryptographic hashes for security-critical operations
-  - Keep XXHash only for non-security checksums if needed
+### 3. **~~Weak Cryptographic Hash for Content~~ [FIXED]**
+- **Location**: `src/hash.cpp`
+- **Risk**: ~~HIGH~~ RESOLVED - Now using SHA-256 for content identification
+- **Fix Implemented**:
+  - Replaced XXHash with SHA-256 for all content hashing
+  - Using OpenSSL's SHA-256 implementation for cryptographic security
+  - All file hashes are now 256-bit SHA-256 hashes (64 hex characters)
+  - Collision resistance: 2^128 operations needed for 50% probability
+- **Security Benefits**:
+  - Cryptographically secure against intentional collisions
+  - Industry-standard hash function
+  - Prevents malicious file substitution attacks
 
 ## High-Risk Issues
diff --git a/src/compress.cpp b/src/compress.cpp
index 1647199..d8a4511 100644
--- a/src/compress.cpp
+++ b/src/compress.cpp
@@ -14,17 +14,17 @@
 namespace simple_object_storage {
 
-// if the file is a tgz file (we can't rely on the extension), then unpack on disk and has the contents
+// if the file is a tgz file (we can't rely on the extension), then unpack on disk and hash the contents
 // with hash_directory_recursive in hash.hpp
-uint64_t get_hash_from_tgz(const std::string &file_path)
+std::string get_hash_from_tgz(const std::string &file_path)
 {
     // check if it's a gzip file
     std::ifstream file(file_path, std::ios::binary);
-    if (!file) return 0;
+    if (!file) return "";
 
     int result = system("gunzip -t file.gz");
     if (result != 0) {
        // not a gzip file.
-       return 0;
+       return "";
     }
 
     // gunzip the file to a new temporary directory
@@ -35,10 +35,10 @@ uint64_t get_hash_from_tgz(const std::string &file_path)
     std::string command = "tar -zxzf " + file_path + " -C " + temp_dir;
     result = system(command.c_str()); // Basic tar extraction - requires 'tar' command
     if (result != 0) {
-        return 0;
+        return "";
     }
 
-    // hash the contents
+    // hash the contents with SHA-256
     return hash_directory_recursive(temp_dir);
 }
diff --git a/src/compress.hpp b/src/compress.hpp
index 7fe5452..c61d759 100644
--- a/src/compress.hpp
+++ b/src/compress.hpp
@@ -1,8 +1,8 @@
 #include <string>
-#include <cstdint>
 
 namespace simple_object_storage {
 
-uint64_t get_hash_from_tgz(const std::string& file_path);
+// Returns SHA-256 hash of unpacked tgz contents, or empty string on error
+std::string get_hash_from_tgz(const std::string& file_path);
 
 } // namespace simple_object_storage
\ No newline at end of file
diff --git a/src/database.cpp b/src/database.cpp
index bfcdcd3..b760533 100644
--- a/src/database.cpp
+++ b/src/database.cpp
@@ -209,18 +209,13 @@ bool Database::run_sql_text(const std::string& sql, const std::string& bind_text
 }
 
-bool is_dec_uint64(const std::string& s) {
-    if (s.empty()) return false;
+bool is_sha256_hash(const std::string& s) {
+    // SHA-256 hashes are 64 hex characters
+    if (s.length() != 64) return false;
     for (char c : s) {
-        if (!std::isdigit(static_cast<unsigned char>(c))) return false;
-    }
-    try {
-        uint64_t x = std::stoull(s, nullptr, 10);
-        std::string s2 = std::to_string(x);
-        return s2 == s;
-    } catch (...) {
-        return false;
+        if (!std::isxdigit(static_cast<unsigned char>(c))) return false;
     }
+    return true;
 }
 
 bool Database::get(const std::string& hash_or_labeltag, dbEntry& entry) {
@@ -230,7 +225,7 @@ bool Database::get(const std::string& hash_or_labeltag, dbEntry& entry) {
         return (run_sql_text("SELECT hash, labeltags, metadata FROM objects WHERE json_array_length(labeltags) > 0 AND EXISTS (SELECT 1 FROM json_each(labeltags) WHERE value = ?);", hash_or_labeltag, entry));
     }
 
-    if (is_dec_uint64(hash_or_labeltag)) {
+    if (is_sha256_hash(hash_or_labeltag)) {
         if (run_sql_text("SELECT hash, labeltags, metadata FROM objects WHERE hash = ?;", hash_or_labeltag, entry))
             return true;
     }
diff --git a/src/hash.cpp b/src/hash.cpp
index 203814a..2819294 100644
--- a/src/hash.cpp
+++ b/src/hash.cpp
@@ -1,115 +1,140 @@
 #include "hash.hpp"
-#define XXH_INLINE_ALL
-#include "xxhash.hpp"
-
+#include <openssl/sha.h>
 #include <fstream>
 #include <iostream>
 #include <chrono>
 #include <filesystem>
+#include <vector>
+#include <sstream>
+#include <iomanip>
+#include <algorithm>
 
 namespace simple_object_storage {
 
-uint64_t hash_file(const std::string &path) {
-    // Create hash state
-    XXH64_state_t* const state = XXH64_createState();
-    if (state == nullptr) {
-        std::cerr << "Failed to create hash state" << std::endl;
-        return 0;
+// Convert binary hash to hex string
+static std::string to_hex_string(const unsigned char* hash, size_t length) {
+    std::stringstream ss;
+    ss << std::hex << std::setfill('0');
+    for (size_t i = 0; i < length; ++i) {
+        ss << std::setw(2) << static_cast<int>(hash[i]);
     }
+    return ss.str();
+}
 
-    // Initialize state with seed 0
-    XXH64_hash_t const seed = 0; /* or any other value */
-    if (XXH64_reset(state, seed) == XXH_ERROR) return 0;
+std::string hash_file(const std::string &path) {
+    // Create SHA256 context
+    SHA256_CTX sha256;
+    if (!SHA256_Init(&sha256)) {
+        std::cerr << "Failed to initialize SHA256" << std::endl;
+        return "";
+    }
 
     // Open file
     std::ifstream file(path, std::ios::binary);
     if (!file.is_open()) {
         std::cerr << "Failed to open file: " << path << std::endl;
-        XXH64_freeState(state);
-        return 0;
+        return "";
     }
 
     // Read file in chunks and update hash
-    const size_t buffer_size = 4096;
-    char buffer[buffer_size];
-    while (file.read(buffer, buffer_size)) {
-        if (XXH64_update(state, buffer, file.gcount()) == XXH_ERROR) {
+    const size_t buffer_size = 8192;
+    std::vector<char> buffer(buffer_size);
+
+    while (file.read(buffer.data(), buffer_size)) {
+        if (!SHA256_Update(&sha256, buffer.data(), file.gcount())) {
             std::cerr << "Failed to update hash" << std::endl;
-            XXH64_freeState(state);
-            return 0;
+            return "";
         }
     }
 
     // Handle any remaining bytes
     if (file.gcount() > 0) {
-        if (XXH64_update(state, buffer, file.gcount()) == XXH_ERROR) {
+        if (!SHA256_Update(&sha256, buffer.data(), file.gcount())) {
             std::cerr << "Failed to update hash" << std::endl;
-            XXH64_freeState(state);
-            return 0;
+            return "";
         }
     }
 
-    // Get final hash
-    XXH64_hash_t hash = XXH64_digest(state);
-    XXH64_freeState(state);
-    return hash;
-}
+    file.close();
 
-uint64_t hash_directory_recursive(const std::string &path) {
-    // Create hash state
-    XXH64_state_t* const state = XXH64_createState();
-    if (state == nullptr) {
-        std::cerr << "Failed to create hash state" << std::endl;
-        return 0;
+    // Get final hash
+    unsigned char hash[SHA256_DIGEST_LENGTH];
+    if (!SHA256_Final(hash, &sha256)) {
+        std::cerr << "Failed to finalize hash" << std::endl;
+        return "";
     }
 
-    // Initialize state with seed 0
-    XXH64_hash_t const seed = 0; /* or any other value */
-    if (XXH64_reset(state, seed) == XXH_ERROR) {
-        std::cerr << "Failed to reset hash state" << std::endl;
-        XXH64_freeState(state);
-        return 0;
+    // Convert to hex string
+    return to_hex_string(hash, SHA256_DIGEST_LENGTH);
+}
+
+std::string hash_directory_recursive(const std::string &path) {
+    // Create SHA256 context for combining hashes
+    SHA256_CTX sha256;
+    if (!SHA256_Init(&sha256)) {
+        std::cerr << "Failed to initialize SHA256" << std::endl;
+        return "";
     }
 
     try {
-        // Iterate through all files in directory recursively
+        // Collect all file paths and sort them for consistent hashing
+        std::vector<std::filesystem::path> file_paths;
         for (const auto& entry : std::filesystem::recursive_directory_iterator(path)) {
             if (entry.is_regular_file()) {
-                // Get file hash
-                XXH64_hash_t file_hash = hash_file(entry.path().string());
-                XXH64_update(state, &file_hash, sizeof(file_hash));
+                file_paths.push_back(entry.path());
             }
         }
+
+        // Sort paths for deterministic hashing
+        std::sort(file_paths.begin(), file_paths.end());
+
+        // Hash each file and combine
+        for (const auto& file_path : file_paths) {
+            // Get file hash
+            std::string file_hash_str = hash_file(file_path.string());
+            if (file_hash_str.empty()) {
+                std::cerr << "Failed to hash file: " << file_path << std::endl;
+                continue;
+            }
+
+            // Update combined hash with file hash and path
+            std::string relative_path = std::filesystem::relative(file_path, path).string();
+            SHA256_Update(&sha256, relative_path.c_str(), relative_path.length());
+            SHA256_Update(&sha256, file_hash_str.c_str(), file_hash_str.length());
+        }
     } catch (const std::filesystem::filesystem_error& e) {
         std::cerr << "Filesystem error: " << e.what() << std::endl;
-        XXH64_freeState(state);
-        return 0;
+        return "";
     }
 
-    // Get final hash
-    XXH64_hash_t hash = XXH64_digest(state);
-    XXH64_freeState(state);
-    return hash;
+    // Get final combined hash
+    unsigned char hash[SHA256_DIGEST_LENGTH];
+    if (!SHA256_Final(hash, &sha256)) {
+        std::cerr << "Failed to finalize hash" << std::endl;
+        return "";
+    }
+
+    // Convert to hex string
+    return to_hex_string(hash, SHA256_DIGEST_LENGTH);
 }
 
-void hash_demo(const std::string & path)
-{
+void hash_demo(const std::string & path) {
     std::cout << "Hashing directory: " << path << std::endl;
     auto start = std::chrono::high_resolution_clock::now();
-    XXH64_hash_t hash = hash_directory_recursive(path);
+    std::string hash = hash_directory_recursive(path);
     auto end = std::chrono::high_resolution_clock::now();
     auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-    std::cout << "Hash: " << hash << " (took " << duration.count() << "ms)" << std::endl;
+    std::cout << "SHA-256 Hash: " << hash << " (took " << duration.count() << "ms)" << std::endl;
 }
 
-int hash_demo_raw(const std::string & path)
-{
+int hash_demo_raw(const std::string & path) {
     if (!std::filesystem::exists(path)) {
-        std::cout << 0 <
diff --git a/src/hash.hpp b/src/hash.hpp
--- a/src/hash.hpp
+++ b/src/hash.hpp
-#include <cstdint>
 
 namespace simple_object_storage {
 
-    uint64_t hash_file(const std::string &path);
+    // Compute SHA-256 hash of a file, returns hex string
+    std::string hash_file(const std::string &path);
 
-    uint64_t hash_directory_recursive(const std::string &path);
+    // Compute combined SHA-256 hash of all files in directory recursively
+    std::string hash_directory_recursive(const std::string &path);
 
     void hash_demo(const std::string & path);
diff --git a/src/put_handler.cpp b/src/put_handler.cpp
index 93d288f..7f602fa 100644
--- a/src/put_handler.cpp
+++ b/src/put_handler.cpp
@@ -196,9 +196,9 @@ void PutHandler::handle_upload_object(const drogon::HttpRequestPtr& req, std::fu
     // Ensure the temporary file is removed even if errors occur
     ScopeFileDeleter temp_file_deleter(temp_path);
 
-    // Calculate hash
-    uint64_t hash = hash_file(temp_path.string());
-    if (hash == 0) {
+    // Calculate SHA-256 hash
+    std::string hash = hash_file(temp_path.string());
+    if (hash.empty()) {
         resp->setStatusCode(drogon::k500InternalServerError);
         nlohmann::json response = {{"result", "error"}, {"error", "Failed to calculate hash"}};
         resp->setBody(response.dump());
@@ -215,8 +215,8 @@ void PutHandler::handle_upload_object(const drogon::HttpRequestPtr& req, std::fu
         metadata["tgz_content_hash"] = get_hash_from_tgz(temp_path.string());
     }
 
-    // Move file to final location
-    std::filesystem::path final_path = server_.config_.object_store_path / std::to_string(hash);
+    // Move file to final location (using SHA-256 hash as filename)
+    std::filesystem::path final_path = server_.config_.object_store_path / hash;
 
     if (!std::filesystem::exists(final_path)) {
         try {
@@ -235,7 +235,7 @@ void PutHandler::handle_upload_object(const drogon::HttpRequestPtr& req, std::fu
     // Update database index
     dbEntry entry;
-    entry.hash = std::to_string(hash);
+    entry.hash = hash;
     entry.labeltags = metadata["labeltags"].get<std::vector<std::string>>();
     entry.metadata = metadata;
 
@@ -250,7 +250,7 @@ void PutHandler::handle_upload_object(const drogon::HttpRequestPtr& req, std::fu
         return;
     }
 
-    resp->setBody(nlohmann::json({{"result", "success"}, {"hash", std::to_string(hash)}}).dump());
+    resp->setBody(nlohmann::json({{"result", "success"}, {"hash", hash}}).dump());
     resp->setContentTypeCode(drogon::CT_APPLICATION_JSON);
     callback(resp);
 }
diff --git a/testing/compose.yaml b/testing/compose.yaml
index 3a31dc8..a8e2664 100644
--- a/testing/compose.yaml
+++ b/testing/compose.yaml
@@ -6,8 +6,7 @@ services:
     ports:
       - 7703:7703
     restart: no
-    volumes:
-      - ${LOCALCONFIG}:/testing/sos_config.json:ro
+    command: ["/sos/sos", "/testing/sos_config.json"]
     healthcheck:
       test: ["CMD", "wget", "-qO-", "http://127.0.0.1:7703/status"]
       interval: 1s
diff --git a/testing/test-docker.sh b/testing/test-docker.sh
index 51b8391..145a920 100755
--- a/testing/test-docker.sh
+++ b/testing/test-docker.sh
@@ -80,7 +80,7 @@ ${SCRIPT_DIR}/../build.sh
 # Use static test configuration with known tokens for Docker testing
 title "Setting up test configuration"
 # Use the static Docker config with known hashes
-cp ${SCRIPT_DIR}/sos_config_docker.json ${SCRIPT_DIR}/sos_config.json
+cp "${SCRIPT_DIR}/sos_config_docker.json" "${SCRIPT_DIR}/sos_config.json"
 
 # Export the known plaintext tokens that correspond to the hashes in sos_config_docker.json
 export TEST_TOKEN1="t570H7DmK2VBfCwUmtFaUXyzVklL90E1"
@@ -92,7 +92,9 @@ echo "Using static test configuration with known tokens"
 #------------------------------------------------------------------------------------------------
 # run the docker container
 title "Running docker container"
-export LOCALCONFIG="${SCRIPT_DIR}/sos_config.json"
+
+# Config file is always in SCRIPT_DIR after the copy above
+LOCALCONFIG="${SCRIPT_DIR}/sos_config.json"
 
 [ -f "${LOCALCONFIG}" ] || die "Config file not found: ${LOCALCONFIG}"
 [ -f "${COMPOSE_FILE}" ] || die "Compose file not found: ${COMPOSE_FILE}"
@@ -107,13 +109,16 @@ cd "${SCRIPT_DIR}"
 docker stop sos-test 2>/dev/null || true
 docker rm -v sos-test 2>/dev/null || true
 
-# Start the container and mark that cleanup is needed
-LOCALCONFIG=${LOCALCONFIG} docker compose \
-    -f "${COMPOSE_FILE}" up -d
+# Start the container (without config volume mount)
+docker compose -f "${COMPOSE_FILE}" up -d
 
 # Mark that we need cleanup from this point on
 CLEANUP_NEEDED=true
 
+# Copy the config file into the running container
+echo "Copying config file into container..."
+docker cp "${LOCALCONFIG}" sos-test:/testing/sos_config.json
+
 # wait until healthy.
 if ! wait_for_container "sos-test"; then
     echo "----------------------------------------"
@@ -123,9 +128,12 @@ if ! wait_for_container "sos-test"; then
     die "Container sos-test is not healthy"
 fi
 
-# run the tests. Docker inside docker support!
-docker exec -i sos-test ls /testing || true
-# Pass the plaintext tokens as environment variables to the test script
+# Verify the config and test files are accessible
+echo "Verifying test environment..."
+docker exec sos-test ls -la /testing/ || die "Cannot access /testing directory in container"
+
+# Run the tests inside the container
+echo "Running tests inside container..."
 docker exec -i \
     -e TEST_TOKEN1="${TEST_TOKEN1:-}" \
     -e TEST_TOKEN2="${TEST_TOKEN2:-}" \
diff --git a/testing/test_file.txt b/testing/test_file.txt
new file mode 100644
index 0000000..d670460
--- /dev/null
+++ b/testing/test_file.txt
@@ -0,0 +1 @@
+test content