Files
email-tracker/external/duckdb/test/common/test_hyperlog.cpp
2025-10-24 19:21:19 -05:00

99 lines
2.7 KiB
C++

#include "catch.hpp"
#include "duckdb/common/serializer/binary_deserializer.hpp"
#include "duckdb/common/serializer/binary_serializer.hpp"
#include "duckdb/common/serializer/memory_stream.hpp"
#include "duckdb/common/types/hash.hpp"
#include "duckdb/common/types/hyperloglog.hpp"
using namespace duckdb;
using namespace std;
// Exercises the basic HyperLogLog contract:
//   1. inserting the same value repeatedly yields a count of exactly 1,
//   2. inserting a million distinct values yields an estimate within 5% of a million,
//   3. merging a sketch into another keeps the estimate within the same bounds,
//   4. merging is composable: splitting a stream over several sketches and merging them
//      gives the same count as feeding the whole stream into a single sketch.
TEST_CASE("Test that hyperloglog works", "[hyperloglog]") {
	constexpr size_t ELEMENT_COUNT = 1000000;
	constexpr idx_t PARTITION_COUNT = 16;

	HyperLogLog log;
	// add a million elements of the same value
	const int repeated_value = 4;
	for (size_t i = 0; i < ELEMENT_COUNT; i++) {
		log.InsertElement(Hash(repeated_value));
	}
	REQUIRE(log.Count() == 1);

	// now add a million different values
	HyperLogLog log2;
	for (size_t i = 0; i < ELEMENT_COUNT; i++) {
		// hash the value as an int, exactly like the single-value case above
		const int value = static_cast<int>(i);
		log2.InsertElement(Hash(value));
	}
	// the count is approximate, but should be pretty close to a million
	size_t count = log2.Count();
	REQUIRE(count > 950000LL);
	REQUIRE(count < 1050000LL);

	// now we can merge the HLLs
	log.Merge(log2);
	// the count should be pretty much the same
	count = log.Count();
	REQUIRE(count > 950000LL);
	REQUIRE(count < 1050000LL);

	// now test composability of the merge
	// add everything to one big_hll one
	// add chunks to small_hll ones and then merge them
	// the result should be the same
	HyperLogLog big_hll;
	HyperLogLog small_hll[PARTITION_COUNT];
	for (size_t i = 0; i < ELEMENT_COUNT; i++) {
		// NOTE: the modulus must be parenthesized as (i + 3) / 2. Written as "i + 3 / 2" it is
		// parsed as i + (3 / 2) == i + 1, and (2i + 3) % (i + 1) is 1 for every i >= 1, which
		// would feed only two distinct values into this part of the test.
		// (i + 3) / 2 is always >= 1, so the modulo never divides by zero.
		const int value = static_cast<int>(((2 * i) + 3) % ((i + 3) / 2));
		big_hll.InsertElement(Hash(value));
		small_hll[i % PARTITION_COUNT].InsertElement(Hash(value));
	}
	// now merge them into one big_hll HyperLogLog
	for (idx_t i = 1; i < PARTITION_COUNT; i++) {
		small_hll[0].Merge(small_hll[i]);
	}
	// the result should be identical to the big_hll one
	REQUIRE(small_hll[0].Count() == big_hll.Count());
}
// Round-trips a HyperLogLog through the binary serializer, with the serialization
// compatibility set to "v1.0.0", and checks that the deserialized sketch reports the same
// count as the original. The round trip is repeated each time the number of inserted
// values reaches a power of ten, from 1 up to 100M.
TEST_CASE("Test different hyperloglog version serialization", "[hyperloglog]") {
	Allocator allocator;
	MemoryStream stream(allocator);
	SerializationOptions options;
	options.serialization_compatibility = SerializationCompatibility::FromString("v1.0.0");

	// Add 100M values to a NEW HyperLogLog
	HyperLogLog original_log;
	// Number of inserted values at which the next round trip happens: 1, 10, 100, ...
	size_t next_checkpoint = 1;
	for (size_t i = 0; i < 100000000; i++) {
		original_log.InsertElement(Hash(i));

		// We roundtrip the serialization every order of magnitude
		const size_t inserted_so_far = i + 1;
		if (inserted_so_far != next_checkpoint) {
			continue;
		}
		next_checkpoint *= 10;

		// Grab the count
		const auto original_count = original_log.Count();

		// Serialize it as an OLD HyperLogLog (using the v1.0.0 compatibility options)
		stream.Rewind();
		BinarySerializer::Serialize(original_log, stream, options);

		// Deserialize it, creating a NEW HyperLogLog from the OLD one
		stream.Rewind();
		auto restored_log = BinaryDeserializer::Deserialize<HyperLogLog>(stream);

		// Verify that the deserialized count is equal
		const auto deserialized_count = restored_log->Count();
		REQUIRE(original_count == deserialized_count);
	}
}