Skip to content

Commit

Permalink
[ntuple] Add option to defer index building
Browse files Browse the repository at this point in the history
  • Loading branch information
enirolf committed Sep 11, 2024
1 parent f206bd0 commit 9c757b8
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 41 deletions.
59 changes: 46 additions & 13 deletions tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,29 @@ private:
/// their respective entry numbers.
std::unordered_map<RIndexValue, std::vector<NTupleSize_t>, RIndexValueHash> fIndex;

/////////////////////////////////////////////////////////////////////////////
/// \brief Create an RNTupleIndex for an existing RNTuple.
///
/// \param[in] fields The fields that will make up the index.
/// \param[in] nEntries The number of entries to index.
///
/// \note The page source is assumed to be attached already.
RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields, NTupleSize_t nEntries);
/// The page source belonging to the RNTuple for which to build the index.
std::unique_ptr<RPageSource> fPageSource;

/// The fields for which the index is built. Used to compute the hashes for each entry value.
std::vector<std::unique_ptr<RFieldBase>> fIndexFields;

/// Only built indexes can be queried.
bool fIsBuilt = false;

/////////////////////////////////////////////////////////////////////////////
/// \brief Create a new RNTupleIndex for the RNTuple represented by the provided page source.
///
/// \param[in] fieldNames The names of the fields to index. Only integral-type fields can be specified as index
/// fields.
/// \param[in] pageSource The page source.
RNTupleIndex(const std::vector<std::string> &fieldNames, const RPageSource &pageSource);

/////////////////////////////////////////////////////////////////////////////
/// \brief Ensure the RNTupleIndex has been built.
///
/// \throws RException If the index has not been built, and can therefore not be used yet.
void EnsureBuilt() const;

public:
RNTupleIndex(const RNTupleIndex &other) = delete;
RNTupleIndex &operator=(const RNTupleIndex &other) = delete;
Expand All @@ -92,20 +103,42 @@ public:
/////////////////////////////////////////////////////////////////////////////
/// \brief Create an RNTupleIndex from an existing RNTuple.
///
/// \param[in] fieldNames The names of the fields to index.
/// \param[in] fieldNames The names of the fields to index. Only integral-type fields can be specified as index
/// fields.
/// \param[in] pageSource The page source.
/// \param[in] deferBuild When set to `true`, an empty index will be created. A call to RNTupleIndex::Build is
/// required before the index can actually be used.
///
/// \return A pointer to the newly-created index.
static std::unique_ptr<RNTupleIndex> Create(const std::vector<std::string> &fieldNames, RPageSource &pageSource);
static std::unique_ptr<RNTupleIndex>
Create(const std::vector<std::string> &fieldNames, const RPageSource &pageSource, bool deferBuild = false);

/////////////////////////////////////////////////////////////////////////////
/// \brief Get the number of values currently indexed.
/// \brief Build the index.
///
/// \return The number of values currently indexed.
/// Only a built index can be queried (with RNTupleIndex::GetFirstEntryNumber or RNTupleIndex::GetAllEntryNumbers).
void Build();

/////////////////////////////////////////////////////////////////////////////
/// \brief Get the number of indexed values.
///
/// \return The number of indexed values.
///
/// \note This does not have to correspond to the number of entries in the original RNTuple. If the original RNTuple
/// contains duplicate index values, they are counted as one.
std::size_t GetSize() const { return fIndex.size(); }
std::size_t GetSize() const
{
// Throws an RException if the index has not been built yet.
EnsureBuilt();
return fIndex.size();
}

/////////////////////////////////////////////////////////////////////////////
/// \brief Whether the index has been built (and is therefore ready to be used).
///
/// \return `true` if the index has been built.
///
/// Only built indexes can be queried.
bool IsBuilt() const { return fIsBuilt; }

/////////////////////////////////////////////////////////////////////////////
/// \brief Get the first entry number containing the given index value.
Expand Down
74 changes: 46 additions & 28 deletions tree/ntuple/v7/src/RNTupleIndex.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,48 @@ CastValuePtr(void *valuePtr, const ROOT::Experimental::RFieldBase &field)
}
} // anonymous namespace

ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields,
NTupleSize_t nEntries)
: fIndexFields(std::move(fields))
/// Set up an index over the given fields of the RNTuple served by `pageSource`.
///
/// The constructor clones `pageSource` and attaches the clone, so the caller's page source is not modified.
/// For every requested name, the matching field is created from its field descriptor, connected to the cloned
/// page source and stored in `fIndexFields`. The index map is not filled here.
///
/// \param[in] fieldNames The names of the fields to index.
/// \param[in] pageSource The page source of the RNTuple to index; it is cloned, not used directly.
///
/// \throws RException If one of the field names cannot be found in the descriptor.
ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(const std::vector<std::string> &fieldNames,
                                                         const RPageSource &pageSource)
   : fPageSource(pageSource.Clone())
{
   fPageSource->Attach();
   auto desc = fPageSource->GetSharedDescriptorGuard();

   fIndexFields.reserve(fieldNames.size());

   for (const auto &fieldName : fieldNames) {
      auto fieldId = desc->FindFieldId(fieldName);
      if (fieldId == kInvalidDescriptorId) {
         // Quote the offending name on both sides so the message reads: Could not find field "name".
         throw RException(R__FAIL("Could not find field \"" + fieldName + "\"."));
      }

      const auto &fieldDesc = desc->GetFieldDescriptor(fieldId);
      auto field = fieldDesc.CreateField(desc.GetRef());

      CallConnectPageSourceOnField(*field, *fPageSource);

      fIndexFields.push_back(std::move(field));
   }
}

/// Guard for operations that need a built index: returns normally when `fIsBuilt` is set, throws otherwise.
///
/// \throws RException If `fIsBuilt` is false.
void ROOT::Experimental::Internal::RNTupleIndex::EnsureBuilt() const
{
   if (fIsBuilt)
      return;

   throw RException(R__FAIL("Index has not been built yet"));
}

std::unique_ptr<ROOT::Experimental::Internal::RNTupleIndex>
ROOT::Experimental::Internal::RNTupleIndex::Create(const std::vector<std::string> &fieldNames,
const RPageSource &pageSource, bool deferBuild)
{
auto index = std::unique_ptr<RNTupleIndex>(new RNTupleIndex(fieldNames, pageSource));

if (!deferBuild)
index->Build();

return index;
}

void ROOT::Experimental::Internal::RNTupleIndex::Build()
{
std::vector<RFieldBase::RValue> fieldValues;
fieldValues.reserve(fIndexFields.size());
Expand All @@ -50,7 +89,7 @@ ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique
std::vector<NTupleIndexValue_t> indexValues;
indexValues.reserve(fIndexFields.size());

for (unsigned i = 0; i < nEntries; ++i) {
for (unsigned i = 0; i < fPageSource->GetNEntries(); ++i) {
indexValues.clear();
for (auto &fieldValue : fieldValues) {
// TODO(fdegeus): use bulk reading
Expand All @@ -61,31 +100,8 @@ ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique
}
fIndex[RIndexValue(indexValues)].push_back(i);
}
}

std::unique_ptr<ROOT::Experimental::Internal::RNTupleIndex>
ROOT::Experimental::Internal::RNTupleIndex::Create(const std::vector<std::string> &fieldNames, RPageSource &pageSource)
{
pageSource.Attach();
auto desc = pageSource.GetSharedDescriptorGuard();

std::vector<std::unique_ptr<RFieldBase>> fields;
fields.reserve(fieldNames.size());

for (const auto &fieldName : fieldNames) {
auto fieldId = desc->FindFieldId(fieldName);
if (fieldId == kInvalidDescriptorId)
throw RException(R__FAIL("Could not find field \"" + std::string(fieldName) + "."));

const auto &fieldDesc = desc->GetFieldDescriptor(fieldId);
auto field = fieldDesc.CreateField(desc.GetRef());

CallConnectPageSourceOnField(*field, pageSource);

fields.push_back(std::move(field));
}

return std::unique_ptr<RNTupleIndex>(new RNTupleIndex(fields, pageSource.GetNEntries()));
fIsBuilt = true;
}

ROOT::Experimental::NTupleSize_t
Expand All @@ -103,6 +119,8 @@ ROOT::Experimental::Internal::RNTupleIndex::GetAllEntryNumbers(const std::vector
if (valuePtrs.size() != fIndexFields.size())
throw RException(R__FAIL("Number of value pointers must match number of indexed fields."));

EnsureBuilt();

std::vector<NTupleIndexValue_t> indexValues;
indexValues.reserve(fIndexFields.size());

Expand Down
32 changes: 32 additions & 0 deletions tree/ntuple/v7/test/ntuple_index.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,38 @@ TEST(RNTupleIndex, Basic)
}
}

// Checks the deferred-build workflow: an index created with deferBuild set reports itself as not built and
// rejects queries, and becomes usable once Build() has been called.
TEST(RNTupleIndex, DeferBuild)
{
   FileRaii fileGuard("test_ntuple_index_defer_build.root");
   {
      auto model = RNTupleModel::Create();
      auto fldValue = model->MakeField<std::uint64_t>("fld");

      auto writer = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath());

      // Ten entries holding the even numbers 0, 2, ..., 18.
      for (int entry = 0; entry < 10; ++entry) {
         *fldValue = entry * 2;
         writer->Fill();
      }
   }

   auto source = RPageSource::Create("ntuple", fileGuard.GetPath());
   auto deferredIndex = RNTupleIndex::Create({"fld"}, *source, true /* deferBuild */);
   EXPECT_FALSE(deferredIndex->IsBuilt());

   // Querying before Build() must raise an RException carrying the "not built" message.
   try {
      deferredIndex->GetFirstEntryNumber<std::uint64_t>(0);
      FAIL() << "querying an unbuilt index should not be possible";
   } catch (const RException &err) {
      EXPECT_THAT(err.what(), testing::HasSubstr("Index has not been built yet"));
   }

   deferredIndex->Build();
   EXPECT_TRUE(deferredIndex->IsBuilt());

   // The value 0 was written in the very first entry.
   EXPECT_EQ(0, deferredIndex->GetFirstEntryNumber<std::uint64_t>(0));
}

TEST(RNTupleIndex, InvalidTypes)
{
FileRaii fileGuard("test_ntuple_index_invalid_types.root");
Expand Down

0 comments on commit 9c757b8

Please sign in to comment.