Skip to content

Commit

Permalink
[ntuple] Add initial in-memory index prototype
Browse files Browse the repository at this point in the history
This adds (a first version of) the `RNTupleIndex`, which is an
in-memory structure that maps RNTuple field values (or combinations
thereof) to an entry index in the RNTuple for which the index was
built. At this point, the index only resides in memory and thus has
to be (re)built each time.

`RNTupleIndex` will be used by the `RNTupleProcessor` to enable
dataset joins and will be as transparent as possible to users.
Currently, no public interface is foreseen.
  • Loading branch information
enirolf committed Jul 9, 2024
1 parent 59d2f9e commit d15cfe3
Show file tree
Hide file tree
Showing 6 changed files with 445 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tree/ntuple/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ HEADERS
ROOT/RNTupleFillContext.hxx
ROOT/RNTupleFillStatus.hxx
ROOT/RNTupleImtTaskScheduler.hxx
ROOT/RNTupleIndex.hxx
ROOT/RNTupleMerger.hxx
ROOT/RNTupleMetrics.hxx
ROOT/RNTupleModel.hxx
Expand Down Expand Up @@ -66,6 +67,7 @@ SOURCES
v7/src/RNTupleDescriptor.cxx
v7/src/RNTupleDescriptorFmt.cxx
v7/src/RNTupleFillContext.cxx
v7/src/RNTupleIndex.cxx
v7/src/RNTupleMerger.cxx
v7/src/RNTupleMetrics.cxx
v7/src/RNTupleModel.cxx
Expand Down
172 changes: 172 additions & 0 deletions tree/ntuple/v7/inc/ROOT/RNTupleIndex.hxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/// \file ROOT/RNTupleIndex.hxx
/// \ingroup NTuple ROOT7
/// \author Florine de Geus <[email protected]>
/// \date 2024-04-02
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
/// is welcome!

/*************************************************************************
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#ifndef ROOT7_RNTupleIndex
#define ROOT7_RNTupleIndex

#include <ROOT/RField.hxx>
#include <ROOT/RNTupleUtil.hxx>

#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

namespace ROOT {
namespace Experimental {
namespace Internal {
// clang-format off
/**
\class ROOT::Experimental::Internal::RNTupleIndex
\ingroup NTuple
\brief Builds an index on one or several fields of an RNTuple so it can be joined onto other RNTuples.
*/
// clang-format on
class RNTupleIndex {
private:
   /////////////////////////////////////////////////////////////////////////////
   /// Container for the hashes of the indexed fields. Holds one hash per indexed field, in the order of `fFields`.
   class RIndexValue {
   public:
      std::vector<std::size_t> fValueHashes;
      explicit RIndexValue(const std::vector<std::size_t> &valueHashes) : fValueHashes(valueHashes) {}
      inline bool operator==(const RIndexValue &other) const { return other.fValueHashes == fValueHashes; }
   };

   /////////////////////////////////////////////////////////////////////////////
   /// Hash combining the individual index value hashes from RIndexValue. Uses the implementation from
   /// `boost::hash_combine` (see
   /// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine).
   struct RIndexValueHash {
      inline std::size_t operator()(const RIndexValue &indexValue) const
      {
         std::size_t combinedHash = 0;
         for (const auto &valueHash : indexValue.fValueHashes) {
            // The shifts must be applied to the running seed, not to the incoming hash. Otherwise the result
            // degenerates into a plain XOR of per-value terms, which is independent of the field order and
            // cancels out for repeated values.
            combinedHash ^= valueHash + 0x9e3779b9 + (combinedHash << 6) + (combinedHash >> 2);
         }
         return combinedHash;
      }
   };

   /// The fields for which the index is built. Used to compute the hashes for each entry value.
   const std::vector<std::unique_ptr<RFieldBase>> fFields;

   /// The index itself. Maps field values (or combinations thereof in case the index is defined for multiple fields) to
   /// their respective entry numbers.
   std::unordered_map<RIndexValue, std::vector<NTupleSize_t>, RIndexValueHash> fIndex;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Create an RNTupleIndex for an existing RNTuple.
   ///
   /// \param[in] fields The fields that will make up the index. The contents of the vector are moved into the index.
   /// \param[in] nEntries The number of entries to index.
   ///
   /// \note The page source is assumed to be attached already.
   RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields, NTupleSize_t nEntries);

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Add a new entry to the index.
   ///
   /// \param[in] valuePtrs The entry values to index, according to fFields.
   /// \param[in] entry The entry number.
   void Add(const std::vector<void *> &valuePtrs, NTupleSize_t entry);

public:
   RNTupleIndex(const RNTupleIndex &other) = delete;
   RNTupleIndex &operator=(const RNTupleIndex &other) = delete;
   RNTupleIndex(RNTupleIndex &&other) = delete;
   RNTupleIndex &operator=(RNTupleIndex &&other) = delete;
   ~RNTupleIndex() = default;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Create an RNTupleIndex from an existing RNTuple.
   ///
   /// \param[in] fieldNames The names of the fields to index.
   /// \param pageSource The page source.
   ///
   /// \return A pointer to the newly-created index.
   ///
   static std::unique_ptr<RNTupleIndex>
   Create(const std::vector<std::string_view> &fieldNames, RPageSource &pageSource);

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the number of elements currently indexed.
   ///
   /// \return The number of distinct index values currently stored (i.e. the number of keys in the index map, not
   /// the number of entries that were added).
   std::size_t GetNElements() const { return fIndex.size(); }

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the first entry number containing the given index value.
   ///
   /// \param[in] valuePtrs A vector of pointers to the index values to look up.
   ///
   /// \return The first entry number that corresponds to `valuePtrs`. When no such entry exists, `kInvalidNTupleIndex`
   /// is returned.
   ///
   /// Note that in case multiple entries corresponding to the provided index value exist, the first occurrence is
   /// returned. Use RNTupleIndex::GetAllEntryNumbers to get all entries.
   NTupleSize_t GetFirstEntryNumber(const std::vector<void *> &valuePtrs) const;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get the entry number containing the given index value.
   ///
   /// The values are taken by value; the addresses of these local copies are forwarded to the pointer-based overload.
   ///
   /// \throws RException when the number of provided values differs from the number of indexed fields.
   ///
   /// \sa GetFirstEntryNumber(std::vector<void *> valuePtrs)
   template <typename... Ts>
   NTupleSize_t GetFirstEntryNumber(Ts... values) const
   {
      // TODO(fdegeus) also check that the types match
      if (sizeof...(Ts) != fFields.size())
         throw RException(R__FAIL("number of value pointers must match number of indexed fields"));

      std::vector<void *> valuePtrs;
      valuePtrs.reserve(sizeof...(Ts));
      ([&] { valuePtrs.push_back(&values); }(), ...);

      return GetFirstEntryNumber(valuePtrs);
   }

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get all entry numbers for the given index.
   ///
   /// \param[in] valuePtrs A vector of pointers to the index values to look up.
   ///
   /// \return A pointer to the entry numbers that correspond to `valuePtrs`. When no such entry exists, a null
   /// pointer is returned. The pointed-to vector is owned by the index.
   const std::vector<NTupleSize_t> *GetAllEntryNumbers(const std::vector<void *> &valuePtrs) const;

   /////////////////////////////////////////////////////////////////////////////
   /// \brief Get all entry numbers for the given index.
   ///
   /// The values are taken by value; the addresses of these local copies are forwarded to the pointer-based overload.
   ///
   /// \throws RException when the number of provided values differs from the number of indexed fields.
   ///
   /// \sa GetAllEntryNumbers(std::vector<void *> valuePtrs)
   template <typename... Ts>
   const std::vector<NTupleSize_t> *GetAllEntryNumbers(Ts... values) const
   {
      // TODO(fdegeus) also check that the types match
      if (sizeof...(Ts) != fFields.size())
         throw RException(R__FAIL("number of value pointers must match number of indexed fields"));

      std::vector<void *> valuePtrs;
      valuePtrs.reserve(sizeof...(Ts));
      ([&] { valuePtrs.push_back(&values); }(), ...);

      return GetAllEntryNumbers(valuePtrs);
   }
};
} // namespace Internal
} // namespace Experimental
} // namespace ROOT

#endif // ROOT7_RNTupleIndex
108 changes: 108 additions & 0 deletions tree/ntuple/v7/src/RNTupleIndex.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/// \file RNTupleIndex.cxx
/// \ingroup NTuple ROOT7
/// \author Florine de Geus <[email protected]>
/// \date 2024-04-02
/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
/// is welcome!

/*************************************************************************
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/

#include <ROOT/RNTupleIndex.hxx>
#include <ROOT/RHashValueVisitor.hxx>

/// Build the index by reading every entry of the indexed fields and registering its value hashes.
///
/// \param[in] fields The fields that make up the index; the vector's contents are moved into `fFields`.
/// \param[in] nEntries The number of entries to read and index (entries `0` to `nEntries - 1`).
ROOT::Experimental::Internal::RNTupleIndex::RNTupleIndex(std::vector<std::unique_ptr<RFieldBase>> &fields,
                                                         NTupleSize_t nEntries)
   : fFields(std::move(fields))
{
   // `fields` has been moved from at this point, so every size query must go through `fFields`.
   std::vector<RFieldBase::RValue> fieldValues;
   fieldValues.reserve(fFields.size());
   for (const auto &field : fFields) {
      fieldValues.emplace_back(field->CreateValue());
   }

   // Reuse one pointer buffer for all entries instead of reallocating it on every iteration.
   std::vector<void *> ptrs;
   ptrs.reserve(fieldValues.size());

   for (NTupleSize_t i = 0; i < nEntries; ++i) {
      ptrs.clear();
      for (auto &fieldValue : fieldValues) {
         // TODO(fdegeus): use bulk reading
         fieldValue.Read(i);
         ptrs.push_back(fieldValue.GetPtr<void>().get());
      }
      Add(ptrs, i);
   }
}

void ROOT::Experimental::Internal::RNTupleIndex::Add(const std::vector<void *> &valuePtrs, NTupleSize_t entry)
{
std::vector<std::size_t> valueHashes;
valueHashes.reserve(fFields.size());
for (unsigned i = 0; i < fFields.size(); ++i) {
auto &field = fFields[i];
auto valuePtr = valuePtrs[i];
RHashValueVisitor visitor(valuePtr);
field->AcceptVisitor(visitor);
valueHashes.push_back(visitor.GetHash());
}
fIndex[RIndexValue(valueHashes)].push_back(entry);
}

/// Create an index over the named fields of the RNTuple served by `pageSource`.
///
/// The page source is attached, a field object is created from the descriptor for each requested field name and
/// connected to the page source, and the index is then built over all entries reported by the page source.
///
/// \param[in] fieldNames The names of the fields to index.
/// \param pageSource The page source to read the field values from.
///
/// \return A pointer to the newly-created index.
///
/// \throws RException when one of the field names cannot be found in the descriptor.
std::unique_ptr<ROOT::Experimental::Internal::RNTupleIndex>
ROOT::Experimental::Internal::RNTupleIndex::Create(const std::vector<std::string_view> &fieldNames,
                                                   RPageSource &pageSource)
{
   pageSource.Attach();
   auto desc = pageSource.GetSharedDescriptorGuard();

   std::vector<std::unique_ptr<RFieldBase>> fields;
   fields.reserve(fieldNames.size());

   for (const auto &fieldName : fieldNames) {
      auto fieldId = desc->FindFieldId(fieldName);
      if (fieldId == kInvalidDescriptorId)
         throw RException(R__FAIL("could not find field \"" + std::string(fieldName) + "\""));

      const auto &fieldDesc = desc->GetFieldDescriptor(fieldId);
      auto field = fieldDesc.CreateField(desc.GetRef());

      CallConnectPageSourceOnField(*field, pageSource);

      fields.push_back(std::move(field));
   }

   // The constructor is private, so std::make_unique cannot be used here.
   return std::unique_ptr<RNTupleIndex>(new RNTupleIndex(fields, pageSource.GetNEntries()));
}

/// Look up the first entry number recorded for the given index value.
///
/// \param[in] valuePtrs Pointers to the values to look up, one per indexed field.
///
/// \return The first entry number stored for these values, or `kInvalidNTupleIndex` when the lookup via
/// GetAllEntryNumbers yields no match.
ROOT::Experimental::NTupleSize_t
ROOT::Experimental::Internal::RNTupleIndex::GetFirstEntryNumber(const std::vector<void *> &valuePtrs) const
{
   if (const auto *matches = GetAllEntryNumbers(valuePtrs))
      return matches->front();

   return kInvalidNTupleIndex;
}

/// Look up all entry numbers recorded for the given index value.
///
/// One hash is computed per indexed field (in the order of `fFields`) by letting the field accept an
/// RHashValueVisitor constructed on the matching value pointer; the combination of hashes is the lookup key.
///
/// \param[in] valuePtrs Pointers to the values to look up, one per indexed field.
///
/// \return A pointer to the entry numbers stored for these values, or a null pointer when there is no match.
const std::vector<ROOT::Experimental::NTupleSize_t> *
ROOT::Experimental::Internal::RNTupleIndex::GetAllEntryNumbers(const std::vector<void *> &valuePtrs) const
{
   std::vector<std::size_t> valueHashes;
   valueHashes.reserve(fFields.size());
   for (unsigned i = 0; i < fFields.size(); ++i) {
      auto &field = fFields[i];
      auto valuePtr = valuePtrs[i];
      RHashValueVisitor visitor(valuePtr);
      field->AcceptVisitor(visitor);
      valueHashes.push_back(visitor.GetHash());
   }

   // A single find() avoids hashing and probing the map twice (previously count() followed by at()).
   const auto match = fIndex.find(RIndexValue(valueHashes));
   if (match == fIndex.end())
      return nullptr;

   return &match->second;
}
1 change: 1 addition & 0 deletions tree/ntuple/v7/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ROOT_GENERATE_DICTIONARY(RXTupleDict ${CMAKE_CURRENT_SOURCE_DIR}/RXTuple.hxx
ROOT_ADD_GTEST(ntuple_descriptor ntuple_descriptor.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_endian ntuple_endian.cxx LIBRARIES ROOTNTuple)
ROOT_ADD_GTEST(ntuple_friends ntuple_friends.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_index ntuple_index.cxx LIBRARIES ROOTNTuple)
ROOT_ADD_GTEST(ntuple_merger ntuple_merger.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_metrics ntuple_metrics.cxx LIBRARIES ROOTNTuple CustomStruct)
ROOT_ADD_GTEST(ntuple_model ntuple_model.cxx LIBRARIES ROOTNTuple CustomStruct)
Expand Down
Loading

0 comments on commit d15cfe3

Please sign in to comment.