fastq_to_fasta
A template for creation of SeqAn3 apps, with a FASTQ to FASTA example app.
hierarchical_interleaved_bloom_filter.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
8#pragma once
9
10#include <ranges>
11
12#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
13
14#ifndef RAPTOR_HIBF_HAS_COUNT
15# define RAPTOR_HIBF_HAS_COUNT 0
16#endif
17
18namespace raptor
19{
20
76template <seqan3::data_layout data_layout_mode_ = seqan3::data_layout::uncompressed>
78{
79public:
80 // Forward declaration
81 class user_bins;
82
83 // Forward declaration
84 class membership_agent;
85
86#if RAPTOR_HIBF_HAS_COUNT
87 // Forward declaration
88 template <std::integral value_t>
90#endif
91
93 static constexpr seqan3::data_layout data_layout_mode = data_layout_mode_;
94
96 using ibf_t = seqan3::interleaved_bloom_filter<data_layout_mode_>;
97
109
111
113 std::vector<ibf_t> ibf_vector;
114
122 std::vector<std::vector<int64_t>> next_ibf_id;
123
126
129 {
131 }
132
133#if RAPTOR_HIBF_HAS_COUNT
137 template <std::integral value_t = uint16_t>
138 counting_agent_type<value_t> counting_agent() const
139 {
140 return counting_agent_type<value_t>{*this};
141 }
142#endif
143
151 template <seqan3::cereal_archive archive_t>
152 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
153 {
154 archive(ibf_vector);
155 archive(next_ibf_id);
156 archive(user_bins);
157 }
159};
160
163template <seqan3::data_layout data_layout_mode>
165{
166private:
168 std::vector<std::string> user_bin_filenames;
169
176 std::vector<std::vector<int64_t>> ibf_bin_to_filename_position{};
177
178public:
180 size_t num_user_bins() const noexcept
181 {
182 return user_bin_filenames.size();
183 }
184
186 void set_ibf_count(size_t const size)
187 {
188 ibf_bin_to_filename_position.resize(size);
189 }
190
192 void set_user_bin_count(size_t const size)
193 {
194 user_bin_filenames.resize(size);
195 }
196
198 std::vector<int64_t> & bin_indices_of_ibf(size_t const idx)
199 {
200 return ibf_bin_to_filename_position[idx];
201 }
202
204 std::string & filename_of_user_bin(size_t const idx)
205 {
206 return user_bin_filenames[idx];
207 }
208
210 std::string const & operator[](std::pair<size_t, size_t> const & index_pair) const
211 {
212 return user_bin_filenames[ibf_bin_to_filename_position[index_pair.first][index_pair.second]];
213 }
214
218 auto operator[](size_t const ibf_idx) const
219 {
220 return ibf_bin_to_filename_position[ibf_idx]
221 | std::views::transform(
222 [this](int64_t i)
223 {
224 if (i == -1)
225 return std::string{};
226 else
227 return user_bin_filenames[i];
228 });
229 }
230
232 int64_t filename_index(size_t const ibf_idx, size_t const bin_idx) const
233 {
234 return ibf_bin_to_filename_position[ibf_idx][bin_idx];
235 }
236
242 template <typename stream_t>
243 void write_filenames(stream_t & out_stream) const
244 {
245 size_t position{};
246 std::string line{};
247 for (auto const & filename : user_bin_filenames)
248 {
249 line.clear();
250 line = '#';
251 line += std::to_string(position);
252 line += '\t';
253 line += filename;
254 line += '\n';
255 out_stream << line;
256 ++position;
257 }
258 }
259
267 template <typename archive_t>
268 void serialize(archive_t & archive)
269 {
270 archive(user_bin_filenames);
271 archive(ibf_bin_to_filename_position);
272 }
274};
275
281template <seqan3::data_layout data_layout_mode> // TODO: value_t as template?
283{
284private:
287
289 hibf_t const * const hibf_ptr{nullptr};
290
292 template <std::ranges::forward_range value_range_t>
293 void bulk_contains_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
294 {
295 auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<uint16_t>();
296 auto & result = agent.bulk_count(values);
297
298 uint16_t sum{};
299
300 for (size_t bin{}; bin < result.size(); ++bin)
301 {
302 sum += result[bin];
303
304 auto const current_filename_index = hibf_ptr->user_bins.filename_index(ibf_idx, bin);
305
306 if (current_filename_index < 0) // merged bin
307 {
308 if (sum >= threshold)
309 bulk_contains_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
310 sum = 0u;
311 }
312 else if (bin + 1u == result.size() || // last bin
313 current_filename_index != hibf_ptr->user_bins.filename_index(ibf_idx, bin + 1)) // end of split bin
314 {
315 if (sum >= threshold)
316 result_buffer.emplace_back(current_filename_index);
317 sum = 0u;
318 }
319 }
320 }
321
322public:
326 membership_agent() = default;
331 ~membership_agent() = default;
332
337 explicit membership_agent(hibf_t const & hibf) : hibf_ptr(std::addressof(hibf))
338 {}
340
342 std::vector<int64_t> result_buffer;
343
361 template <std::ranges::forward_range value_range_t>
362 [[nodiscard]] std::vector<int64_t> const & bulk_contains(value_range_t && values, size_t const threshold) & noexcept
363 {
364 assert(hibf_ptr != nullptr);
365
366 static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
367 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
368 "An individual value must be an unsigned integral.");
369
370 result_buffer.clear();
371
372 bulk_contains_impl(values, 0, threshold);
373
374 std::ranges::sort(result_buffer); // TODO: necessary?
375
376 return result_buffer;
377 }
378
379 // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
380 // is immediately destroyed.
381 template <std::ranges::range value_range_t>
382 [[nodiscard]] std::vector<int64_t> const & bulk_contains(value_range_t && values,
383 size_t const threshold) && noexcept = delete;
385};
386
387#if RAPTOR_HIBF_HAS_COUNT
390template <seqan3::data_layout data_layout_mode>
391template <std::integral value_t>
393{
394private:
397
399 hibf_t const * const hibf_ptr{nullptr};
400
402 template <std::ranges::forward_range value_range_t>
403 void bulk_count_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
404 {
405 auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<value_t>();
406 auto & result = agent.bulk_count(values);
407
408 value_t sum{};
409
410 for (size_t bin{}; bin < result.size(); ++bin)
411 {
412 sum += result[bin];
413 auto const current_filename_index = hibf_ptr->user_bins.filename_index(ibf_idx, bin);
414
415 if (current_filename_index < 0) // merged bin
416 {
417 if (sum >= threshold)
418 bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
419 sum = 0u;
420 }
421 else if (bin + 1u == result.size() || // last bin
422 current_filename_index != hibf_ptr->user_bins.filename_index(ibf_idx, bin + 1)) // end of split bin
423 {
424 if (sum >= threshold)
425 result_buffer[current_filename_index] = sum;
426 sum = 0u;
427 }
428 }
429 }
430
431public:
441
446 explicit counting_agent_type(hibf_t const & hibf) :
447 hibf_ptr(std::addressof(hibf)),
448 result_buffer(hibf_ptr->user_bins.num_user_bins())
449 {}
451
453 seqan3::counting_vector<value_t> result_buffer;
454
474 template <std::ranges::forward_range value_range_t>
475 [[nodiscard]] seqan3::counting_vector<value_t> const & bulk_count(value_range_t && values,
476 size_t const threshold = 1u) & noexcept
477 {
478 assert(hibf_ptr != nullptr);
479 assert(threshold > 0u);
480 assert(result_buffer.size() == hibf_ptr->user_bins.num_user_bins());
481
482 static_assert(std::ranges::forward_range<value_range_t>, "The values must model forward_range.");
483 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
484 "An individual value must be an unsigned integral.");
485
486 std::ranges::fill(result_buffer, static_cast<value_t>(0u));
487
488 bulk_count_impl(values, 0, threshold);
489
490 return result_buffer;
491 }
492
493 // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
494 // is immediately destroyed.
495 template <std::ranges::range value_range_t>
496 [[nodiscard]] seqan3::counting_vector<value_t> const & bulk_count(value_range_t && values,
497 size_t const threshold = 1u) && noexcept = delete;
499};
500#endif // RAPTOR_HIBF_HAS_COUNT
501
502} // namespace raptor
Manages counting ranges of values for the hibf::hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:393
counting_agent_type & operator=(counting_agent_type const &)=default
Defaulted.
counting_agent_type & operator=(counting_agent_type &&)=default
Defaulted.
counting_agent_type(counting_agent_type &&)=default
Defaulted.
seqan3::counting_vector< value_t > const & bulk_count(value_range_t &&values, size_t const threshold=1u) &noexcept
Counts the occurrences in each bin for all values in a range.
Definition: hierarchical_interleaved_bloom_filter.hpp:475
seqan3::counting_vector< value_t > const & bulk_count(value_range_t &&values, size_t const threshold=1u) &&noexcept=delete
counting_agent_type(counting_agent_type const &)=default
Defaulted.
seqan3::counting_vector< value_t > result_buffer
Stores the result of bulk_count().
Definition: hierarchical_interleaved_bloom_filter.hpp:453
Manages membership queries for the hibf::hierarchical_interleaved_bloom_filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:283
membership_agent & operator=(membership_agent const &)=default
Defaulted.
membership_agent(membership_agent const &)=default
Defaulted.
membership_agent & operator=(membership_agent &&)=default
Defaulted.
std::vector< int64_t > const & bulk_contains(value_range_t &&values, size_t const threshold) &noexcept
Determines set membership of given values, and returns the user bin indices of occurrences.
Definition: hierarchical_interleaved_bloom_filter.hpp:362
std::vector< int64_t > result_buffer
Stores the result of bulk_contains().
Definition: hierarchical_interleaved_bloom_filter.hpp:342
std::vector< int64_t > const & bulk_contains(value_range_t &&values, size_t const threshold) &&noexcept=delete
membership_agent(membership_agent &&)=default
Defaulted.
Bookkeeping for user and technical bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:165
int64_t filename_index(size_t const ibf_idx, size_t const bin_idx) const
Returns the filename index of the ibf_idxth IBF for bin bin_idx.
Definition: hierarchical_interleaved_bloom_filter.hpp:232
void set_ibf_count(size_t const size)
Changes the number of managed IBFs.
Definition: hierarchical_interleaved_bloom_filter.hpp:186
void set_user_bin_count(size_t const size)
Changes the number of managed user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:192
auto operator[](size_t const ibf_idx) const
Returns a view over the user bin filenames for the ibf_idxth IBF. An empty string is returned for mer...
Definition: hierarchical_interleaved_bloom_filter.hpp:218
void write_filenames(stream_t &out_stream) const
Writes all filenames to a stream. Index and filename are tab-separated.
Definition: hierarchical_interleaved_bloom_filter.hpp:243
std::string const & operator[](std::pair< size_t, size_t > const &index_pair) const
For a pair (a,b), returns a const reference to the filename of the user bin at IBF a,...
Definition: hierarchical_interleaved_bloom_filter.hpp:210
std::string & filename_of_user_bin(size_t const idx)
Returns the filename of the idxth user bin.
Definition: hierarchical_interleaved_bloom_filter.hpp:204
size_t num_user_bins() const noexcept
Returns the number of managed user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:180
std::vector< int64_t > & bin_indices_of_ibf(size_t const idx)
Returns a vector containing user bin indices for each bin in the idxth IBF.
Definition: hierarchical_interleaved_bloom_filter.hpp:198
The HIBF binning directory. A data structure that efficiently answers set-membership queries for mult...
Definition: hierarchical_interleaved_bloom_filter.hpp:78
hierarchical_interleaved_bloom_filter(hierarchical_interleaved_bloom_filter const &)=default
Defaulted.
~hierarchical_interleaved_bloom_filter()=default
Defaulted.
std::vector< ibf_t > ibf_vector
The individual interleaved Bloom filters.
Definition: hierarchical_interleaved_bloom_filter.hpp:113
hierarchical_interleaved_bloom_filter & operator=(hierarchical_interleaved_bloom_filter const &)=default
Defaulted.
membership_agent membership_agent() const
Returns a membership_agent to be used for counting.
Definition: hierarchical_interleaved_bloom_filter.hpp:128
static constexpr seqan3::data_layout data_layout_mode
Indicates whether the Interleaved Bloom Filter is compressed.
Definition: hierarchical_interleaved_bloom_filter.hpp:93
hierarchical_interleaved_bloom_filter & operator=(hierarchical_interleaved_bloom_filter &&)=default
Defaulted.
std::vector< std::vector< int64_t > > next_ibf_id
Stores for each bin in each IBF of the HIBF the ID of the next IBF.
Definition: hierarchical_interleaved_bloom_filter.hpp:122
hierarchical_interleaved_bloom_filter(hierarchical_interleaved_bloom_filter &&)=default
Defaulted.
seqan3::interleaved_bloom_filter< data_layout_mode_ > ibf_t
The type of an individual Bloom filter.
Definition: hierarchical_interleaved_bloom_filter.hpp:96
hierarchical_interleaved_bloom_filter()=default
Defaulted.
user_bins user_bins
The underlying user bins.
Definition: hierarchical_interleaved_bloom_filter.hpp:125
hierarchical_interleaved_bloom_filter< seqan3::data_layout::uncompressed > hibf
Definition: index.hpp:24
Definition: adjust_seed.hpp:13