55 size_t nModules = fPars->setup.modules.size();
56 size_t nModuleSides = nModules * 2;
57 size_t nDigisTotal = digis.size();
61 DigiMap digiMap = CountDigisPerModules(digis);
64 if (fPars->memory.IsDynamic())
68 throw ProcessingError(
"STS Hitfinder Chain: Too many digis per module for static allocation: {} > {}",
71 if (nDigisTotal > fPars->memory.maxNDigisPerTS) {
72 throw ProcessingError(
"STS Hitfinder Chain: Too many digis per timeslice for static allocation: {} > {}",
73 nDigisTotal, fPars->memory.maxNDigisPerTS);
77 FlattenDigis(digis, digiMap);
79#ifdef CBM_ONLINE_USE_FAIRLOGGER
80 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
82 if (Opts().LogLevel() == trace) {
84 EnsureDigiOffsets(digiMap);
90 auto& hfc = fHitfinder;
92 if (hfc.maxHitsPerModule > hfc.hitsAllocatedPerModule) {
93 L_(error) << fmt::format(
"STS Hitfinder Chain: Too many hits per module for static allocation: {} > {}",
94 hfc.maxHitsPerModule, hfc.hitsAllocatedPerModule);
95 hfc.maxHitsPerModule = hfc.hitsAllocatedPerModule;
100 L_(debug) <<
"STS Hitfinder Chain: Clearing buffers...";
105 queue.memset(hfc.monitor, 0);
106 queue.memset(hfc.digiConnectorsPerModule, 0);
107 queue.memset(hfc.channelOffsetPerModule, 0);
112 queue.memset(hfc.nClustersPerModule, 0);
114 queue.memset(hfc.nHitsPerModule, 0);
115 queue.memset(hfc.maxClusterTimeErrorByModuleSide, 0);
117 L_(debug) <<
"STS Hitfinder Chain: Copy digis buffers...";
118 xpu::set<TheHitfinder>(fHitfinder);
122 const CbmStsDigi* digisH = xpu::h_view(hfc.digisPerModule).data();
123 CbmStsDigi* digisD = hfc.digisPerModule.get();
124 if (digisH != digisD) queue.copy(digisH, digisD, nDigisTotal);
125 queue.copy(hfc.digiOffsetPerModule, xpu::h2d);
127 L_(debug) <<
"STS Hitfinder Chain: Sort Digis...";
130 queue.launch<
SortDigis>(xpu::n_blocks(nModuleSides));
131 xpu::k_add_bytes<SortDigis>(nDigisTotal *
sizeof(
CbmStsDigi));
132#ifdef CBM_ONLINE_USE_FAIRLOGGER
133 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
135 if (Opts().LogLevel() == trace) {
137 L_(trace) <<
"Ensuring STS digis are sorted...";
138 queue.copy(hfc.digisPerModule, xpu::d2h);
139 queue.copy(hfc.digiOffsetPerModule, xpu::d2h);
144 L_(debug) <<
"STS Hitfinder Chain: Find Clusters...";
145 if (!
Params().sts.findClustersMultiKernels) {
146 queue.launch<
FindClusters>(xpu::n_blocks(nModuleSides));
150 xpu::k_add_bytes<ChannelOffsets>(nDigisTotal *
sizeof(
CbmStsDigi));
152 xpu::k_add_bytes<CreateDigiConnections>(nDigisTotal *
sizeof(
CbmStsDigi));
154 xpu::k_add_bytes<CreateClusters>(nDigisTotal *
sizeof(
CbmStsDigi));
156#ifdef CBM_ONLINE_USE_FAIRLOGGER
157 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
159 if (Opts().LogLevel() == trace) {
161 L_(trace) <<
"Ensuring STS channel offsets correct...";
162 xpu::buffer_prop propsOffset{hfc.channelOffsetPerModule};
163 std::vector<u32> channelOffsetPerModule;
164 channelOffsetPerModule.resize(propsOffset.size());
165 queue.copy(hfc.channelOffsetPerModule.get(), channelOffsetPerModule.data(), propsOffset.size());
167 EnsureChannelOffsets(channelOffsetPerModule);
169 L_(trace) <<
"Ensuring STS clusters are ok...";
170 xpu::buffer_prop props{hfc.clusterIdxPerModule};
172 std::vector<ClusterIdx> clusterIdxPerModule;
173 clusterIdxPerModule.resize(props.size());
174 std::vector<PaddedToCacheLine<int>> nClustersPerModule;
175 nClustersPerModule.resize(fPars->setup.modules.size() * 2, 0);
177 queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size());
178 queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size());
180 EnsureClustersSane(clusterIdxPerModule, nClustersPerModule);
183 L_(debug) <<
"STS Hitfinder Chain: Sort Clusters...";
184 queue.launch<
SortClusters>(xpu::n_blocks(nModuleSides));
186 L_(debug) <<
"STS Hitfinder Chain: Find Hits...";
187 queue.copy(hfc.nClustersPerModule, xpu::d2h);
189 xpu::h_view nClusters(hfc.nClustersPerModule);
190 size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
191 xpu::k_add_bytes<SortClusters>(nClustersTotal *
sizeof(
ClusterIdx));
193 size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
195 bool isCpu = xpu::device::active().backend() == xpu::cpu;
198 xpu::grid findHitsG = xpu::n_threads(isCpu ? threadsCPU : nClustersFront);
200 xpu::k_add_bytes<FindHits>(nClustersTotal *
sizeof(
sts::Cluster));
202 queue.copy(hfc.nHitsPerModule, xpu::d2h);
205 auto hits = FlattenHits(queue);
206 queue.copy(hfc.monitor, xpu::d2h);
212 xpu::scoped_timer hitStreams_{
"Sort Hits"};
215 hits = SplitHitsIntoStreams(
hits, 100);
216 SortHitsWithinPartition(
hits);
217#ifdef CBM_ONLINE_USE_FAIRLOGGER
218 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
220 if (Opts().LogLevel() == trace) {
222 EnsureHitsSorted(
hits);
226 xpu::h_view monitor(hfc.monitor);
230 if (monitor[0].nClusterBucketOverflow > 0) {
231 L_(error) <<
"STS Hitfinder Chain: Cluster bucket overflow! " << monitor[0].nClusterBucketOverflow
232 <<
" clusters were discarded!";
234 for (
size_t m = 0; m < nModules * 2; m++) {
235 if (
static_cast<size_t>(nClusters[m]) > hfc.maxClustersPerModule) {
236 L_(error) <<
"STS Hitfinder Chain: Cluster bucket overflow in module " << m <<
" with " << *nClusters[m]
237 <<
" (of " << hfc.maxClustersPerModule <<
" max)"
239 nClusters[m] = hfc.maxClustersPerModule;
244 xpu::h_view nHits(hfc.nHitsPerModule);
245 if (monitor[0].nHitBucketOverflow > 0) {
246 L_(error) <<
"STS Hitfinder Chain: Hit bucket overflow! " << monitor[0].nHitBucketOverflow
247 <<
" hits were discarded!";
249 for (
size_t m = 0; m < nModules; m++) {
250 const size_t nHitsModule = nHits[m];
251 const size_t nClustersModule = nClusters[m];
252 const float hitsPerCluster = nClustersModule > 0 ? nHitsModule / float(nClustersModule) : 0;
253 if (nHitsModule > hfc.maxHitsPerModule) {
254 L_(error) <<
"STS Hitfinder Chain: Hit bucket overflow in module " << m <<
" with " << nHitsModule <<
" (of "
255 << hfc.maxHitsPerModule <<
" max) hits! "
256 <<
"Module has " << nClustersModule <<
" clusters. " << hitsPerCluster <<
" hits per cluster.";
257 nHits[m] = hfc.maxHitsPerModule;
263 if (storeClusters) clusters = FlattenClusters(queue);
265 size_t nHitsTotal = std::accumulate(nHits.begin(), nHits.end(), 0);
266 L_(debug) <<
"Timeslice contains " << nHitsTotal <<
" STS hits and " << nClustersTotal <<
" STS clusters";
269 result.
clusters = std::move(clusters);
286 const int nChannels = fPars->setup.nChannels / 2;
287 const int nModules = fPars->setup.modules.size();
288 const int nModuleSides = 2 * nModules;
293 fHitfinder.nModules = nModules;
294 fHitfinder.nChannels = nChannels;
295 fHitfinder.asic = fPars->setup.asic;
298 size_t nLandauTableEntries = fPars->setup.landauTable.values.size();
299 fHitfinder.landauTableSize = nLandauTableEntries;
300 fHitfinder.landauStepSize = fPars->setup.landauTable.stepSize;
301 fHitfinder.landauTable.reset(nLandauTableEntries, xpu::buf_io);
302 q.copy(fPars->setup.landauTable.values.data(), fHitfinder.landauTable.get(), nLandauTableEntries);
305 fHitfinder.localToGlobalRotationByModule.reset(9 * nModules, xpu::buf_io);
306 fHitfinder.localToGlobalTranslationByModule.reset(3 * nModules, xpu::buf_io);
309 xpu::h_view hRot(fHitfinder.localToGlobalRotationByModule);
310 xpu::h_view hTrans(fHitfinder.localToGlobalTranslationByModule);
311 for (
int m = 0; m < nModules; m++) {
312 const auto&
module = fPars->setup.modules.at(m);
313 std::copy_n(module.localToGlobal.rotation.data(), 9, &hRot[m * 9]);
314 std::copy_n(module.localToGlobal.translation.data(), 3, &hTrans[m * 3]);
318 q.copy(fHitfinder.localToGlobalRotationByModule, xpu::h2d);
319 q.copy(fHitfinder.localToGlobalTranslationByModule, xpu::h2d);
322 fHitfinder.sensorPars.reset(nModules, xpu::buf_io);
323 xpu::h_view hSensorPars(fHitfinder.sensorPars);
324 for (
int m = 0; m < nModules; m++) {
325 const auto&
module = fPars->setup.modules.at(m);
326 auto& gpuPars = hSensorPars[m];
327 gpuPars.dY =
module.dY;
328 gpuPars.pitch =
module.pitch;
329 gpuPars.stereoF =
module.stereoF;
330 gpuPars.stereoB =
module.stereoB;
331 gpuPars.lorentzF =
module.lorentzF;
332 gpuPars.lorentzB =
module.lorentzB;
334 q.copy(fHitfinder.sensorPars, xpu::h2d);
337 fHitfinder.maxClusterTimeErrorByModuleSide.reset(nModuleSides, xpu::buf_device);
339 fHitfinder.monitor.reset(1, xpu::buf_io);
346 L_(debug) <<
"STS Hitfinder Chain: Allocating dynamic memory for " << maxNDigisPerModule <<
" digis per module and "
347 << nDigisTotal <<
" digis in total";
350 xpu::scoped_timer t_(
"Allocate");
355 fPars->memory.maxNDigisPerModule = maxNDigisPerModule;
356 fPars->memory.maxNDigisPerTS = nDigisTotal;
359 const int nChannels = fPars->setup.nChannels / 2;
360 const int nModules = fPars->setup.modules.size();
361 const int nModuleSides = 2 * nModules;
363 const size_t maxNClustersPerModule = fPars->memory.MaxNClustersPerModule();
364 const size_t maxNHitsPerModule = fPars->memory.MaxNHitsPerModule();
367 const size_t maxHitsTotal = maxNHitsPerModule * nModules;
370 fHitfinder.digiOffsetPerModule.reset(nModuleSides + 1, xpu::buf_io);
371 fHitfinder.digisPerModule.reset(nDigisTotal, xpu::buf_io);
373 fHitfinder.digisPerModuleTmp.reset(nDigisTotal, xpu::buf_device);
374 fHitfinder.digiConnectorsPerModule.reset(nDigisTotal, xpu::buf_device);
376 fHitfinder.channelOffsetPerModule.reset(nModuleSides * nChannels, xpu::buf_device);
379 fHitfinder.maxClustersPerModule = maxNClustersPerModule;
381 fHitfinder.clusterIdxPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device);
382 fHitfinder.clusterIdxPerModuleTmp.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device);
383 fHitfinder.clusterIdxSortedPerModule.reset(nModuleSides, xpu::buf_device);
385 fHitfinder.clusterDataPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_io);
386 fHitfinder.nClustersPerModule.reset(nModuleSides, xpu::buf_io);
389 fHitfinder.hitsAllocatedPerModule = maxNHitsPerModule;
390 fHitfinder.hitsPerModule.reset(maxNHitsPerModule * nModules, xpu::buf_device);
391 fHitfinder.nHitsPerModule.reset(nModules, xpu::buf_io);
393 fHitfinder.hitsFlatCapacity = maxHitsTotal;
394 fHitfinder.hitsFlat.reset(maxHitsTotal, xpu::buf_pinned);
477 L_(debug) <<
"STS Hitfinder Chain: Flattening digis";
478 xpu::scoped_timer t_(
"Flatten Digis");
479 xpu::t_add_bytes(digis.size_bytes());
481 const auto& modules = fPars->setup.modules;
483 size_t nModules = modules.size();
484 size_t nModuleSides = nModules * 2;
485 int nChannelsPerSide = fPars->setup.nChannels / 2;
486 xpu::h_view pDigisFlat(fHitfinder.digisPerModule);
488 xpu::h_view pMdigiOffset(fHitfinder.digiOffsetPerModule);
490 for (
size_t m = 1; m < nModuleSides + 1; m++) {
491 pMdigiOffset[m] = pMdigiOffset[m - 1] + digiMap.
nDigisPerModule.at(m - 1);
495 for (
auto&
v : digiOffsetsPerThread)
496 v.resize(nModuleSides);
497 for (
size_t i = 1; i < digiOffsetsPerThread.size(); i++) {
498 for (
size_t m = 0; m < nModuleSides; m++) {
499 for (
size_t j = 0; j < i; j++) {
508 auto& digiOffsets = digiOffsetsPerThread.at(threadId);
511 for (
size_t i = 0; i < digis.size(); i++) {
512 const auto& digi = digis[i];
516 if (moduleIndex == InvalidModule)
continue;
518 bool isFront = digi.GetChannel() < nChannelsPerSide;
519 moduleIndex += isFront ? 0 : nModules;
521 size_t moduleOffset = pMdigiOffset[moduleIndex];
522 size_t digiOffset = digiOffsets[moduleIndex]++;
524 pDigisFlat[moduleOffset + digiOffset] = digi;
533 for (
size_t i = pMdigiOffset[nModules]; i < pMdigiOffset[nModuleSides]; i++) {
534 auto& digi = pDigisFlat[i];
535 digi.SetChannel(digi.GetChannel() - nChannelsPerSide);
541 auto& hfc = fHitfinder;
542 xpu::h_view nHits(hfc.nHitsPerModule);
544 size_t nHitsTotal = 0;
545 for (
int m = 0; m < hfc.nModules; m++)
546 nHitsTotal += GetNHits(nHits, m);
547 L_(debug) <<
"STS Hitfinder Chain: Flattening " << nHitsTotal <<
" hits";
549 if (nHitsTotal > hfc.hitsFlatCapacity)
550 throw ProcessingError(
"STS Hitfinder Chain: Not enough memory for hits: {} > {}", nHitsTotal, hfc.hitsFlatCapacity);
554 xpu::h_view hitBuffer(hfc.hitsFlat);
555 gsl::span
hits(hitBuffer.data(), nHitsTotal);
565 if (xpu::device::active().backend() == xpu::cpu) {
569 xpu::scoped_timer t_(
"Flatten Hits");
570 xpu::t_add_bytes(nHitsTotal *
sizeof(
sts::Hit));
572 for (
int m = 0; m < hfc.nModules; m++) {
574 for (
int i = 0; i < m; i++) {
575 offset += GetNHits(nHits, i);
577 std::copy_n(hfc.hitsPerModule.get() + hfc.hitsAllocatedPerModule * m, GetNHits(nHits, m),
hits.begin() + offset);
588 size_t nHitsCopied = 0;
589 for (
int m = 0; m < hfc.nModules; m++) {
590 size_t numHitsInModule = GetNHits(nHits, m);
591 queue.copy(hfc.hitsPerModule.get() + hfc.hitsAllocatedPerModule * m,
hits.data() + nHitsCopied, numHitsInModule);
592 nHitsCopied += numHitsInModule;
601 fHitOffsets.push_back(0);
602 for (
int m = 0; m < hfc.nModules; m++)
603 fHitOffsets.push_back(fHitOffsets.back() + GetNHits(nHits, m));
606 for (
auto& m : fPars->setup.modules)
607 fAddresses.push_back(m.address);
610 return partitionedHits;
615 auto& hfc = fHitfinder;
617 xpu::h_view nClusters(hfc.nClustersPerModule);
618 int maxClustersPerModule = hfc.maxClustersPerModule;
619 sts::Cluster* clusters = hfc.clusterDataPerModule.get();
621 int nModuleSides = nClusters.size();
623 L_(debug) <<
"Copy clusters from " << nModuleSides <<
" modules sides";
625 std::vector<u32> addresses;
626 for (
size_t i = 0; i < 2; i++) {
627 for (
auto& m : fPars->setup.modules)
628 addresses.push_back(m.address);
631 size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
635 std::vector<sts::Cluster> clustersFlat;
636 clustersFlat.resize(nClustersTotal);
638 for (
int m = 0; m < nModuleSides; m++) {
639 size_t nClustersInModule = nClusters[m];
640 queue.copy(clusters + maxClustersPerModule * m, clustersFlat.data() + offset, nClustersInModule);
641 offset += nClustersInModule;
645 std::vector<size_t> sizes(nClusters.begin(), nClusters.end());
657 constexpr size_t MinHitsPerStream = 10;
660 int streamsPerSensor = std::max(1, nstreamsMax / nSensors);
663 auto nextMultipleOf = [](
int x,
int mult) {
673 return x + mult - rem;
676 fHitStreamOffsets.clear();
677 fStreamAddresses.clear();
679 fHitStreamOffsets.push_back(0);
680 for (
size_t sensor = 0; sensor <
hits.NPartitions(); sensor++) {
681 auto [hitsSensor, addr] =
hits.Partition(sensor);
682 const size_t nHits = hitsSensor.size();
685 if (nHits < streamsPerSensor * MinHitsPerStream) {
686 fHitStreamOffsets.push_back(fHitStreamOffsets.back() + nHits);
687 fStreamAddresses.push_back(addr);
691 size_t rem = nHits % streamsPerSensor;
693 size_t streamSize = nHits / streamsPerSensor;
694 for (
int s = 0; s < streamsPerSensor; s++) {
695 fHitStreamOffsets.push_back(fHitStreamOffsets.back() + streamSize);
696 fStreamAddresses.push_back(addr);
700 size_t streamSize = nextMultipleOf(nHits, streamsPerSensor) / streamsPerSensor;
701 size_t nHitsLeft = nHits;
702 for (
int s = 0; s < streamsPerSensor; s++) {
703 fHitStreamOffsets.push_back(fHitStreamOffsets.back() + std::min(nHitsLeft, streamSize));
704 fStreamAddresses.push_back(addr);
705 nHitsLeft -= streamSize;
780 xpu::h_view digisPerModule(fHitfinder.digisPerModule);
781 xpu::h_view digiOffsetPerModule(fHitfinder.digiOffsetPerModule);
782 for (
size_t m = 0; m < fPars->setup.modules.size() * 2; m++) {
783 int nChannels = fPars->setup.nChannels / 2;
785 std::vector<u32> expectedChannelOffsets(nChannels, 0);
787 int offset = digiOffsetPerModule[m];
788 int nDigis = digiOffsetPerModule[m + 1] - offset;
789 gsl::span<CbmStsDigi> digis(&digisPerModule[offset], nDigis);
790 gsl::span<u32> channelOffsets = channelOffsetsByModule.subspan(m * nChannels, nChannels);
792 if (nDigis == 0)
continue;
794 if (channelOffsets[0] != 0) {
795 throw FatalError(
"Module {}: First channel offset is not 0", m);
798 int chan = digis[0].GetChannel();
799 for (
int i = 0; i < nDigis; i++) {
800 if (digis[i].GetChannel() != chan) {
801 while (chan < digis[i].GetChannel()) {
803 expectedChannelOffsets[chan] = i;
807 while (chan < nChannels) {
809 expectedChannelOffsets[chan] = nDigis;
812 for (
int i = 0; i < nChannels; i++) {
813 if (channelOffsets[i] != expectedChannelOffsets[i]) {
814 throw FatalError(
"Module {}: Channel offset for channel {} is {} but should be {}", m, i, channelOffsets[i],
815 expectedChannelOffsets[i]);