CbmRoot
Loading...
Searching...
No Matches
HitfinderChain.cxx
Go to the documentation of this file.
1/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
2 SPDX-License-Identifier: GPL-3.0-only
3 Authors: Felix Weiglhofer [committer], Kilian Hunold */
4
6
8#include "PODVector.h"
11
12#include <numeric>
13
14using namespace cbm::algo;
17
// sts::HitfinderChain::SetParameters — body only; the signature line was lost
// in this extraction (see member index: SetParameters(const HitfinderChainPars&)).
// Stores the parameters and resolves the memory allocation mode.
19{
20 fPars.emplace(parameters);
22 auto& memoryPars = fPars->memory;
23
// "Auto" mode: pick dynamic allocation on the CPU backend (reallocation is
// cheap there), static allocation otherwise (GPU backends).
24 if (memoryPars.IsAuto()) {
25 if (xpu::device().active().backend() == xpu::cpu) {
26 memoryPars.allocationMode = RecoParams::AllocationMode::Dynamic;
27 }
28 else {
29 memoryPars.allocationMode = RecoParams::AllocationMode::Static;
30 }
31 }
32
// Reject any allocation mode that is neither dynamic nor static.
33 if (!memoryPars.IsDynamic() && !memoryPars.IsStatic()) {
34 throw std::runtime_error(
35 fmt::format("STS Hitfinder Chain: Unknown allocation mode: {}", util::ToString(memoryPars.allocationMode)));
36 }
37
// Static mode: allocate all device buffers once, up front, at the configured maxima.
38 if (memoryPars.IsStatic()) {
39 AllocateDynamic(memoryPars.maxNDigisPerModule, memoryPars.maxNDigisPerTS);
40 }
41}
42
// sts::HitfinderChain destructor — body only; the signature line was lost in
// this extraction. Resets the hitfinder struct and re-uploads the emptied
// struct to xpu constant memory so no device buffers are still referenced there.
44{
45 // Explicitly free buffers in constant memory.
46 // This avoids an issue in xpu with teardown order of static variables when using CPU.
47 fHitfinder = {};
48 xpu::set<TheHitfinder>(fHitfinder);
49}
50
// Main entry point: runs the full STS hit finding chain on one timeslice of
// digis and returns the hits (and, if storeClusters is set, the clusters).
// NOTE(review): several original lines are absent from this extraction
// (hyperlinked lines were dropped), e.g. original lines 53, 143, 217-218, 224
// and 264 — the latter presumably declares the `clusters` variable used at
// line 265. Confirm against the repository before editing.
51sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi> digis, bool storeClusters)
52{
54
55 Result result;
56
57 size_t nModules = fPars->setup.modules.size();
58 size_t nModuleSides = nModules * 2;
59 size_t nDigisTotal = digis.size();
60
61 // Getting the digis on the GPU requires 3 steps
62 // 1. Sort digis into buckets by module
63 DigiMap digiMap = CountDigisPerModules(digis);
64 // 2. Once we know number of digis per module, we can allocate
65 // the dynamic buffers on the gpu, as the buffer sizes depend on that value
66 if (fPars->memory.IsDynamic())
67 AllocateDynamic(digiMap.maxNDigisPerModule, nDigisTotal);
68 else {
// Static mode: buffers were sized in advance; refuse input that exceeds them.
69 if (digiMap.maxNDigisPerModule > fPars->memory.maxNDigisPerModule) {
70 throw ProcessingError("STS Hitfinder Chain: Too many digis per module for static allocation: {} > {}",
71 digiMap.maxNDigisPerModule, fPars->memory.maxNDigisPerModule);
72 }
73 if (nDigisTotal > fPars->memory.maxNDigisPerTS) {
74 throw ProcessingError("STS Hitfinder Chain: Too many digis per timeslice for static allocation: {} > {}",
75 nDigisTotal, fPars->memory.maxNDigisPerTS);
76 }
77 }
78 // 3. Copy digis into flat array with offsets per module
79 FlattenDigis(digis, digiMap);
80
// Debug-only consistency check, active at trace log level.
81#ifdef CBM_ONLINE_USE_FAIRLOGGER
82 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
83#else
84 if (Opts().LogLevel() == trace) {
85#endif // CBM_ONLINE_USE_FAIRLOGGER
86 EnsureDigiOffsets(digiMap);
87 }
88
89 xpu::queue queue;
90
91 // Set constants
92 auto& hfc = fHitfinder;
// Hit-count upper bound is clamped to the allocated bucket size; overflow is
// detected later via the device monitor.
93 hfc.maxHitsPerModule = fPars->memory.NHitsUpperBound(digiMap.maxNDigisPerModule);
94 if (hfc.maxHitsPerModule > hfc.hitsAllocatedPerModule) {
95 L_(error) << fmt::format("STS Hitfinder Chain: Too many hits per module for static allocation: {} > {}",
96 hfc.maxHitsPerModule, hfc.hitsAllocatedPerModule);
97 hfc.maxHitsPerModule = hfc.hitsAllocatedPerModule;
98 }
99
100 // Clear buffers
101 // Not all buffers have to initialized, but useful for debugging
102 L_(debug) << "STS Hitfinder Chain: Clearing buffers...";
103 // xpu::memset(hfc.digisPerModule, 0);
104 // xpu::memset(hfc.digisPerModuleTmp, 0);
105 // xpu::memset(hfc.digisSortedPerModule, 0);
106 // xpu::memset(hfc.digiOffsetPerModule, 0);
107 queue.memset(hfc.monitor, 0);
108 queue.memset(hfc.digiConnectorsPerModule, 0);
109 queue.memset(hfc.channelOffsetPerModule, 0);
110 // queue.memset(hfc.clusterIdxPerModule, 0);
111 // xpu::memset(hfc.clusterIdxPerModuleTmp, 0);
112 // xpu::memset(hfc.clusterIdxSortedPerModule, 0);
113 // xpu::memset(hfc.clusterDataPerModule, 0);
114 queue.memset(hfc.nClustersPerModule, 0);
115 // xpu::memset(hfc.hitsPerModule, 0);
116 queue.memset(hfc.nHitsPerModule, 0);
117 queue.memset(hfc.maxClusterTimeErrorByModuleSide, 0);
118
119 L_(debug) << "STS Hitfinder Chain: Copy digis buffers...";
// Publish the (now configured) hitfinder struct to device constant memory.
120 xpu::set<TheHitfinder>(fHitfinder);
121
122 // Only copy the actual number of digis, not the whole allocated buffer
123 // TODO add support in xpu, for buffer copies with offset + size
// On the CPU backend host and device pointers alias, so the copy is skipped.
124 const CbmStsDigi* digisH = xpu::h_view(hfc.digisPerModule).data();
125 CbmStsDigi* digisD = hfc.digisPerModule.get();
126 if (digisH != digisD) queue.copy(digisH, digisD, nDigisTotal);
127 queue.copy(hfc.digiOffsetPerModule, xpu::h2d);
128
129 L_(debug) << "STS Hitfinder Chain: Sort Digis...";
130 // TODO: skip temporary buffers and sort directly into digisSortedPerModule
131
132 queue.launch<SortDigis>(xpu::n_blocks(nModuleSides));
133 xpu::k_add_bytes<SortDigis>(nDigisTotal * sizeof(CbmStsDigi));
// Debug-only: copy digis back and verify sort order (original line 144,
// presumably the EnsureDigisSorted() call, is missing from this extraction).
134#ifdef CBM_ONLINE_USE_FAIRLOGGER
135 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
136#else
137 if (Opts().LogLevel() == trace) {
138#endif // CBM_ONLINE_USE_FAIRLOGGER
139 L_(trace) << "Ensuring STS digis are sorted...";
140 queue.copy(hfc.digisPerModule, xpu::d2h);
141 queue.copy(hfc.digiOffsetPerModule, xpu::d2h);
142 queue.wait();
144 }
145
146 L_(debug) << "STS Hitfinder Chain: Find Clusters...";
// Either one monolithic kernel, or three smaller kernels (selected via params).
147 if (!Params().sts.findClustersMultiKernels) {
148 queue.launch<FindClusters>(xpu::n_blocks(nModuleSides));
149 }
150 else {
151 queue.launch<ChannelOffsets>(xpu::n_blocks(nModuleSides));
152 xpu::k_add_bytes<ChannelOffsets>(nDigisTotal * sizeof(CbmStsDigi));
153 queue.launch<CreateDigiConnections>(xpu::n_threads(nDigisTotal));
154 xpu::k_add_bytes<CreateDigiConnections>(nDigisTotal * sizeof(CbmStsDigi));
155 queue.launch<CreateClusters>(xpu::n_threads(nDigisTotal));
156 xpu::k_add_bytes<CreateClusters>(nDigisTotal * sizeof(CbmStsDigi));
157 }
// Debug-only: validate channel offsets and cluster indices at trace level.
158#ifdef CBM_ONLINE_USE_FAIRLOGGER
159 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
160#else
161 if (Opts().LogLevel() == trace) {
162#endif // CBM_ONLINE_USE_FAIRLOGGER
163 L_(trace) << "Ensuring STS channel offsets correct...";
164 xpu::buffer_prop propsOffset{hfc.channelOffsetPerModule};
165 std::vector<u32> channelOffsetPerModule;
166 channelOffsetPerModule.resize(propsOffset.size());
167 queue.copy(hfc.channelOffsetPerModule.get(), channelOffsetPerModule.data(), propsOffset.size());
168 queue.wait();
169 EnsureChannelOffsets(channelOffsetPerModule);
170
171 L_(trace) << "Ensuring STS clusters are ok...";
172 xpu::buffer_prop props{hfc.clusterIdxPerModule};
173
174 std::vector<ClusterIdx> clusterIdxPerModule;
175 clusterIdxPerModule.resize(props.size());
176 std::vector<PaddedToCacheLine<int>> nClustersPerModule;
177 nClustersPerModule.resize(fPars->setup.modules.size() * 2, 0);
178
179 queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size());
180 queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size());
181 queue.wait();
182 EnsureClustersSane(clusterIdxPerModule, nClustersPerModule);
183 }
184
185 L_(debug) << "STS Hitfinder Chain: Sort Clusters...";
186 queue.launch<SortClusters>(xpu::n_blocks(nModuleSides));
187
188 L_(debug) << "STS Hitfinder Chain: Find Hits...";
189 queue.copy(hfc.nClustersPerModule, xpu::d2h);
190 queue.wait();
191 xpu::h_view nClusters(hfc.nClustersPerModule);
192 size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
193 xpu::k_add_bytes<SortClusters>(nClustersTotal * sizeof(ClusterIdx));
194
// FindHits pairs front-side with back-side clusters, so it is sized by the
// front-side cluster count only.
195 size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
196
// On CPU a fixed number of chunks per module gives better load balancing than
// one thread per front-side cluster.
197 bool isCpu = xpu::device::active().backend() == xpu::cpu;
198 // bool isCpu = false;
199 int threadsCPU = kFindHitsChunksPerModule * nModules;
200 xpu::grid findHitsG = xpu::n_threads(isCpu ? threadsCPU : nClustersFront);
201 queue.launch<FindHits>(findHitsG);
202 xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster));
203
204 queue.copy(hfc.nHitsPerModule, xpu::d2h);
205 queue.wait();
206
207 auto hits = FlattenHits(queue);
208 queue.copy(hfc.monitor, xpu::d2h);
209
210 queue.wait();
211
212 // TODO: make this step optional / hide behind flag
// NOTE(review): the calls that split and sort the hit streams (original lines
// 217-218) and the trace-level EnsureHitsSorted call (line 224) are missing
// from this extraction; only the timer scope and the log-level guard remain.
213 {
214 xpu::scoped_timer hitStreams_{"Sort Hits"};
215
216 // TODO: Make number of streams configurable
219#ifdef CBM_ONLINE_USE_FAIRLOGGER
220 if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
221#else
222 if (Opts().LogLevel() == trace) {
223#endif // CBM_ONLINE_USE_FAIRLOGGER
225 }
226 }
227
228 xpu::h_view monitor(hfc.monitor);
229
230 // Note: Checking for cluster bucket overflow is probably paranoid
231 // as we allocate enough memory for one cluster per digi.
232 if (monitor[0].nClusterBucketOverflow > 0) {
233 L_(error) << "STS Hitfinder Chain: Cluster bucket overflow! " << monitor[0].nClusterBucketOverflow
234 << " clusters were discarded!";
235
// Clamp the per-module counts so downstream code never reads past a bucket.
236 for (size_t m = 0; m < nModules * 2; m++) {
237 if (static_cast<size_t>(nClusters[m]) > hfc.maxClustersPerModule) {
238 L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << *nClusters[m]
239 << " (of " << hfc.maxClustersPerModule << " max)"
240 << " clusters!";
241 nClusters[m] = hfc.maxClustersPerModule;
242 }
243 }
244 }
245
246 xpu::h_view nHits(hfc.nHitsPerModule);
247 if (monitor[0].nHitBucketOverflow > 0) {
248 L_(error) << "STS Hitfinder Chain: Hit bucket overflow! " << monitor[0].nHitBucketOverflow
249 << " hits were discarded!";
250
251 for (size_t m = 0; m < nModules; m++) {
252 const size_t nHitsModule = nHits[m];
253 const size_t nClustersModule = nClusters[m];
254 const float hitsPerCluster = nClustersModule > 0 ? nHitsModule / float(nClustersModule) : 0;
255 if (nHitsModule > hfc.maxHitsPerModule) {
256 L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << nHitsModule << " (of "
257 << hfc.maxHitsPerModule << " max) hits! "
258 << "Module has " << nClustersModule << " clusters. " << hitsPerCluster << " hits per cluster.";
259 nHits[m] = hfc.maxHitsPerModule;
260 }
261 }
262 }
263
// `clusters` is declared on original line 264, which is missing from this
// extraction (presumably: PartitionedVector<sts::Cluster> clusters;).
265 if (storeClusters) clusters = FlattenClusters(queue);
266
267 size_t nHitsTotal = std::accumulate(nHits.begin(), nHits.end(), 0);
268 L_(debug) << "Timeslice contains " << nHitsTotal << " STS hits and " << nClustersTotal << " STS clusters";
269
// Assemble the result: hits, optional clusters and monitoring counters.
270 result.hits = std::move(hits);
271 result.clusters = std::move(clusters);
272 result.monitor.nClusterTotal = nClustersTotal;
273 result.monitor.nHitsTotal = result.hits.NElements();
274 result.monitor.SetDeviceMon(monitor[0]);
275 return result;
276}
277
// Guard: throws if SetParameters() was never called — body only; the signature
// line (presumably an EnsureParameters()-style checker) was lost in this extraction.
279{
280 if (fPars == std::nullopt) throw std::runtime_error("sts::HitfinderChain: Parameters not set. Can't continue!");
281}
282
// One-time setup of per-run-constant device data — body only; the signature
// line was lost in this extraction (likely a fixed/static allocation helper —
// confirm against the repository). Uploads the Landau energy-loss table, the
// local-to-global transformations and the sensor parameters to the device.
284{
286
287 // Shorthands for common constants
288 const int nChannels = fPars->setup.nChannels / 2; // Only count channels on one side of the module
289 const int nModules = fPars->setup.modules.size();
290 const int nModuleSides = 2 * nModules; // Number of module front + backsides
291
292 xpu::queue q;
293
294 // Set GPU constants
295 fHitfinder.nModules = nModules;
296 fHitfinder.nChannels = nChannels;
297 fHitfinder.asic = fPars->setup.asic;
298
299 // Copy landau table
300 size_t nLandauTableEntries = fPars->setup.landauTable.values.size();
301 fHitfinder.landauTableSize = nLandauTableEntries;
302 fHitfinder.landauStepSize = fPars->setup.landauTable.stepSize;
303 fHitfinder.landauTable.reset(nLandauTableEntries, xpu::buf_io);
304 q.copy(fPars->setup.landauTable.values.data(), fHitfinder.landauTable.get(), nLandauTableEntries);
305
306 // Copy transformation matrix
307 fHitfinder.localToGlobalRotationByModule.reset(9 * nModules, xpu::buf_io);
308 fHitfinder.localToGlobalTranslationByModule.reset(3 * nModules, xpu::buf_io);
309
310 // - Copy matrix into flat array
// 3x3 rotation (9 values) and translation vector (3 values) per module.
311 xpu::h_view hRot(fHitfinder.localToGlobalRotationByModule);
312 xpu::h_view hTrans(fHitfinder.localToGlobalTranslationByModule);
313 for (int m = 0; m < nModules; m++) {
314 const auto& module = fPars->setup.modules.at(m);
315 std::copy_n(module.localToGlobal.rotation.data(), 9, &hRot[m * 9]);
316 std::copy_n(module.localToGlobal.translation.data(), 3, &hTrans[m * 3]);
317 }
318
319 // - Then copy to GPU
320 q.copy(fHitfinder.localToGlobalRotationByModule, xpu::h2d);
321 q.copy(fHitfinder.localToGlobalTranslationByModule, xpu::h2d);
322
323 // Copy Sensor Parameteres
324 fHitfinder.sensorPars.reset(nModules, xpu::buf_io);
325 xpu::h_view hSensorPars(fHitfinder.sensorPars);
326 for (int m = 0; m < nModules; m++) {
327 const auto& module = fPars->setup.modules.at(m);
328 auto& gpuPars = hSensorPars[m];
329 gpuPars.dY = module.dY;
330 gpuPars.pitch = module.pitch;
331 gpuPars.stereoF = module.stereoF;
332 gpuPars.stereoB = module.stereoB;
333 gpuPars.lorentzF = module.lorentzF;
334 gpuPars.lorentzB = module.lorentzB;
335 }
336 q.copy(fHitfinder.sensorPars, xpu::h2d);
337
338 // Time errors
339 fHitfinder.maxClusterTimeErrorByModuleSide.reset(nModuleSides, xpu::buf_device);
340
341 fHitfinder.monitor.reset(1, xpu::buf_io);
342
// All copies above are asynchronous on q; block until they finished.
343 q.wait();
344}
345
// Allocates (or re-allocates) all size-dependent device buffers for the given
// digi counts. Called per-timeslice in dynamic mode, or once with the
// configured maxima in static mode. NOTE(review): original line 350 is missing
// from this extraction (hyperlinked line dropped).
346void sts::HitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal)
347{
348 L_(debug) << "STS Hitfinder Chain: Allocating dynamic memory for " << maxNDigisPerModule << " digis per module and "
349 << nDigisTotal << " digis in total";
351
352 xpu::scoped_timer t_("Allocate");
353
354 // TODO: some of these buffers have a constant size and can be allocated statically.
355 // Just the data they contain is static.
356
// Remember the sizes we allocated for; static mode validates against these.
357 fPars->memory.maxNDigisPerModule = maxNDigisPerModule;
358 fPars->memory.maxNDigisPerTS = nDigisTotal;
359
360 // Shorthands for common constants
361 const int nChannels = fPars->setup.nChannels / 2; // Only count channels on one side of the module
362 const int nModules = fPars->setup.modules.size();
363 const int nModuleSides = 2 * nModules; // Number of module front + backsides
364
365 const size_t maxNClustersPerModule = fPars->memory.MaxNClustersPerModule();
366 const size_t maxNHitsPerModule = fPars->memory.MaxNHitsPerModule();
367
368 // TODO: this number should be much lower, estimate from max digis per TS
369 const size_t maxHitsTotal = maxNHitsPerModule * nModules;
370
371 // Allocate Digi Buffers
372 fHitfinder.digiOffsetPerModule.reset(nModuleSides + 1, xpu::buf_io);
373 fHitfinder.digisPerModule.reset(nDigisTotal, xpu::buf_io);
374
375 fHitfinder.digisPerModuleTmp.reset(nDigisTotal, xpu::buf_device);
376 fHitfinder.digiConnectorsPerModule.reset(nDigisTotal, xpu::buf_device);
377
378 fHitfinder.channelOffsetPerModule.reset(nModuleSides * nChannels, xpu::buf_device);
379
380 // Allocate Cluster Buffers
381 fHitfinder.maxClustersPerModule = maxNClustersPerModule;
382
383 fHitfinder.clusterIdxPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device);
384 fHitfinder.clusterIdxPerModuleTmp.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device);
385 fHitfinder.clusterIdxSortedPerModule.reset(nModuleSides, xpu::buf_device);
386
387 fHitfinder.clusterDataPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_io);
388 fHitfinder.nClustersPerModule.reset(nModuleSides, xpu::buf_io);
389
390 // Allocate Hit Buffers
391 fHitfinder.hitsAllocatedPerModule = maxNHitsPerModule;
392 fHitfinder.hitsPerModule.reset(maxNHitsPerModule * nModules, xpu::buf_device);
393 fHitfinder.nHitsPerModule.reset(nModules, xpu::buf_io);
394
// Pinned host buffer for the flattened hit output (fast d2h copies).
395 fHitfinder.hitsFlatCapacity = maxHitsTotal;
396 fHitfinder.hitsFlat.reset(maxHitsTotal, xpu::buf_pinned);
397}
398
// Counts digis per module side and builds the address->index map — body only;
// the signature (DigiMap CountDigisPerModules(gsl::span<const CbmStsDigi>))
// and the CBM_PARALLEL() macro line (original line 431) were lost in this
// extraction.
400{
401 L_(debug) << "STS Hitfinder Chain: Sorting " << digis.size() << " digis into modules";
402 xpu::scoped_timer t_("Count Digis By Module");
403 xpu::t_add_bytes(digis.size_bytes());
404
405 size_t nModules = fPars->setup.modules.size();
406 size_t nModuleSides = nModules * 2;
407 DigiMap digiMap;
408 digiMap.nDigisPerModule.resize(nModuleSides);
409
410 // Create map from module address to index
411 // This could be done just once in the beginning,
412 // but overhead is negligible compared to the digi copying
// Packed digi addresses are 17 bits wide, hence the 1 << 17 table size.
413 digiMap.addrToIndex.resize(1 << 17, InvalidModule);
414 for (size_t m = 0; m < nModules; m++) {
415 const auto& module = fPars->setup.modules[m];
416 i32 paddr = CbmStsAddress::PackDigiAddress(module.address);
417 digiMap.addrToIndex[paddr] = u16(m);
418 }
419
420 int nChannelsPerSide = fPars->setup.nChannels / 2;
421
422 // TODO: try to parallise
423 // Count digis per module in parallel
424 // Then calculate offset per module
425 // Write digis directly to flat array in parallel
426 // Use atomic add to increment offset per module
// Per-thread counters avoid atomics; they are merged after the parallel region.
427 digiMap.nDigisPerThread.resize(openmp::GetMaxThreads());
428 for (auto& v : digiMap.nDigisPerThread)
429 v.resize(nModuleSides, 0);
430
// Parallel region opens here (CBM_PARALLEL() on original line 431, dropped by
// the extraction).
432 {
433 int threadId = openmp::GetThreadNum(); // Move out of the loop, seems to create small overhead
434 auto& nDigis = digiMap.nDigisPerThread.at(threadId);
435
436 CBM_OMP(for schedule(static))
437 for (size_t i = 0; i < digis.size(); i++) {
438 const auto& digi = digis[i];
439 u16 moduleIndex = digiMap.ModuleIndex(digi);
440
// Digis from modules not in the setup are silently skipped.
441 if (moduleIndex == InvalidModule) continue;
442 // if (moduleIndex == InvalidModule || digi.GetAddress() == 0x10000002) continue;
443
444
// Front side occupies indices [0, nModules), back side [nModules, 2*nModules).
445 bool isFront = digi.GetChannel() < nChannelsPerSide;
446 nDigis[moduleIndex + (isFront ? 0 : nModules)]++;
447 }
448 }
449
450
451 // Sum up digis per module
452 for (auto& v : digiMap.nDigisPerThread) {
453 for (size_t m = 0; m < nModuleSides; m++) {
454 digiMap.nDigisPerModule[m] += v[m];
455 }
456 }
457
458 // if (nPulsers > 0) L_(warning) << "STS Hitfinder: Discarded " << nPulsers << " pulser digis";
459
460 // Print digi counts per module
461 for (const auto& mod : fPars->setup.modules) {
462 // FIXME should use module index here
463 i32 moduleAddr = CbmStsAddress::PackDigiAddress(mod.address);
464 u16 moduleIndex = digiMap.addrToIndex[moduleAddr];
465 L_(debug) << "Module " << moduleAddr << " has " << digiMap.nDigisPerModule[moduleIndex] << " front digis and "
466 << digiMap.nDigisPerModule[moduleIndex + nModules] << " back digis";
467 }
468
// Track the largest bucket; it drives the dynamic allocation sizes.
469 digiMap.maxNDigisPerModule = 0;
470 for (const auto& nDigis : digiMap.nDigisPerModule) {
471 digiMap.maxNDigisPerModule = std::max(digiMap.maxNDigisPerModule, nDigis);
472 }
473
474 return digiMap;
475}
476
// Copies the digis into one flat host buffer, bucketed by module side, and
// shifts back-side channel numbers into the front-side range.
// NOTE(review): original lines 506-507 (blank + CBM_PARALLEL() opening the
// parallel region at line 508) are missing from this extraction.
477void sts::HitfinderChain::FlattenDigis(gsl::span<const CbmStsDigi> digis, DigiMap& digiMap)
478{
479 L_(debug) << "STS Hitfinder Chain: Flattening digis";
480 xpu::scoped_timer t_("Flatten Digis");
481 xpu::t_add_bytes(digis.size_bytes());
482
483 const auto& modules = fPars->setup.modules;
484
485 size_t nModules = modules.size();
486 size_t nModuleSides = nModules * 2;
487 int nChannelsPerSide = fPars->setup.nChannels / 2;
488 xpu::h_view pDigisFlat(fHitfinder.digisPerModule); // Final input copied the GPU
489
// Prefix sum over the per-module counts gives each module side's start offset.
490 xpu::h_view pMdigiOffset(fHitfinder.digiOffsetPerModule);
491 pMdigiOffset[0] = 0;
492 for (size_t m = 1; m < nModuleSides + 1; m++) {
493 pMdigiOffset[m] = pMdigiOffset[m - 1] + digiMap.nDigisPerModule.at(m - 1);
494 }
495
// Each thread writes into its own sub-range of every bucket; the per-thread
// start offsets are the prefix sums of the per-thread counts.
496 std::vector<std::vector<size_t>> digiOffsetsPerThread(openmp::GetMaxThreads());
497 for (auto& v : digiOffsetsPerThread)
498 v.resize(nModuleSides);
499 for (size_t i = 1; i < digiOffsetsPerThread.size(); i++) {
500 for (size_t m = 0; m < nModuleSides; m++) {
501 for (size_t j = 0; j < i; j++) {
502 digiOffsetsPerThread[i][m] += digiMap.nDigisPerThread[j][m];
503 }
504 }
505 }
506
508 {
509 int threadId = openmp::GetThreadNum(); // Move out of the loop, seems to create small overhead
510 auto& digiOffsets = digiOffsetsPerThread.at(threadId);
511
512 CBM_OMP(for schedule(static))
513 for (size_t i = 0; i < digis.size(); i++) {
514 const auto& digi = digis[i];
515
516 u16 moduleIndex = digiMap.ModuleIndex(digi);
517
518 if (moduleIndex == InvalidModule) continue;
519
520 bool isFront = digi.GetChannel() < nChannelsPerSide;
521 moduleIndex += isFront ? 0 : nModules;
522
523 size_t moduleOffset = pMdigiOffset[moduleIndex];
524 size_t digiOffset = digiOffsets[moduleIndex]++;
525
526 pDigisFlat[moduleOffset + digiOffset] = digi;
527 }
528 }
529
530 // Channels in Digis are counted 0 to 2047 by default.
531 // Channel 0 -> 1023 is the front side of the module
532 // Channel 1024 -> 2047 is the back side of the module
533 // Cluster finder only operates on one side, so we have to shift the channel numbers
534 CBM_PARALLEL_FOR(schedule(static))
535 for (size_t i = pMdigiOffset[nModules]; i < pMdigiOffset[nModuleSides]; i++) {
536 auto& digi = pDigisFlat[i];
537 digi.SetChannel(digi.GetChannel() - nChannelsPerSide);
538 }
539}
540
// Flattens the per-module hit buckets into one contiguous pinned buffer and
// wraps it in a PartitionedSpan — body only; the signature
// (PartitionedSpan<sts::Hit> FlattenHits(xpu::queue), per the member index)
// was lost in this extraction.
542{
543 auto& hfc = fHitfinder;
544 xpu::h_view nHits(hfc.nHitsPerModule);
545
// GetNHits clamps each bucket count to the allocated capacity.
546 size_t nHitsTotal = 0;
547 for (int m = 0; m < hfc.nModules; m++)
548 nHitsTotal += GetNHits(nHits, m);
549 L_(debug) << "STS Hitfinder Chain: Flattening " << nHitsTotal << " hits";
550
551 if (nHitsTotal > hfc.hitsFlatCapacity)
552 throw ProcessingError("STS Hitfinder Chain: Not enough memory for hits: {} > {}", nHitsTotal, hfc.hitsFlatCapacity);
553
554 // hitBuffer is potentially bigger than nHitsTotal
555 // So construct a span to the memory region we actually use
556 xpu::h_view hitBuffer(hfc.hitsFlat);
557 gsl::span hits(hitBuffer.data(), nHitsTotal);
558
559 // Transfer Hits / Cluster back to host
560 // Hits and Clusters are stored in buckets. Buckets aren't necessarily full.
561 // So we have to copy each bucket individually.
562 // Alternative: flatten both arrays on the GPU and transfer flat array back.
563 // Latter would require a lot of memory but give us parallel copying when running on the host.
564 //
565 // Also at this point, values in nHits could be wrong if the bucket overflowed.
566 // So take extra care to not access nHits directly.
567 if (xpu::device::active().backend() == xpu::cpu) {
568 // Hits already in host memory
569 // Flatten in parallel
570 // Note: this is still pretty slow for mCBM because there are only 9 modules...
571 xpu::scoped_timer t_("Flatten Hits");
572 xpu::t_add_bytes(nHitsTotal * sizeof(sts::Hit));
// Each iteration recomputes its output offset so iterations stay independent.
573 CBM_PARALLEL_FOR(schedule(dynamic))
574 for (int m = 0; m < hfc.nModules; m++) {
575 size_t offset = 0;
576 for (int i = 0; i < m; i++) {
577 offset += GetNHits(nHits, i);
578 }
579 std::copy_n(hfc.hitsPerModule.get() + hfc.hitsAllocatedPerModule * m, GetNHits(nHits, m), hits.begin() + offset);
580 }
581 }
582 else { // GPU
583 // Initialize host memory:
584 // memset(hits.data(), 0, nHitsTotal * sizeof(sts::Hit));
585
586 // Hits not in host memory
587 // FIXME this is very slow (1.7 GBs vs 16 GBs we should get from PCIe 3.0 x16)
588 // I assume because of page faults as the allocated memory is not touched before
589 // But even with a memset on host memory before, throughput only goes up to 3.7 GBs
590 size_t nHitsCopied = 0;
591 for (int m = 0; m < hfc.nModules; m++) {
592 size_t numHitsInModule = GetNHits(nHits, m);
593 queue.copy(hfc.hitsPerModule.get() + hfc.hitsAllocatedPerModule * m, hits.data() + nHitsCopied, numHitsInModule);
594 nHitsCopied += numHitsInModule;
595 }
596 }
597
598
599 // Could do this more efficiently, cache addresses once in the beginning, ...
600 // doesn't really matter overhead wise
601 fHitOffsets.clear();
602 // nModules + 1 entries, first entry is always 0, last entry is total number of hits (required by PartitionedSpan)
603 fHitOffsets.push_back(0);
604 for (int m = 0; m < hfc.nModules; m++)
605 fHitOffsets.push_back(fHitOffsets.back() + GetNHits(nHits, m));
606
607 fAddresses = {};
608 for (auto& m : fPars->setup.modules)
609 fAddresses.push_back(m.address);
610
// The span references fHitfinder.hitsFlat; offsets/addresses live in members,
// so the returned view stays valid until the next timeslice is processed.
611 PartitionedSpan partitionedHits(hits, fHitOffsets, fAddresses);
612 return partitionedHits;
613}
614
// Copies the per-module-side cluster buckets into one flat vector and returns
// them as a PartitionedVector — body only; the signature
// (PartitionedVector<sts::Cluster> FlattenClusters(xpu::queue), per the member
// index) was lost in this extraction.
616{
617 auto& hfc = fHitfinder;
618
619 xpu::h_view nClusters(hfc.nClustersPerModule);
620 int maxClustersPerModule = hfc.maxClustersPerModule;
621 sts::Cluster* clusters = hfc.clusterDataPerModule.get();
622
623 int nModuleSides = nClusters.size();
624
625 L_(debug) << "Copy clusters from " << nModuleSides << " modules sides";
626
// One address entry per module side: front sides first, then back sides,
// mirroring the bucket layout.
627 std::vector<u32> addresses;
628 for (size_t i = 0; i < 2; i++) { // Repeat twice for front and back
629 for (auto& m : fPars->setup.modules)
630 addresses.push_back(m.address);
631 }
632
633 size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
634
635 // TODO: copy could be made much more efficient by using pinned memory
636 // But clusters are stored for debugging only, so no effort is made to optimize this
637 std::vector<sts::Cluster> clustersFlat;
638 clustersFlat.resize(nClustersTotal);
// Buckets have fixed stride maxClustersPerModule; copy only the filled part.
639 size_t offset = 0;
640 for (int m = 0; m < nModuleSides; m++) {
641 size_t nClustersInModule = nClusters[m];
642 queue.copy(clusters + maxClustersPerModule * m, clustersFlat.data() + offset, nClustersInModule);
643 offset += nClustersInModule;
644 }
645 queue.wait();
646
647 std::vector<size_t> sizes(nClusters.begin(), nClusters.end());
648 PartitionedVector<sts::Cluster> out(std::move(clustersFlat), gsl::make_span(sizes), gsl::make_span(addresses));
649 return out;
650}
651
652size_t sts::HitfinderChain::GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module)
653{
654 return std::min<size_t>(nHitsPerModule[module], fHitfinder.maxHitsPerModule);
655}
656
// Splits the per-sensor hit partitions into up to nstreamsMax roughly equal
// streams for parallel sorting — body only; the signature
// (PartitionedSpan<sts::Hit> SplitHitsIntoStreams(PartitionedSpan<sts::Hit>,
// int), per the member index) and original line 712, which constructs the
// returned `hitStreams` view, were lost in this extraction.
658{
659 constexpr size_t MinHitsPerStream = 10;
660
661 int nSensors = hits.NPartitions();
662 int streamsPerSensor = std::max(1, nstreamsMax / nSensors);
663
664 // TODO: move into helper function in base/ ?
// Rounds x up to the next multiple of mult (mult == 0 returns x unchanged).
665 auto nextMultipleOf = [](int x, int mult) {
666 if (mult == 0) {
667 return x;
668 }
669
670 int rem = x % mult;
671 if (rem == 0) {
672 return x;
673 }
674
675 return x + mult - rem;
676 };
677
678 fHitStreamOffsets.clear();
679 fStreamAddresses.clear();
680
681 fHitStreamOffsets.push_back(0);
682 for (size_t sensor = 0; sensor < hits.NPartitions(); sensor++) {
683 auto [hitsSensor, addr] = hits.Partition(sensor);
684 const size_t nHits = hitsSensor.size();
685
686 // Sensor has too few hits -> create only one stream for entire sensor
687 if (nHits < streamsPerSensor * MinHitsPerStream) {
688 fHitStreamOffsets.push_back(fHitStreamOffsets.back() + nHits);
689 fStreamAddresses.push_back(addr);
690 continue;
691 }
692
// Even split when possible; otherwise the last stream takes the remainder
// (std::min guards it against over-counting).
693 size_t rem = nHits % streamsPerSensor;
694 if (rem == 0) {
695 size_t streamSize = nHits / streamsPerSensor;
696 for (int s = 0; s < streamsPerSensor; s++) {
697 fHitStreamOffsets.push_back(fHitStreamOffsets.back() + streamSize);
698 fStreamAddresses.push_back(addr);
699 }
700 }
701 else {
702 size_t streamSize = nextMultipleOf(nHits, streamsPerSensor) / streamsPerSensor;
703 size_t nHitsLeft = nHits;
704 for (int s = 0; s < streamsPerSensor; s++) {
705 fHitStreamOffsets.push_back(fHitStreamOffsets.back() + std::min(nHitsLeft, streamSize));
706 fStreamAddresses.push_back(addr);
707 nHitsLeft -= streamSize;
708 }
709 }
710 }
711
// `hitStreams` is constructed on original line 712 (missing here), presumably
// a PartitionedSpan over the same hits with the new offsets/addresses.
713 return hitStreams;
714}
715
// Sorts each hit stream by time, streams processed in parallel — body only;
// the signature (void SortHitsWithinPartition(PartitionedSpan<sts::Hit>), per
// the member index) was lost in this extraction.
717{
718 CBM_PARALLEL_FOR(schedule(dynamic))
719 for (size_t i = 0; i < hits.NPartitions(); i++) {
720 auto stream = hits[i];
721 std::sort(stream.begin(), stream.end(), [](const sts::Hit& a, const sts::Hit& b) { return a.fTime < b.fTime; });
722 }
723}
724
// Debug check: verifies that the digi offset table matches the per-module
// counts (offset[m] must equal the running sum of all earlier buckets, and the
// last entry must equal the total) — body only; the signature (taking the
// DigiMap as parameter `digi`) was lost in this extraction.
726{
727 xpu::h_view digiOffset(fHitfinder.digiOffsetPerModule);
728
729 size_t nModules = fPars->setup.modules.size();
730 size_t offset = 0;
731 // Front
732 for (size_t m = 0; m < nModules * 2; m++) {
733 if (digiOffset[m] != offset) {
734 throw FatalError("Module {}: Digi offset mismatch: {} != {}", m, digiOffset[m], offset);
735 }
736 size_t nDigis = digi.nDigisPerModule[m];
737 offset += nDigis;
738 }
739
// Sentinel entry: total digi count must close the prefix sum.
740 if (offset != digiOffset[2 * nModules]) {
741 throw FatalError("Digi offset mismatch: {} != {}", digiOffset[2 * nModules], offset);
742 }
743}
744
// Debug check: verifies digis within each module bucket are sorted by channel,
// then (non-strictly) by time — body only; the signature was lost in this
// extraction. Throws FatalError if any module is out of order.
746{
747 xpu::h_view digiOffset(fHitfinder.digiOffsetPerModule);
748 xpu::h_view digis(fHitfinder.digisPerModule);
749
750 bool isSorted = true;
751
// NOTE(review): only the first nModules buckets (front sides) are checked
// here, unlike EnsureDigiOffsets which walks all 2*nModules sides — confirm
// whether that is intentional.
752 for (size_t m = 0; m < fPars->setup.modules.size(); m++) {
753 int nDigis = digiOffset[m + 1] - digiOffset[m];
754 if (nDigis == 0) continue;
755
756 auto* digisModule = &digis[digiOffset[m]];
757
758 for (int i = 0; i < nDigis - 1; i++) {
759 const auto &digi1 = digisModule[i], &digi2 = digisModule[i + 1];
760
761 if ((digi1.GetChannel() < digi2.GetChannel())
762 || (digi1.GetChannel() == digi2.GetChannel()
763 && digi1.GetTime() <= digi2.GetTime()) // Unpacker sometimes produces multiple
764 // digis with the same time and channel, FIXME!
765 ) {
766 continue;
767 }
768
// Report only the first violation per module, then move on.
769 isSorted = false;
770 L_(error) << "Module " << m << " not sorted: " << digi1.ToString() << " " << digi2.ToString();
771 break;
772 }
773 }
774
775 if (!isSorted) {
776 throw FatalError("Digis are not sorted");
777 }
778}
779
780void sts::HitfinderChain::EnsureChannelOffsets(gsl::span<u32> channelOffsetsByModule)
781{
782 xpu::h_view digisPerModule(fHitfinder.digisPerModule);
783 xpu::h_view digiOffsetPerModule(fHitfinder.digiOffsetPerModule);
784 for (size_t m = 0; m < fPars->setup.modules.size() * 2; m++) {
785 int nChannels = fPars->setup.nChannels / 2; // Consider module sides
786
787 std::vector<u32> expectedChannelOffsets(nChannels, 0);
788
789 int offset = digiOffsetPerModule[m];
790 int nDigis = digiOffsetPerModule[m + 1] - offset;
791 gsl::span<CbmStsDigi> digis(&digisPerModule[offset], nDigis);
792 gsl::span<u32> channelOffsets = channelOffsetsByModule.subspan(m * nChannels, nChannels);
793
794 if (nDigis == 0) continue;
795
796 if (channelOffsets[0] != 0) {
797 throw FatalError("Module {}: First channel offset is not 0", m);
798 }
799
800 int chan = digis[0].GetChannel();
801 for (int i = 0; i < nDigis; i++) {
802 if (digis[i].GetChannel() != chan) {
803 while (chan < digis[i].GetChannel()) {
804 chan++;
805 expectedChannelOffsets[chan] = i;
806 }
807 }
808 }
809 while (chan < nChannels) {
810 chan++;
811 expectedChannelOffsets[chan] = nDigis;
812 }
813
814 for (int i = 0; i < nChannels; i++) {
815 if (channelOffsets[i] != expectedChannelOffsets[i]) {
816 throw FatalError("Module {}: Channel offset for channel {} is {} but should be {}", m, i, channelOffsets[i],
817 expectedChannelOffsets[i]);
818 }
819 }
820 }
821}
822
823void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx,
824 gsl::span<PaddedToCacheLine<int>> hNClusters)
825{
826 for (size_t m = 0; m < 2 * fPars->setup.modules.size(); m++) {
827 int nClusters = hNClusters[m];
828
829 L_(trace) << "Module " << m << " has " << nClusters << " clusters of " << fHitfinder.maxClustersPerModule;
830
831 if (nClusters == 0) continue;
832
833 if (nClusters < 0) {
834 throw FatalError("Module {} has negative number of clusters {}", m, nClusters);
835 }
836 if (size_t(nClusters) > fHitfinder.maxClustersPerModule) {
837 throw FatalError("Module {} has {} clusters, but only {} are allowed", m, nClusters,
838 fHitfinder.maxClustersPerModule);
839 }
840
841 auto* clusterIdx = &hClusterIdx[m * fHitfinder.maxClustersPerModule];
842
843 for (int i = 0; i < nClusters; i++) {
844 auto& cidx = clusterIdx[i];
845
846 if (int(cidx.fIdx) < 0 || size_t(cidx.fIdx) >= fHitfinder.maxClustersPerModule) {
847 throw FatalError("Cluster {} of module {} has invalid index {}", i, m, cidx.fIdx);
848 }
849
850 if (cidx.fTime == 0xFFFFFFFF) {
851 throw FatalError("Cluster {} of module has invalid time {}", i, m, cidx.fTime);
852 }
853 }
854 }
855
856 L_(trace) << "Clusters ok";
857}
858
// Debug check: verifies every hit stream is sorted by time — body only; the
// signature (void EnsureHitsSorted(PartitionedSpan<sts::Hit>), per the member
// index) was lost in this extraction.
860{
861 L_(trace) << "Ensure STS Hits sorted";
862
863 for (size_t i = 0; i < hits.NPartitions(); i++) {
864 auto [part, addr] = hits.Partition(i);
865 bool ok = std::is_sorted(part.begin(), part.end(), [](const Hit& a, const Hit& b) { return a.fTime < b.fTime; });
866 L_(trace) << "Stream " << i << ": addr=" << addr << ", nhits=" << part.size();
867 if (!ok) {
// NOTE(review): printf-style "%zd"/"%u" placeholders here, while every other
// FatalError in this file uses fmt-style "{}" — the message likely renders
// without the arguments substituted; confirm FatalError's formatting contract.
868 throw FatalError{"Hits not sorted in stream %zd (addr %u)", i, addr};
869 }
870 }
871
872 L_(trace) << "STS Hits sorted!";
873}
#define L_(level)
static vector< vector< QAHit > > hits
fscal v[fmask::Size]
Definition KfSimdPseudo.h:4
#define CBM_PARALLEL_FOR(...)
Definition OpenMP.h:25
#define CBM_OMP(...)
Definition OpenMP.h:39
#define CBM_PARALLEL(...)
Definition OpenMP.h:32
Data class for a single-channel message in the STS.
Definition CbmStsDigi.h:40
A vector that is partitioned into multiple subvectors.
const RecoParams & Params() const
Definition SubChain.h:21
const Options & Opts() const
Definition SubChain.h:20
void EnsureHitsSorted(PartitionedSpan< sts::Hit >)
PartitionedSpan< sts::Hit > SplitHitsIntoStreams(PartitionedSpan< sts::Hit > hits, int nstreamsMax)
PartitionedVector< sts::Cluster > FlattenClusters(xpu::queue)
void SortHitsWithinPartition(PartitionedSpan< sts::Hit > hits)
void EnsureClustersSane(gsl::span< ClusterIdx >, gsl::span< PaddedToCacheLine< int > >)
void EnsureChannelOffsets(gsl::span< u32 >)
std::vector< size_t > fHitStreamOffsets
size_t GetNHits(xpu::h_view< PaddedToCacheLine< int > > nHitsPerModule, int module)
Returns the number of hits of a module.
std::vector< size_t > fHitOffsets
PartitionedSpan< sts::Hit > FlattenHits(xpu::queue)
std::optional< sts::HitfinderChainPars > fPars
void FlattenDigis(gsl::span< const CbmStsDigi > digis, DigiMap &digiMap)
DigiMap CountDigisPerModules(gsl::span< const CbmStsDigi > digis)
Result operator()(gsl::span< const CbmStsDigi >, bool storeClusters=false)
std::vector< u32 > fStreamAddresses
void SetParameters(const HitfinderChainPars &parameters)
void AllocateDynamic(size_t, size_t)
static constexpr u16 InvalidModule
XPU_D int32_t PackDigiAddress(int32_t address)
Strip address to contain only unit, (half)ladder and module.
int GetMaxThreads()
Definition OpenMP.h:46
int GetThreadNum()
Definition OpenMP.h:47
uint16_t u16
Definition Definitions.h:19
PaddedValue< T, SizeOfCacheLine > PaddedToCacheLine
Definition PaddedValue.h:80
int32_t i32
Definition Definitions.h:20
@ kFindHitsChunksPerModule
Definition Hitfinder.h:40
std::string_view ToString(T t)
Definition CbmEnumDict.h:64
Indicates an unrecoverable error. Should tear down the process.
Definition Exceptions.h:34
u16 ModuleIndex(const CbmStsDigi &digi) const
std::vector< std::vector< size_t > > nDigisPerThread
PartitionedVector< sts::Cluster > clusters
PartitionedSpan< sts::Hit > hits
void SetDeviceMon(const HitfinderMonDevice &devMon)