Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 12 additions & 17 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

#include <atomic>
#include <ctime>
#include <string>

#ifndef _WIN32
#include <unistd.h>
Expand Down Expand Up @@ -230,6 +231,7 @@ int32_t GPUReconstructionCPU::RunChains()
GPUInfo("Allocated memory when starting processing %34s", "");
PrintMemoryOverview();
}

mTimerTotal.Start();
const std::clock_t cpuTimerStart = std::clock();
int32_t retVal = 0;
Expand Down Expand Up @@ -264,7 +266,10 @@ int32_t GPUReconstructionCPU::RunChains()
double kernelTotal = 0;
std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);

debugWriter writer(GetProcessingSettings().debugCSV, GetProcessingSettings().debugMarkdown, mStatNEvents);

if (GetProcessingSettings().debugLevel >= 1) {
writer.header();
for (uint32_t i = 0; i < mTimers.size(); i++) {
double time = 0;
if (mTimers[i] == nullptr) {
Expand All @@ -284,11 +289,7 @@ int32_t GPUReconstructionCPU::RunChains()
int32_t stepNum = getRecoStepNum(mTimers[i]->step);
kernelStepTimes[stepNum] += time;
}
char bandwidth[256] = "";
if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
}
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
writer.row('K', mTimers[i]->count, mTimers[i]->name.c_str(), time, -1.0, -1.0, mTimers[i]->memSize);
if (GetProcessingSettings().resetTimers) {
mTimers[i]->count = 0;
mTimers[i]->memSize = 0;
Expand All @@ -298,16 +299,13 @@ int32_t GPUReconstructionCPU::RunChains()
if (GetProcessingSettings().recoTaskTiming) {
for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
writer.row(' ', 0, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)", kernelStepTimes[i], mTimersRecoSteps[i].timerCPU, mTimersRecoSteps[i].timerTotal.GetElapsedTime(), 0);
}
if (mTimersRecoSteps[i].bytesToGPU) {
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU);
writer.row('D', mTimersRecoSteps[i].countToGPU, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)", mTimersRecoSteps[i].timerToGPU.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToGPU);
}
if (mTimersRecoSteps[i].bytesToHost) {
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
writer.row('D', mTimersRecoSteps[i].countToHost, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)", mTimersRecoSteps[i].timerToHost.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToHost);
}
if (GetProcessingSettings().resetTimers) {
mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
Expand All @@ -321,14 +319,11 @@ int32_t GPUReconstructionCPU::RunChains()
}
for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
writer.row(' ', 0, gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime(), -1.0, -1.0, 0);
}
}
if (GetProcessingSettings().debugLevel >= 1) {
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
}
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
double gpu_time = GetProcessingSettings().debugLevel >= 1 ? kernelTotal : -1.0;
writer.row(' ', 0, "Wall", gpu_time, mStatCPUTime, mTimerTotal.GetElapsedTime(), 0, nEventReport);
} else if (GetProcessingSettings().debugLevel >= 0) {
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
}
Expand Down
13 changes: 13 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#define GPURECONSTRUCTIONICPU_H

#include "GPUReconstructionProcessing.h"
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

namespace Ort
Expand Down Expand Up @@ -100,6 +102,17 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
size_t TransferMemoryResourcesHelper(GPUProcessor* proc, int32_t stream, bool all, bool toGPU);
template <class S, int32_t I = 0, typename... Args>
void runKernelInterface(krnlSetup&& setup, Args const&... args);

struct debugWriter {
debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents);
void header();
void row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport = "");

private:
std::ofstream streamCSV;
bool mMarkdown;
uint32_t mStatNEvents;
};
};

} // namespace o2::gpu
Expand Down
107 changes: 107 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstructionDebug.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
/// \author David Rohr

#include "GPUReconstruction.h"
#include "GPUReconstructionCPU.h"
#include "GPULogging.h"
#include "GPUSettings.h"

Expand All @@ -23,6 +24,8 @@
#include <filesystem>
#include <chrono>
#include <format>
#include <iostream>
#include <string>

using namespace o2::gpu;

Expand Down Expand Up @@ -186,3 +189,107 @@ bool GPUReconstruction::triggerDebugDump()
}
return false;
}

GPUReconstructionCPU::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents}
{
if (!filenameCSV.empty()) {
streamCSV.open(filenameCSV, std::ios::out | std::ios::app);
}
}

void GPUReconstructionCPU::debugWriter::header()
{
if (streamCSV.is_open() && !streamCSV.tellp()) {
streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n";
}

if (mMarkdown) {
std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n";
std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n";
}
}

void GPUReconstructionCPU::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport)
{
double scale = 1000000.0 / mStatNEvents;

if (streamCSV.is_open()) {
streamCSV << type << ",";
if (count != 0)
streamCSV << count;
streamCSV << "," << name << ",";
if (gpu_time != -1.0)
streamCSV << std::format("{:.0f}", gpu_time * scale);
streamCSV << ",";
if (cpu_time != -1.0)
streamCSV << std::format("{:.0f}", cpu_time * scale);
streamCSV << ",";
if (cpu_time != -1.0 && total_time != -1.0)
streamCSV << std::format("{:.2f}", cpu_time / total_time);
streamCSV << ",";
if (total_time != -1.0)
streamCSV << std::format("{:.0f}", total_time * scale);
streamCSV << ",";
if (memSize != 0 && count != 0)
streamCSV << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
else
streamCSV << ",,";
streamCSV << std::endl;
}

if (mMarkdown) {
std::cout << "| " << type << " | ";
if (count != 0)
std::cout << std::format("{:6} |", count);
else
std::cout << " |";
std::cout << std::format(" {:42}|", name);
if (gpu_time != -1.0)
std::cout << std::format("{:10.0f} |", gpu_time * scale);
else
std::cout << " |";
if (cpu_time != -1.0)
std::cout << std::format("{:10.0f} |", cpu_time * scale);
else
std::cout << " |";
if (cpu_time != -1.0 && total_time != -1.0)
std::cout << std::format("{:8.2f} |", cpu_time / total_time);
else
std::cout << " |";
if (total_time != -1.0)
std::cout << std::format("{:10.0f} |", total_time * scale);
else
std::cout << " |";
if (memSize != 0 && count != 0)
std::cout << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
else
std::cout << " | | |";
std::cout << std::endl;
} else {
if (name.substr(0, 3) == "GPU") {
char bandwidth[256] = "";
if (memSize && mStatNEvents && gpu_time != 0.0) {
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
}
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidth);
} else if (name.substr(0, 3) == "TPC") {
std::size_t n = name.find('(');
std::string basename = name.substr(0, n - 1);
std::string postfix = name.substr(n + 1, name.size() - n - 2);
if (total_time != -1.0) {
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(),
basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time);
} else {
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale,
memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count);
}
} else if (name == "Prepare") {
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale);
} else if (name == "Wall") {
if (gpu_time != -1.0) {
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str());
}
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str());
}
}
}
2 changes: 2 additions & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file")
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
AddOption(debugCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.")
AddOption(debugMarkdown, bool, false, "", 0, "Print the results of standlaone benchmarks in markdown format")
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1))
Expand Down
8 changes: 6 additions & 2 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,9 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
if (configStandalone.runs > 1) {
printf("Run %d (thread %d)\n", iteration + 1, threadId);
}
recUse->SetResetTimers(iRun < configStandalone.runsInit);
if (configStandalone.runsInit > 0 && configStandalone.proc.debugCSV.empty()) {
recUse->SetResetTimers(iRun < configStandalone.runsInit);
}
if (configStandalone.outputcontrolmem) {
recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem);
}
Expand Down Expand Up @@ -685,7 +687,9 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
}
chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
if (configStandalone.runsInit > 0 && configStandalone.proc.debugCSV.empty()) {
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
}
tmpRetVal = recAsync->RunChains();
if (tmpRetVal == 0 || tmpRetVal == 2) {
OutputStat(chainTrackingAsync, nullptr, nullptr);
Expand Down
Loading