// Allocate the matrix and the two vectors through the project-specific
// cudaMallocGPS allocator.
// NOTE(review): cudaMallocGPS is not defined in this view; presumably it
// mirrors cudaMalloc(ptr, bytes) — confirm. mat_dim_size is used below as a
// byte count (it is the size passed to cudaMemset), so mat is requested as
// mat_dim chunks of mat_dim_size bytes each.
cudaMallocGPS(&mat, mat_dim * mat_dim_size);
cudaMallocGPS(&vec1, mat_dim_size);
cudaMallocGPS(&vec2, mat_dim_size);

// Zero vec2 before the first launch writes into it.
// NOTE(review): vec1 is not initialised in this fragment — presumably it is
// filled elsewhere before the first launch reads it; verify.
cudaMemset(vec2, 0, mat_dim_size);

for (int iter = 0; iter < MAX_ITER; ++iter) {
    // Tracking is enabled around the first iteration only.
    // NOTE(review): cuGPSTrackingStart/Stop are opaque here; presumably they
    // bracket a profiling pass over the access pattern — confirm.
    if (iter == 0) {
        cuGPSTrackingStart();
    }

    for (int device = 0; device < num_devices; ++device) {
        cudaSetDevice(device);

        // Execution configuration is <<<grid, block, dynamicSmemBytes, stream>>>.
        // The stream must be the FOURTH argument; the third is the dynamic
        // shared-memory size, so an explicit 0 is passed for it.
        // The two launches ping-pong the vectors: vec1 -> vec2, then
        // vec2 -> vec1; issuing both on the same stream orders them.
        mvmul<<<num_blocks, num_threads, 0, stream[device]>>>(mat, vec1,
                                                              vec2 /* ... */);
        mvmul<<<num_blocks, num_threads, 0, stream[device]>>>(mat, vec2,
                                                              vec1 /* ... */);
    }

    // NOTE(review): kernel launches are asynchronous, so this stop call may
    // be reached before the launched kernels have finished; whether
    // cuGPSTrackingStop synchronises internally is not visible here — confirm.
    if (iter == 0) {
        cuGPSTrackingStop();
    }
}