Hi,
I am trying to sort a vector on the GPU (GeForce GT 750M) using the example code below, which crashes with a SEGV.
However, it runs fine on Iris Pro (Intel(R) Core(TM) i7-4850HQ CPU).
We observe the SEGV when the size is 10 million.
When the size is 5 million, the following exception is thrown instead:
boost/1_61_0/include/boost/compute/command_queue.hpp(453): Throw in function
boost::compute::event boost::compute::command_queue::enqueue_write_buffer(const
boost::compute::buffer &, size_t, size_t, const void *, const boost::compute::wait_list &)
Dynamic exception type:
boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<boost::compute::opencl_error> >
std::exception::what: Invalid Value
Another observation: if the size of the vector is 50 million, the sort works fine,
though the timings are worse than on the Iris Pro.
Also, when the size is 100 million, the binary causes the OS to crash.
Compiler details:
clang++ --version
Apple LLVM version 7.3.0 (clang-703.0.31)
Target: x86_64-apple-darwin15.5.0
Thread model: posix
OS:
System Version: OS X 10.11.5 (15F34)
Kernel Version: Darwin 15.5.0
Regards,
Prashant
----------------------------Cut here-------------------------------------
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/sort.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/context.hpp>
#include <boost/compute/device.hpp>
#include <boost/compute/platform.hpp>
#include <boost/compute/system.hpp>
#include <boost/exception/diagnostic_information.hpp>
#include <boost/exception/exception.hpp>
namespace compute = boost::compute;
int main(int argc, char* argv[])
{
if (argc != 2) {
std::cout << "Usage: " << argv[0] << " <size> " << std::endl;
return 0;
}
// generate random data on the host
std::vector<float> host_vector(atoi(argv[1]));
std::generate(host_vector.begin(), host_vector.end(), rand);
std::cout <<
"===============CPU==================\n";
for (size_t k=0; k<5; k++)
{
std::vector<float> host_copy_vector(host_vector);
auto start = std::chrono::high_resolution_clock::now();
std::sort(host_copy_vector.begin(), host_copy_vector.end());
auto duration = std::chrono::duration_caststd::chrono::milliseconds
(std::chrono::high_resolution_clock::now() - start);
std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" <<
std::endl;
}
std::vectorcompute::platform platforms = compute::system::platforms();
for(size_t i = 0; i < platforms.size(); i++){
const compute::platform &platform = platforms[i];
std::cout << "Platform '" << platform.name() << "'" << std::endl;
std::vectorcompute::device devices = platform.devices();
for(size_t j = 0; j < devices.size(); j++){
const compute::device &device = devices[j];
std::string type;
if(device.type() & compute::device::gpu)
type = "GPU Device";
else if(device.type() & compute::device::cpu)
type = "CPU Device";
else if(device.type() & compute::device::accelerator)
type = "Accelerator Device";
else
type = "Unknown Device";
if (type != "GPU Device") {
std::cout << "Ignoring non GPU devices.\n";
continue;
}
std::cout <<
"====\n";
std::cout << " " << type << ": " << device.name() << std::endl;
std::cout <<
"====\n";
compute::context context(device);
compute::command_queue queue(context, device);
for (size_t k=0; k<5; k++)
{
compute::vector<float> device_vector(host_vector.size(), context);
// copy data from the host to the device
compute::copy(
host_vector.begin(), host_vector.end(), device_vector.begin(), queue
);
auto start = std::chrono::high_resolution_clock::now();
try {
compute::sort(device_vector.begin(), device_vector.end(), queue);
} catch (boost::exception & e) {
std::cerr << diagnostic_information(e);
break;
}
auto duration = std::chrono::duration_caststd::chrono::milliseconds
(std::chrono::high_resolution_clock::now() - start);
std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" <<
std::endl;
}
std::cout <<
"====\n";
}
}
return 0;
}