CUDA'da 3 diziyi anahtara göre sıralama (belki Thrust kullanarak)

Aynı boyutta (300.000'den fazla öğe içeren) 3 dizim var: bir float sayı dizisi ve iki indeks dizisi. Yani her bir sayı için 2 kimlik (ID) var.

3 dizinin tümü zaten GPU global belleğinde. Tüm sayıları, kendilerine ait kimliklerle birlikte sıralamak istiyorum.

Bu görevi yapmak için Thrust kütüphanesini kullanmanın bir yolu var mı? Thrust kütüphanesinden daha iyi bir yol var mı? Tabii ki, dizileri ana (host) belleğe birkaç kez kopyalayıp geri almak istemiyorum. Bu arada, bunlar vektör değil, ham diziler.

Yardımlarınız için şimdiden teşekkür ederiz.

Güncelleme: Çalışan bir çözümüm var, ancak son derece yavaş. Neredeyse 4 saniye sürüyor ve dizi boyutum 300.000 mertebesinde.

// Wrap the raw device pointers so Thrust algorithms can operate on them.
thrust::device_ptr<float> keys(afterSum);
thrust::device_ptr<int> vals0(d_index);
thrust::device_ptr<int> vals1(blockId);

// Permutation vector, built directly as the identity [0, 1, ..., numElements - 1].
thrust::counting_iterator<int> first(0);
thrust::device_vector<int> indices(first, first + numElements);

// Sort the keys in place; `indices` is permuted alongside them and ends up
// holding, for every sorted position, the original position of that key.
thrust::sort_by_key(keys, keys + numElements, indices.begin());

// Reorder both ID arrays through the permutation into fresh output buffers.
thrust::device_vector<int> sortedIndex(numElements);
thrust::gather(indices.begin(), indices.end(), vals0, sortedIndex.begin());

thrust::device_vector<int> sortedBlockId(numElements);
thrust::gather(indices.begin(), indices.end(), vals1, sortedBlockId.begin());

// Bring the reordered IDs back to the host.
thrust::host_vector<int> h_sortedIndex(sortedIndex);
thrust::host_vector<int> h_sortedBlockId(sortedBlockId);

kaynak

2011-07-07 Kiarash

Tabii ki Thrust kullanabilirsiniz. İlk olarak, ham CUDA cihaz işaretçilerinizi thrust::device_ptr ile sarmanız gerekir. float değerlerinizin pkeys dizisinde, kimliklerin pvals0 ve pvals1 dizilerinde olduğunu ve numElements'in dizilerin uzunluğu olduğunu varsayarsak, şöyle bir şey çalışmalıdır:

#include <cstdio>

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>

// Sorts `numElements` float keys (raw device pointer `pkeys`) and reorders the
// two ID arrays (`pvals0`, `pvals1`) to match, timing the whole sequence with
// CUDA events. The keys are sorted in place; the reordered IDs are written to
// sortedVals0 / sortedVals1 and the original ID arrays are left untouched.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);

// Wrap the raw device pointers. A device_ptr behaves like a pointer, not like
// a container: it has no begin()/end(), so ranges are written [p, p + n).
thrust::device_ptr<float> keys(pkeys);
thrust::device_ptr<int> vals0(pvals0);
thrust::device_ptr<int> vals1(pvals1);

// allocate space for the output
thrust::device_vector<int> sortedVals0(numElements);
thrust::device_vector<int> sortedVals1(numElements);

// initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(numElements);
thrust::copy(iter, iter + indices.size(), indices.begin());

// first sort the keys and indices by the keys
thrust::sort_by_key(keys, keys + numElements, indices.begin());

// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), vals0, sortedVals0.begin());
thrust::gather(indices.begin(), indices.end(), vals1, sortedVals1.begin());

cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Took %f milliseconds for %d elements\n", milliseconds, numElements);

// Release the timing events once the measurement has been read.
cudaEventDestroy(start);
cudaEventDestroy(stop);

kaynak

2011-07-08 00:43:24 harrism

Teşekkürler harrism. Neredeyse aynı kodu kullandım; yalnızca pkeys, pvals ve numElements yerine kendi değişkenlerimi koydum. Çok fazla hata alıyorum; bunları soru bölümüne ekledim. Nedenini anlamaya çalışıyorum. – Kiarash

Sorunu nasıl çözeceğimi buldum ama şimdi çok yavaş. Bununla ilgili ne yapabilirim? – Kiarash

Çalışma kodunu soru bölümüne koydum! – Kiarash

Ben olsam, her iki indis vektörü üzerinde tek bir sort_by_key gerçekleştirmek için zip_iterator kullanırdım.

Bu şuna benzer:

// Iterator types that walk both ID vectors in lock-step.
// (No `typename` here: these typedefs are not inside a template, and the
// keyword is rejected outside templates before C++11.)
typedef thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> IteratorTuple;
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;

// here I suppose your 3 arrays are pointed to by device_ptr as suggested by @harrism
// NOTE: these constructors copy the data into new device vectors, so the
// sorted result ends up in key / val0 / val1, not in the original arrays.
thrust::device_vector<float> key(pKey, pKey + numElements);
thrust::device_vector<int> val0(pVal0, pVal0 + numElements);
thrust::device_vector<int> val1(pVal1, pVal1 + numElements);

// A single sort_by_key permutes both value vectors together with the keys.
ZipIterator iterBegin(thrust::make_tuple(val0.begin(), val1.begin()));
thrust::sort_by_key(key.begin(), key.end(), iterBegin);

kaynak

2015-08-10 12:23:32 Tikoloche

Yukarıda önerilen iki yaklaşımı, yani thrust::zip_iterator kullanan yaklaşım ile thrust::gather kullanan yaklaşımı karşılaştırdım. Soruyu soranın istediği gibi, anahtara göre iki diziyi ve üç diziyi sıralama durumlarında test ettim. Her iki durumda da thrust::gather kullanan yaklaşımın daha hızlı olduğu görüldü.

N = 1048576

2 dizi durumu için kod:

#include <stdio.h>   // --- printf
#include <stdlib.h>  // --- srand, rand
#include <time.h>    // --- time

// Forward slashes keep the include paths portable; backslashes only resolve
// on Windows toolchains.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include "TimingGPU.cuh"

//#define VERBOSE
//#define COMPACT

// Benchmark: sort an integer key array and reorder two double arrays to match.
//   COMPACT defined   -> one sort_by_key over a zip_iterator of (x, y)
//   COMPACT undefined -> sort (key, index) pairs, then gather x and y
// Define VERBOSE to print the data before and after sorting.
int main() {

    const int N = 1048576;
    //const int N = 10;

    TimingGPU timerGPU;

    // --- Initialize random seed
    srand((unsigned int)time(NULL));

    thrust::host_vector<int> h_code(N);
    thrust::host_vector<double> h_x(N);
    thrust::host_vector<double> h_y(N);

    for (int k = 0; k < N; k++) {
        // --- Random integer key between 1 and 10, random values in [0, 1]
        h_code[k] = rand() % 10 + 1;
        h_x[k] = ((double)rand()/(RAND_MAX));
        h_y[k] = ((double)rand()/(RAND_MAX));
    }

    thrust::device_vector<int> d_code(h_code);

    thrust::device_vector<double> d_x(h_x);
    thrust::device_vector<double> d_y(h_y);

#ifdef VERBOSE
    printf("Before\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif

    timerGPU.StartCounter();
#ifdef COMPACT
    thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin())));
#else

    // --- Initialize indices vector to [0,1,2,..]
    thrust::counting_iterator<int> iter(0);
    thrust::device_vector<int> indices(N);
    thrust::copy(iter, iter + indices.size(), indices.begin());

    // --- First, sort the keys and indices by the keys
    thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());

    // --- Now reorder the value arrays using the sorted indices.
    // thrust::gather requires that its input does not overlap its output, so
    // gathering d_x into d_x is invalid: a thread may read an element another
    // thread has already overwritten. Gather into a scratch buffer and swap.
    // After the first swap the scratch holds the old d_x storage, so a single
    // allocation serves both arrays.
    thrust::device_vector<double> scratch(N);

    thrust::gather(indices.begin(), indices.end(), d_x.begin(), scratch.begin());
    d_x.swap(scratch);

    thrust::gather(indices.begin(), indices.end(), d_y.begin(), scratch.begin());
    d_y.swap(scratch);
#endif

    printf("Timing GPU = %f\n", timerGPU.GetCounter());

#ifdef VERBOSE
    h_code = d_code;
    h_x = d_x;
    h_y = d_y;

    printf("After\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif

    return 0;
}

3 dizi durumu için kod:

#include <stdio.h>   // --- printf
#include <stdlib.h>  // --- srand, rand
#include <time.h>    // --- time

// Forward slashes keep the include paths portable; backslashes only resolve
// on Windows toolchains.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include "TimingGPU.cuh"

//#define VERBOSE
//#define COMPACT

// Benchmark: sort an integer key array and reorder three double arrays to match.
//   COMPACT defined   -> one sort_by_key over a zip_iterator of (x, y, z)
//   COMPACT undefined -> sort (key, index) pairs, then gather x, y and z
// Define VERBOSE to print the data before and after sorting.
int main() {

    const int N = 1048576;
    //const int N = 10;

    TimingGPU timerGPU;

    // --- Initialize random seed
    srand((unsigned int)time(NULL));

    thrust::host_vector<int> h_code(N);
    thrust::host_vector<double> h_x(N);
    thrust::host_vector<double> h_y(N);
    thrust::host_vector<double> h_z(N);

    for (int k = 0; k < N; k++) {
        // --- Random integer key between 1 and 10, random values in [0, 1]
        h_code[k] = rand() % 10 + 1;
        h_x[k] = ((double)rand()/(RAND_MAX));
        h_y[k] = ((double)rand()/(RAND_MAX));
        h_z[k] = ((double)rand()/(RAND_MAX));
    }

    thrust::device_vector<int> d_code(h_code);

    thrust::device_vector<double> d_x(h_x);
    thrust::device_vector<double> d_y(h_y);
    thrust::device_vector<double> d_z(h_z);

#ifdef VERBOSE
    printf("Before\n");
    // All three value arrays are printed so the z column can be checked too.
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f; z = %f\n", h_code[k], h_x[k], h_y[k], h_z[k]);
#endif

    timerGPU.StartCounter();
#ifdef COMPACT
    thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin(), d_z.begin())));
#else

    // --- Initialize indices vector to [0,1,2,..]
    thrust::counting_iterator<int> iter(0);
    thrust::device_vector<int> indices(N);
    thrust::copy(iter, iter + indices.size(), indices.begin());

    // --- First, sort the keys and indices by the keys
    thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());

    // --- Now reorder the value arrays using the sorted indices.
    // thrust::gather requires that its input does not overlap its output, so
    // gathering an array into itself is invalid: a thread may read an element
    // another thread has already overwritten. Gather into a scratch buffer and
    // swap. After each swap the scratch holds the previous array's storage, so
    // a single allocation serves all three arrays.
    thrust::device_vector<double> scratch(N);

    thrust::gather(indices.begin(), indices.end(), d_x.begin(), scratch.begin());
    d_x.swap(scratch);

    thrust::gather(indices.begin(), indices.end(), d_y.begin(), scratch.begin());
    d_y.swap(scratch);

    thrust::gather(indices.begin(), indices.end(), d_z.begin(), scratch.begin());
    d_z.swap(scratch);
#endif

    printf("Timing GPU = %f\n", timerGPU.GetCounter());

#ifdef VERBOSE
    h_code = d_code;
    h_x = d_x;
    h_y = d_y;
    h_z = d_z;

    printf("After\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f; z = %f\n", h_code[k], h_x[k], h_y[k], h_z[k]);
#endif

    return 0;
}

3 dizi durumunda zamanlama (N = 1048576; testler NVIDIA GTX 960 kartında gerçekleştirilmiştir):

zip_iterator = 9.64ms 
gather  = 4.22ms

2 dizi durumunda zamanlama (N = 1048576):

zip_iterator = 7.34ms 
gather  = 4.27ms

Zamanlama.

kaynak

2017-02-27 11:30:49 JackOLantern

CUDA anahtarında 3 dizileri sıralama (belki de Thrust kullanarak)

cevap

İlgili konular