From 66d1ccda44ea7c5b7766b6b8826180e3702456b5 Mon Sep 17 00:00:00 2001 From: Soroush Safari Loaliyan Date: Wed, 24 Jun 2026 10:24:59 -0700 Subject: [PATCH] Initialize padded sort buffers before CUB sort --- cujson/parse_json_lines.cu | 25 ++++++++++++++++++++----- cujson/parse_standard_json.cu | 23 +++++++++++++++++++---- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/cujson/parse_json_lines.cu b/cujson/parse_json_lines.cu index 515a127..f9a1001 100644 --- a/cujson/parse_json_lines.cu +++ b/cujson/parse_json_lines.cu @@ -945,9 +945,22 @@ inline uint8_t * stage2_tokenizer( uint8_t* block_GPU, uint8_t* out_string_open_close_8_GPU; uint32_t* out_string_open_close_8_index_GPU; // it's going to store structural index, not real index - cudaMallocAsync(&out_string_open_close_8_GPU, (last_index_tokens_open_close + padding2) * sizeof(uint8_t),0); - cudaMallocAsync(&out_string_open_close_8_index_GPU, last_index_tokens_open_close * sizeof(uint32_t),0); - + // Fixed: Initialize it to 0 before using it in the kernel to avoid undefined behavior. + cudaMallocAsync(&out_string_open_close_8_GPU, + (last_index_tokens_open_close + padding2) * sizeof(uint8_t), + 0); + cudaMemsetAsync(out_string_open_close_8_GPU, + 0, + (last_index_tokens_open_close + padding2) * sizeof(uint8_t), + 0); + + cudaMallocAsync(&out_string_open_close_8_index_GPU, + last_index_tokens_open_close * sizeof(uint32_t), + 0); + cudaMemsetAsync(out_string_open_close_8_index_GPU, + 0, + last_index_tokens_open_close * sizeof(uint32_t), + 0); // extractStructuralIdx(): extractStructuralIdx() + extractOpenCloseIdx() extractStructuralIdx<<>>(acc_structural_cnt, // prefix sum set bits until each word of structural acc_open_close_cnt, // prefix sum set bits until each word of open close @@ -1061,8 +1074,10 @@ int32_t* stage3_parser(uint8_t* open_close_bitmap, int32_t** open_close_index_d, // _______________STEP_1__(a)_________________ int32_t* res; // temporary result that will use in following uint32_t* oc_1; // output - cudaMallocAsync(&oc_1, oc_cnt_32*sizeof(uint32_t), 0); - + // Fixed: Allocate memory for oc_1 and initialize it to zero + cudaMallocAsync(&oc_1, oc_cnt_32 * sizeof(uint32_t), 0); + cudaMemsetAsync(oc_1, 0, oc_cnt_32 * sizeof(uint32_t), 0); + map_open_close<<>>( (uint32_t*) open_close_bitmap, oc_1, oc_cnt_32, oc_cnt); cudaStreamSynchronize(0); diff --git a/cujson/parse_standard_json.cu b/cujson/parse_standard_json.cu index 4127927..db2ea83 100644 --- a/cujson/parse_standard_json.cu +++ b/cujson/parse_standard_json.cu @@ -1319,9 +1319,22 @@ inline uint8_t * Tokenize( uint8_t* block_GPU, uint8_t* out_string_open_close_8_GPU; uint32_t* out_string_open_close_8_index_GPU; // it's going to store structural index, not real index - cudaMallocAsync(&out_string_open_close_8_GPU, (last_index_tokens_open_close + padding2) * sizeof(uint8_t),0); - cudaMallocAsync(&out_string_open_close_8_index_GPU, last_index_tokens_open_close * sizeof(uint32_t),0); - + // Fixed: Initialize it to 0 before using it in the kernel to avoid undefined behavior. + cudaMallocAsync(&out_string_open_close_8_GPU, + (last_index_tokens_open_close + padding2) * sizeof(uint8_t), + 0); + cudaMemsetAsync(out_string_open_close_8_GPU, + 0, + (last_index_tokens_open_close + padding2) * sizeof(uint8_t), + 0); + + cudaMallocAsync(&out_string_open_close_8_index_GPU, + last_index_tokens_open_close * sizeof(uint32_t), + 0); + cudaMemsetAsync(out_string_open_close_8_index_GPU, + 0, + last_index_tokens_open_close * sizeof(uint32_t), + 0); // cout << "res size before remove copy: " << last_index_tokens_open_close << "\n"; // cudaEventRecord(start, 0); @@ -1575,7 +1588,9 @@ int32_t* Parser(uint8_t* open_close_GPU, int32_t** open_close_index_d, int32_t* int32_t* res; // temporary result that will use in following uint32_t* oc_1; // output - cudaMallocAsync(&oc_1, oc_cnt_32*sizeof(uint32_t), 0); + // Fixed: Allocate memory for oc_1 on the device and initialize it to zero. + cudaMallocAsync(&oc_1, oc_cnt_32 * sizeof(uint32_t), 0); + cudaMemsetAsync(oc_1, 0, oc_cnt_32 * sizeof(uint32_t), 0); depth_init_MathAPI<<>>( (uint32_t*) open_close_GPU, oc_1, oc_cnt_32, oc_cnt); cudaStreamSynchronize(0);