From 66d1ccda44ea7c5b7766b6b8826180e3702456b5 Mon Sep 17 00:00:00 2001
From: Soroush Safari Loaliyan <minato1374@gmail.com>
Date: Wed, 24 Jun 2026 10:24:59 -0700
Subject: [PATCH] Initialize padded sort buffers before CUB sort

---
 cujson/parse_json_lines.cu    | 25 ++++++++++++++++++++-----
 cujson/parse_standard_json.cu | 23 +++++++++++++++++++----
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/cujson/parse_json_lines.cu b/cujson/parse_json_lines.cu
index 515a127..f9a1001 100644
--- a/cujson/parse_json_lines.cu
+++ b/cujson/parse_json_lines.cu
@@ -945,9 +945,22 @@ inline uint8_t * stage2_tokenizer(  uint8_t* block_GPU,
 
     uint8_t* out_string_open_close_8_GPU;
     uint32_t* out_string_open_close_8_index_GPU; // it's going to store structural index, not real index
-    cudaMallocAsync(&out_string_open_close_8_GPU, (last_index_tokens_open_close + padding2)  * sizeof(uint8_t),0);
-    cudaMallocAsync(&out_string_open_close_8_index_GPU, last_index_tokens_open_close * sizeof(uint32_t),0);
-
+    // Zero-fill both output buffers (the byte buffer including its padding2 tail, and the index buffer) right after allocation so later reads never see uninitialized device memory.
+    cudaMallocAsync(&out_string_open_close_8_GPU,
+                    (last_index_tokens_open_close + padding2) * sizeof(uint8_t),
+                    0);
+    cudaMemsetAsync(out_string_open_close_8_GPU,
+                    0,
+                    (last_index_tokens_open_close + padding2) * sizeof(uint8_t),
+                    0);
+
+    cudaMallocAsync(&out_string_open_close_8_index_GPU,
+                    last_index_tokens_open_close * sizeof(uint32_t),
+                    0);
+    cudaMemsetAsync(out_string_open_close_8_index_GPU,
+                    0,
+                    last_index_tokens_open_close * sizeof(uint32_t),
+                    0);
     // extractStructuralIdx(): extractStructuralIdx() + extractOpenCloseIdx()
     extractStructuralIdx<<<numBlock, BLOCKSIZE>>>(acc_structural_cnt,       // prefix sum set bits until each word of structural
                                         acc_open_close_cnt,                 // prefix sum set bits until each word of open close
@@ -1061,8 +1074,10 @@ int32_t* stage3_parser(uint8_t* open_close_bitmap, int32_t** open_close_index_d,
     // _______________STEP_1__(a)_________________    
     int32_t* res; // temporary result that will use in following
     uint32_t* oc_1; // output 
-    cudaMallocAsync(&oc_1, oc_cnt_32*sizeof(uint32_t), 0); 
-    
+    // Allocate oc_1 and zero-fill it before map_open_close is launched, so no element is left uninitialized.
+    cudaMallocAsync(&oc_1, oc_cnt_32 * sizeof(uint32_t), 0);
+    cudaMemsetAsync(oc_1, 0, oc_cnt_32 * sizeof(uint32_t), 0);
+
     map_open_close<<<numBlock_open_close_32, BLOCKSIZE>>>( (uint32_t*) open_close_bitmap, oc_1, oc_cnt_32, oc_cnt);
     cudaStreamSynchronize(0);
 
diff --git a/cujson/parse_standard_json.cu b/cujson/parse_standard_json.cu
index 4127927..db2ea83 100644
--- a/cujson/parse_standard_json.cu
+++ b/cujson/parse_standard_json.cu
@@ -1319,9 +1319,22 @@ inline uint8_t * Tokenize(  uint8_t* block_GPU,
 
     uint8_t* out_string_open_close_8_GPU;
     uint32_t* out_string_open_close_8_index_GPU; // it's going to store structural index, not real index
-    cudaMallocAsync(&out_string_open_close_8_GPU, (last_index_tokens_open_close + padding2)  * sizeof(uint8_t),0);
-    cudaMallocAsync(&out_string_open_close_8_index_GPU, last_index_tokens_open_close * sizeof(uint32_t),0);
-
+    // Zero-fill both output buffers (the byte buffer including its padding2 tail, and the index buffer) right after allocation so later reads never see uninitialized device memory.
+    cudaMallocAsync(&out_string_open_close_8_GPU,
+                    (last_index_tokens_open_close + padding2) * sizeof(uint8_t),
+                    0);
+    cudaMemsetAsync(out_string_open_close_8_GPU,
+                    0,
+                    (last_index_tokens_open_close + padding2) * sizeof(uint8_t),
+                    0);
+
+    cudaMallocAsync(&out_string_open_close_8_index_GPU,
+                    last_index_tokens_open_close * sizeof(uint32_t),
+                    0);
+    cudaMemsetAsync(out_string_open_close_8_index_GPU,
+                    0,
+                    last_index_tokens_open_close * sizeof(uint32_t),
+                    0);
     // cout << "res size before remove copy: " << last_index_tokens_open_close << "\n";
 
     // cudaEventRecord(start, 0);
@@ -1575,7 +1588,9 @@ int32_t* Parser(uint8_t* open_close_GPU, int32_t** open_close_index_d,  int32_t*
     int32_t* res; // temporary result that will use in following
 
     uint32_t* oc_1; // output 
-    cudaMallocAsync(&oc_1, oc_cnt_32*sizeof(uint32_t), 0); 
+    // Allocate oc_1 and zero-fill it before depth_init_MathAPI is launched, so no element is left uninitialized.
+    cudaMallocAsync(&oc_1, oc_cnt_32 * sizeof(uint32_t), 0);
+    cudaMemsetAsync(oc_1, 0, oc_cnt_32 * sizeof(uint32_t), 0);
 
     depth_init_MathAPI<<<numBlock_open_close_32, BLOCKSIZE>>>( (uint32_t*) open_close_GPU, oc_1, oc_cnt_32, oc_cnt);
     cudaStreamSynchronize(0);