@@ -9,6 +9,7 @@
 import logging
 import os
 import time
+
 import requests
 
 CHUNKS = 10
@@ -26,6 +27,7 @@
     handlers=[logging.StreamHandler()],
 )
 
+
 def download_shard(url, filename, retry=RETRIES):
     """Download a shard from the given URL and save it to the specified filename."""
     if os.path.exists(filename):
@@ -34,7 +36,7 @@ def download_shard(url, filename, retry=RETRIES):
 
     try:
         response = requests.get(url, timeout=REQUEST_TIMEOUT)
-
+
         if response.status_code == 429 and retry > 0:
             time.sleep(BACKOFF_TIME)
             logging.warning("Throttled. Retrying download for %s...", filename)
@@ -44,76 +46,109 @@ def download_shard(url, filename, retry=RETRIES):
         if response.status_code != 200:
             if retry > 0:
                 time.sleep(BACKOFF_TIME)
-                logging.warning("HTTP %s for %s. Retrying (%d attempts left)...",
-                                response.status_code, filename, retry)
+                logging.warning(
+                    "HTTP %s for %s. Retrying (%d attempts left)...",
+                    response.status_code,
+                    filename,
+                    retry,
+                )
                 download_shard(url, filename, retry=retry - 1)
                 return
             else:
-                logging.error("Failed to download %s: HTTP %s", url, response.status_code)
+                logging.error(
+                    "Failed to download %s: HTTP %s", url, response.status_code
+                )
                 return
 
         with open(filename, "wb") as fn:
             fn.write(response.content)
         logging.info("Downloaded %s", filename)
-
+
     except requests.exceptions.Timeout:
         if retry > 0:
             time.sleep(BACKOFF_TIME)
-            logging.warning("Timeout downloading %s. Retrying (%d attempts left)...", filename, retry)
+            logging.warning(
+                "Timeout downloading %s. Retrying (%d attempts left)...",
+                filename,
+                retry,
+            )
             download_shard(url, filename, retry=retry - 1)
         else:
             logging.error("Timeout downloading %s after %d retries", filename, RETRIES)
     except requests.exceptions.RequestException as e:
         if retry > 0:
             time.sleep(BACKOFF_TIME)
-            logging.warning("Network error downloading %s: %s. Retrying (%d attempts left)...",
-                            filename, str(e), retry)
+            logging.warning(
+                "Network error downloading %s: %s. Retrying (%d attempts left)...",
+                filename,
+                str(e),
+                retry,
+            )
             download_shard(url, filename, retry=retry - 1)
         else:
-            logging.error("Network error downloading %s after %d retries: %s", filename, RETRIES, str(e))
+            logging.error(
+                "Network error downloading %s after %d retries: %s",
+                filename,
+                RETRIES,
+                str(e),
+            )
+
 
-def download(directory, full_dataset=True, sample_files=100, worker_index=0, total_workers=1):
+def download(
+    directory, full_dataset=True, sample_files=100, worker_index=0, total_workers=1
+):
     """Download SlimPajama dataset from Hugging Face with parallel worker support."""
     files_downloaded = 0
     files_to_process = []
-
+
     # First, calculate all files that need to be downloaded
     for chunk in range(1, CHUNKS + 1):
-        shard_limit = SHARDS if full_dataset else min(sample_files // CHUNKS + 1, SHARDS)
+        shard_limit = (
+            SHARDS if full_dataset else min(sample_files // CHUNKS + 1, SHARDS)
+        )
        for shard in range(0, shard_limit):
             if not full_dataset and len(files_to_process) >= sample_files:
                 break
-
+
             filename = f"example_train_chunk{chunk}_shard{shard}.jsonl.zst"
             url = f"{REPOSITORY_PATH}/chunk{chunk}/example_train_{shard}.jsonl.zst"
             files_to_process.append((filename, url))
-
+
         if not full_dataset and len(files_to_process) >= sample_files:
             break
-
+
     # Limit to sample_files if not downloading full dataset
     if not full_dataset:
         files_to_process = files_to_process[:sample_files]
-
+
     # Distribute files across workers using modulo
-    worker_files = [file_info for i, file_info in enumerate(files_to_process) if i % total_workers == worker_index]
-
-    logging.info(f"Worker {worker_index}/{total_workers}: Processing {len(worker_files)} files out of {len(files_to_process)} total files")
-
+    worker_files = [
+        file_info
+        for i, file_info in enumerate(files_to_process)
+        if i % total_workers == worker_index
+    ]
+
+    logging.info(
+        f"Worker {worker_index}/{total_workers}: Processing {len(worker_files)} files out of {len(files_to_process)} total files"
+    )
+
     # Download assigned files
     for filename, url in worker_files:
         full_filename = os.path.join(directory, filename)
         download_shard(url, full_filename)
         files_downloaded += 1
-
-    logging.info(f"Worker {worker_index} completed: Downloaded {files_downloaded} files")
-
+
+    logging.info(
+        f"Worker {worker_index} completed: Downloaded {files_downloaded} files"
+    )
+
     # Create completion marker file
     completion_file = os.path.join(directory, f".download-{worker_index}-complete")
     with open(completion_file, "w") as f:
         f.write(f"Worker {worker_index} completed downloading {files_downloaded} files")
     logging.info(f"Created completion marker: {completion_file}")
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Download SlimPajama from Hugging Face with parallel worker support."
@@ -151,9 +186,9 @@ def download(directory, full_dataset=True, sample_files=100, worker_index=0, tot
 
     os.makedirs(args.directory, exist_ok=True)
     download(
-        args.directory,
-        args.full_dataset,
+        args.directory,
+        args.full_dataset,
         args.sample_files,
         args.worker_index,
-        args.total_workers
+        args.total_workers,
     )
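
Note: the retry shape in download_shard above (sleep, then recurse with one fewer attempt) is easiest to see in isolation. Below is a minimal standalone sketch of the same pattern; the constant values and the fetch_with_retry/action names are assumptions for illustration, since RETRIES and BACKOFF_TIME are defined earlier in the script and not shown in this diff.

import time

RETRIES = 3       # assumed value; the real constant is defined above this diff
BACKOFF_TIME = 5  # assumed value, in seconds

def fetch_with_retry(action, retry=RETRIES):
    """Run action(), retrying up to RETRIES times with a fixed backoff."""
    try:
        return action()
    except IOError:
        if retry > 0:
            time.sleep(BACKOFF_TIME)
            # Recurse with a smaller budget, mirroring
            # download_shard(url, filename, retry=retry - 1)
            return fetch_with_retry(action, retry=retry - 1)
        raise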
0 commit comments
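Likewise, the modulo distribution in download() partitions work with no coordination between workers: index i is claimed only by worker i % total_workers, so the per-worker lists are disjoint and together cover every file. A quick illustrative check (hypothetical file names and worker counts, not part of the script):

files = [f"shard{i}.jsonl.zst" for i in range(10)]  # stand-in file list
total_workers = 3

for worker_index in range(total_workers):
    # Same list-comprehension shape as download()'s worker_files
    worker_files = [f for i, f in enumerate(files) if i % total_workers == worker_index]
    print(worker_index, worker_files)

# Worker 0 gets indices 0, 3, 6, 9; worker 1 gets 1, 4, 7; worker 2 gets 2, 5, 8.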