5 changes: 4 additions & 1 deletion README.md
@@ -39,6 +39,9 @@ Among the motivations for this project, it is possible to highlight:

- The COBOL copybooks parser doesn't have a Spark dependency and can be reused for integrating into other data processing engines.

- Supports reading files compressed in a Hadoop-compatible way (gzip, bzip2, etc.), but with limited parallelism.
  Uncompressed files are preferred for performance.
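  A minimal usage sketch for compressed input, assuming an existing `SparkSession` named `spark`; the paths are placeholders and `record_format` is taken from the general Cobrix options rather than from this diff:

  ```scala
  // Sketch only: the Hadoop-compatible codec is assumed to be inferred from the file extension,
  // so no extra option is needed for gzip input, but the read has limited parallelism.
  val df = spark.read
    .format("cobol")
    .option("copybook", "/path/to/copybook.cpy")   // placeholder path
    .option("record_format", "V")                  // variable-length records
    .load("/data/transactions.dat.gz")             // gzip-compressed input
  ```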

## Videos

We have presented Cobrix at DataWorks Summit 2019 and Spark Summit 2019 conferences. The screencasts are available here:
@@ -1602,7 +1605,7 @@ The output looks like this:
| .option("redefine-segment-id-map:0", "REDEFINED_FIELD1 => SegmentId1,SegmentId2,...") | Specifies a mapping between redefined field names and segment id values. Each option specifies a mapping for a single segment. The numeric value for each mapping option must be incremented so the option keys are unique. |
| .option("segment-children:0", "COMPANY => EMPLOYEE,DEPARTMENT") | Specifies a mapping between segment redefined fields and their children. Each option specifies a mapping for a single parent field. The numeric value for each mapping option must be incremented so the option keys are unique. If such mapping is specified hierarchical record structure will be automatically reconstructed. This require `redefine-segment-id-map` to be set. |
| .option("enable_indexes", "true") | Turns on indexing of multisegment variable length files (on by default). |
| .option("enable_index_cache", "false") | When true, calculated indexes are cached in memory for later use. This improves performance of processing when same files are processed more than once. |
| .option("enable_index_cache", "true") | When true (default), calculated indexes are cached in memory for later use. This improves performance of processing when same files are processed more than once. |
| .option("input_split_records", 50000) | Specifies how many records will be allocated to each split/partition. It will be processed by Spark tasks. (The default is not set and the split will happen according to size, see the next option) |
| .option("input_split_size_mb", 100) | Specify how many megabytes to allocate to each partition/split. (The default is 100 MB) |

@@ -42,6 +42,8 @@ import scala.reflect.ClassTag
class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
readerProperties: ReaderParameters,
handler: RecordHandler[T]) extends VarLenReader with Logging with Serializable {
private val DEFAULT_INDEX_SIZE_COMPRESSED_FILES_MB = 1024
private val DEFAULT_FS_INDEX_SIZE_MULTIPLIER = 4

protected val cobolSchema: CobolSchema = loadCopyBook(copybookContents)

@@ -144,7 +146,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
fileNumber: Int,
isRdwBigEndian: Boolean): ArrayBuffer[SparseIndexEntry] = {
val inputSplitSizeRecords: Option[Int] = readerProperties.inputSplitRecords
val inputSplitSizeMB: Option[Int] = getSplitSizeMB
val inputSplitSizeMB: Option[Int] = getSplitSizeMB(dataStream.isCompressed)

if (inputSplitSizeRecords.isDefined) {
if (inputSplitSizeRecords.get < 1 || inputSplitSizeRecords.get > 1000000000) {
@@ -153,8 +155,9 @@
logger.info(s"Input split size = ${inputSplitSizeRecords.get} records")
} else {
if (inputSplitSizeMB.nonEmpty) {
if (inputSplitSizeMB.get < 1 || inputSplitSizeMB.get > 2000) {
throw new IllegalArgumentException(s"Invalid input split size of ${inputSplitSizeMB.get} MB.")
val maxSplitSizeMB = if (dataStream.isCompressed) 200000 else 2000
if (inputSplitSizeMB.get < 1 || inputSplitSizeMB.get > maxSplitSizeMB) {
throw new IllegalArgumentException(s"Invalid input split size of ${inputSplitSizeMB.get} MB (max allowed: $maxSplitSizeMB MB).")
}
logger.info(s"Input split size = ${inputSplitSizeMB.get} MB")
}
@@ -214,11 +217,11 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
}
}

private def getSplitSizeMB: Option[Int] = {
if (readerProperties.inputSplitSizeMB.isDefined) {
readerProperties.inputSplitSizeMB
private def getSplitSizeMB(isCompressed: Boolean): Option[Int] = {
if (isCompressed) {
readerProperties.inputSplitSizeCompressedMB.orElse(Some(DEFAULT_INDEX_SIZE_COMPRESSED_FILES_MB))
} else {
readerProperties.hdfsDefaultBlockSize
readerProperties.inputSplitSizeMB.orElse(readerProperties.fsDefaultBlockSize).map(_ * DEFAULT_FS_INDEX_SIZE_MULTIPLIER)
}
}
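For reviewers, a standalone sketch of the fallback order implemented above; the helper name is hypothetical, and the constant values are the ones defined at the top of this class:

```scala
// Mirrors getSplitSizeMB: compressed streams use the explicit option or a 1024 MB default;
// uncompressed streams use the explicit option or the FS default block size, with the multiplier applied.
def effectiveSplitSizeMB(isCompressed: Boolean,
                         inputSplitSizeMB: Option[Int],
                         inputSplitSizeCompressedMB: Option[Int],
                         fsDefaultBlockSize: Option[Int]): Option[Int] =
  if (isCompressed)
    inputSplitSizeCompressedMB.orElse(Some(1024))            // DEFAULT_INDEX_SIZE_COMPRESSED_FILES_MB
  else
    inputSplitSizeMB.orElse(fsDefaultBlockSize).map(_ * 4)   // DEFAULT_FS_INDEX_SIZE_MULTIPLIER
```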

@@ -118,13 +118,14 @@ object CobolParametersParser extends Logging {
val PARAM_SEGMENT_REDEFINE_PREFIX_ALT = "redefine-segment-id-map"

// Indexed multisegment file processing
val PARAM_ENABLE_INDEXES = "enable_indexes"
val PARAM_ENABLE_INDEX_CACHE = "enable_index_cache"
val PARAM_INPUT_SPLIT_RECORDS = "input_split_records"
val PARAM_INPUT_SPLIT_SIZE_MB = "input_split_size_mb"
val PARAM_SEGMENT_ID_PREFIX = "segment_id_prefix"
val PARAM_OPTIMIZE_ALLOCATION = "optimize_allocation"
val PARAM_IMPROVE_LOCALITY = "improve_locality"
val PARAM_ENABLE_INDEXES = "enable_indexes"
val PARAM_ENABLE_INDEX_CACHE = "enable_index_cache"
val PARAM_INPUT_SPLIT_RECORDS = "input_split_records"
val PARAM_INPUT_SPLIT_SIZE_MB = "input_split_size_mb"
val PARAM_INPUT_SPLIT_SIZE_COMPRESSED_MB = "input_split_size_compressed_mb"
val PARAM_SEGMENT_ID_PREFIX = "segment_id_prefix"
val PARAM_OPTIMIZE_ALLOCATION = "optimize_allocation"
val PARAM_IMPROVE_LOCALITY = "improve_locality"
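A hedged usage sketch for the new `input_split_size_compressed_mb` option defined above, assuming an existing `SparkSession` named `spark`; paths are placeholders, and the 1024 MB fallback comes from this PR's reader changes:

```scala
// Overriding the split/index size used for Hadoop-compressed input.
val df = spark.read
  .format("cobol")
  .option("copybook", "/path/to/copybook.cpy")     // placeholder
  .option("input_split_size_compressed_mb", 512)   // if omitted, the reader falls back to 1024 MB
  .load("/data/archive.dat.bz2")                   // placeholder, bzip2-compressed
```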

// Parameters for debugging
val PARAM_DEBUG_LAYOUT_POSITIONS = "debug_layout_positions"
@@ -385,6 +386,7 @@ object CobolParametersParser extends Logging {
isIndexCachingAllowed = false,
inputSplitRecords = None,
inputSplitSizeMB = None,
inputSplitSizeCompressedMB = None,
improveLocality = false,
optimizeAllocation = false,
inputFileNameColumn = "",
@@ -421,7 +423,8 @@
isIndexCachingAllowed = varLenParams.isIndexCachingAllowed,
inputSplitRecords = varLenParams.inputSplitRecords,
inputSplitSizeMB = varLenParams.inputSplitSizeMB,
hdfsDefaultBlockSize = defaultBlockSize,
inputSplitSizeCompressedMB = varLenParams.inputSplitSizeCompressedMB,
fsDefaultBlockSize = defaultBlockSize,
startOffset = parameters.recordStartOffset,
endOffset = parameters.recordEndOffset,
fileStartOffset = varLenParams.fileStartOffset,
@@ -505,9 +508,10 @@ object CobolParametersParser extends Logging {
fileEndOffset,
isRecordIdGenerationEnabled,
params.getOrElse(PARAM_ENABLE_INDEXES, "true").toBoolean,
params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "false").toBoolean,
params.getOrElse(PARAM_ENABLE_INDEX_CACHE, "true").toBoolean,
Contributor review comment:

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, find the file and examine its current state
find . -name "CobolParametersParser.scala" -type f 2>/dev/null

# Check if the file exists and examine lines around 511
if [ -f "cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala" ]; then
  echo "=== File found, showing context around line 511 ==="
  sed -n '505,520p' cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala
fi

# Search for PARAM_ENABLE_INDEX_CACHE throughout the codebase
echo ""
echo "=== Searching for PARAM_ENABLE_INDEX_CACHE references ==="
rg -n "PARAM_ENABLE_INDEX_CACHE" --type scala

# Check git history for changes to this parameter
echo ""
echo "=== Checking git log for CobolParametersParser changes ==="
git log --oneline --all -- "cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala" 2>/dev/null | head -20

Repository: AbsaOSS/cobrix

Length of output: 3226


🏁 Script executed:

#!/bin/bash
# Check for release notes, changelog, or README documentation
echo "=== Checking for README or documentation about index cache ==="
rg -n "index_cache|index cache" -i -g "*.md" -g "README*"

# Look at the specific commit to understand the intent
echo ""
echo "=== Checking commit details ==="
git show --stat 8000d431 2>/dev/null | head -50

# Check if there are tests specifically for this default
echo ""
echo "=== Looking for tests related to index cache defaults ==="
rg -n "enable_index_cache.*true" --type scala -C 2

Repository: AbsaOSS/cobrix

Length of output: 5366


Index cache now enabled by default — verify memory impact guidance.

The default value for PARAM_ENABLE_INDEX_CACHE has changed from "false" to "true". This is a behavioral change documented in README.md that will affect all users who don't explicitly set this option.

Impact: Index caching improves performance by reusing generated indexes across jobs, but increases memory usage. While the README documents this feature as "true (default)", it lacks explicit guidance on:

  • Memory consumption for large files or multiple concurrent jobs
  • Whether memory bounds or limits are enforced
  • Recommendations for resource-constrained environments

Recommendation: Enhance documentation to include:

  1. Memory impact guidance (e.g., approximate memory usage patterns)
  2. Configuration advice for environments with limited resources
  3. Information about memory bounds or cleanup mechanisms for long-running jobs
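For example, a short, hedged snippet like the following could accompany the documentation (paths are placeholders; the option name is the one changed in this PR):

```scala
// Explicitly disabling index caching for memory-constrained environments.
val df = spark.read
  .format("cobol")
  .option("copybook", "/path/to/copybook.cpy")   // placeholder
  .option("enable_index_cache", "false")         // restores the previous default behaviour
  .load("/data/large_input.dat")                 // placeholder
```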
🤖 Prompt for AI Agents
In
cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParametersParser.scala
around line 511, the default for PARAM_ENABLE_INDEX_CACHE was changed to true
which increases memory usage by caching indexes; update the project
documentation (README.md) to explicitly document the memory impact and
operational guidance: add a new section describing approximate memory usage
characteristics for index caches (e.g., per-file or per-part memory patterns and
how size scales with record count and schema complexity), describe behavior for
multiple concurrent jobs, note whether the cache has any enforced bounds or
eviction/cleanup mechanisms, and provide practical recommendations for
resource-constrained environments (how to disable the cache, tune related
settings, and monitor/limit memory use), plus a short example config snippet
showing how to explicitly disable the cache when needed.

params.get(PARAM_INPUT_SPLIT_RECORDS).map(v => v.toInt),
params.get(PARAM_INPUT_SPLIT_SIZE_MB).map(v => v.toInt),
params.get(PARAM_INPUT_SPLIT_SIZE_COMPRESSED_MB).map(v => v.toInt),
params.getOrElse(PARAM_IMPROVE_LOCALITY, "true").toBoolean,
params.getOrElse(PARAM_OPTIMIZE_ALLOCATION, "false").toBoolean,
params.getOrElse(PARAM_INPUT_FILE_COLUMN, ""),
@@ -20,7 +20,7 @@ import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat
import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy.DebugFieldsPolicy
import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
import za.co.absa.cobrix.cobol.parser.policies.{CommentPolicy, DebugFieldsPolicy, FillerNamingPolicy, MetadataPolicy, StringTrimmingPolicy}
import za.co.absa.cobrix.cobol.parser.policies._
import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat
import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat.FixedLength
import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
@@ -50,7 +50,8 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
* @param isIndexGenerationNeeded Specifies whether the input file should be indexed before processing
* @param inputSplitRecords The number of records to include in each partition. Note that mainframe records may have variable size; inputSplitSizeMB is the recommended option
* @param inputSplitSizeMB A partition size to target. In certain circumstances this size may not be matched exactly, but the library will make a best effort to target that size
* @param hdfsDefaultBlockSize Default HDFS block size for the HDFS filesystem used. This value is used as the default split size if inputSplitSizeMB is not specified
* @param inputSplitSizeCompressedMB A partition size to target for compressed files.
* @param fsDefaultBlockSize Default block size of the filesystem used (e.g. HDFS). This value is used as the default split size if inputSplitSizeMB is not specified
* @param startOffset An offset to the start of the record in each binary data block.
* @param endOffset An offset from the end of the record to the end of the binary data block.
* @param fileStartOffset A number of bytes to skip at the beginning of each file
@@ -102,7 +103,8 @@ case class ReaderParameters(
isIndexCachingAllowed: Boolean = false,
inputSplitRecords: Option[Int] = None,
inputSplitSizeMB: Option[Int] = None,
hdfsDefaultBlockSize: Option[Int] = None,
inputSplitSizeCompressedMB: Option[Int] = None,
fsDefaultBlockSize: Option[Int] = None,
startOffset: Int = 0,
endOffset: Int = 0,
fileStartOffset: Int = 0,
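A hedged construction sketch for the renamed and added fields, assuming (as the visible parameters suggest) that all fields of `ReaderParameters` have defaults; the package path is taken from the parser path quoted in the review comment above:

```scala
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters

// Only the split-size related fields are set; everything else keeps its default value.
val readerParams = ReaderParameters(
  inputSplitSizeMB           = Some(100),   // target split size for uncompressed input
  inputSplitSizeCompressedMB = Some(1024),  // new field from this PR, used for compressed streams
  fsDefaultBlockSize         = Some(128)    // renamed from hdfsDefaultBlockSize
)
```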