Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::encryption::{FileDecryptionProperties, FileEncryptionProperties};
use crate::error::_config_err;
use crate::format::{ExplainAnalyzeLevel, ExplainFormat};
use crate::parquet_config::DFParquetWriterVersion;
use crate::parsers::CompressionTypeVariant;
use crate::parsers::{CompressionTypeVariant, CsvQuoteStyle};
use crate::utils::get_available_parallelism;
use crate::{DataFusionError, Result};
#[cfg(feature = "parquet_encryption")]
Expand Down Expand Up @@ -1855,6 +1855,17 @@ impl ConfigField for CompressionTypeVariant {
}
}

impl ConfigField for CsvQuoteStyle {
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
v.some(key, self, description)
}

fn set(&mut self, _: &str, value: &str) -> Result<()> {
*self = CsvQuoteStyle::from_str(value)?;
Ok(())
}
}

/// An implementation trait used to recursively walk configuration
pub trait Visit {
fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str);
Expand Down Expand Up @@ -2927,6 +2938,13 @@ config_namespace! {
pub terminator: Option<u8>, default = None
pub escape: Option<u8>, default = None
pub double_quote: Option<bool>, default = None
/// Quote style for CSV writing.
/// One of: "Always", "Necessary", "NonNumeric", "Never"
pub quote_style: CsvQuoteStyle, default = CsvQuoteStyle::Necessary
/// Whether to ignore leading whitespace in string values when writing CSV.
pub ignore_leading_whitespace: Option<bool>, default = None
/// Whether to ignore trailing whitespace in string values when writing CSV.
pub ignore_trailing_whitespace: Option<bool>, default = None
/// Specifies whether newlines in (quoted) values are supported.
///
/// Parsing newlines in quoted values may be affected by execution behaviour such as
Expand Down Expand Up @@ -3035,6 +3053,30 @@ impl CsvOptions {
self
}

/// Set the quote style for CSV writing.
pub fn with_quote_style(mut self, quote_style: CsvQuoteStyle) -> Self {
self.quote_style = quote_style;
self
}

/// Set whether to ignore leading whitespace in string values when writing CSV.
pub fn with_ignore_leading_whitespace(
mut self,
ignore_leading_whitespace: bool,
) -> Self {
self.ignore_leading_whitespace = Some(ignore_leading_whitespace);
self
}

/// Set whether to ignore trailing whitespace in string values when writing CSV.
pub fn with_ignore_trailing_whitespace(
mut self,
ignore_trailing_whitespace: bool,
) -> Self {
self.ignore_trailing_whitespace = Some(ignore_trailing_whitespace);
self
}

/// Specifies whether newlines in (quoted) values are supported.
///
/// Parsing newlines in quoted values may be affected by execution behaviour such as
Expand Down
17 changes: 15 additions & 2 deletions datafusion/common/src/file_options/csv_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@

use crate::config::CsvOptions;
use crate::error::{DataFusionError, Result};
use crate::parsers::CompressionTypeVariant;
use crate::parsers::{CompressionTypeVariant, CsvQuoteStyle};

use arrow::csv::WriterBuilder;
use arrow::csv::{QuoteStyle, WriterBuilder};

/// Options for writing CSV files
#[derive(Clone, Debug)]
Expand Down Expand Up @@ -94,6 +94,19 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
if let Some(v) = &value.double_quote {
builder = builder.with_double_quote(*v)
}
let style = match value.quote_style {
CsvQuoteStyle::Always => QuoteStyle::Always,
CsvQuoteStyle::NonNumeric => QuoteStyle::NonNumeric,
CsvQuoteStyle::Never => QuoteStyle::Never,
CsvQuoteStyle::Necessary => QuoteStyle::Necessary,
};
builder = builder.with_quote_style(style);
if let Some(v) = &value.ignore_leading_whitespace {
builder = builder.with_ignore_leading_whitespace(*v)
}
if let Some(v) = &value.ignore_trailing_whitespace {
builder = builder.with_ignore_trailing_whitespace(*v)
}
Ok(CsvWriterOptions {
writer_options: builder,
compression: value.compression,
Expand Down
45 changes: 45 additions & 0 deletions datafusion/common/src/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,48 @@ impl CompressionTypeVariant {
!matches!(self, &Self::UNCOMPRESSED)
}
}

/// CSV quote style
///
/// Controls when fields are quoted when writing CSV files.
/// Corresponds to [`arrow::csv::QuoteStyle`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum CsvQuoteStyle {
/// Quote all fields
Always,
/// Only quote fields when necessary (default)
#[default]
Necessary,
/// Quote all non-numeric fields
NonNumeric,
/// Never quote fields
Never,
}

impl FromStr for CsvQuoteStyle {
type Err = DataFusionError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"Always" | "ALWAYS" | "always" => Ok(Self::Always),
"Necessary" | "NECESSARY" | "necessary" | "" => Ok(Self::Necessary),
"NonNumeric" | "NON_NUMERIC" | "nonnumeric" => Ok(Self::NonNumeric),
"Never" | "NEVER" | "never" => Ok(Self::Never),
_ => Err(DataFusionError::NotImplemented(format!(
"Unsupported CSV quote style {s}"
))),
}
}
}

impl Display for CsvQuoteStyle {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let str = match self {
Self::Always => "Always",
Self::Necessary => "Necessary",
Self::NonNumeric => "NonNumeric",
Self::Never => "Never",
};
write!(f, "{str}")
}
}
19 changes: 19 additions & 0 deletions datafusion/proto-common/proto/datafusion_common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,13 @@ message JsonWriterOptions {
}


enum CsvQuoteStyle {
NECESSARY = 0;
ALWAYS = 1;
NON_NUMERIC = 2;
NEVER = 3;
}

message CsvWriterOptions {
// Compression type
CompressionTypeVariant compression = 1;
Expand All @@ -453,6 +460,12 @@ message CsvWriterOptions {
string escape = 10;
// Optional flag whether to double quotes, instead of escaping. Defaults to `true`
bool double_quote = 11;
// Quote style for CSV writing
CsvQuoteStyle quote_style = 12;
// Whether to ignore leading whitespace in string values
bool ignore_leading_whitespace = 13;
// Whether to ignore trailing whitespace in string values
bool ignore_trailing_whitespace = 14;
}

// Options controlling CSV format
Expand All @@ -476,6 +489,12 @@ message CsvOptions {
bytes terminator = 17; // Optional terminator character as a byte
bytes truncated_rows = 18; // Indicates if truncated rows are allowed
optional uint32 compression_level = 19; // Optional compression level
// Quote style for CSV writing
CsvQuoteStyle quote_style = 20;
// Whether to ignore leading whitespace in string values
bytes ignore_leading_whitespace = 21;
// Whether to ignore trailing whitespace in string values
bytes ignore_trailing_whitespace = 22;
}

// Options controlling CSV format
Expand Down
37 changes: 35 additions & 2 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::common::proto_error;
use crate::protobuf_common as protobuf;
use arrow::array::{ArrayRef, AsArray};
use arrow::buffer::Buffer;
use arrow::csv::WriterBuilder;
use arrow::csv::{QuoteStyle, WriterBuilder};
use arrow::datatypes::{
DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema,
TimeUnit, UnionFields, UnionMode, i256,
Expand Down Expand Up @@ -947,6 +947,17 @@ impl From<CompressionTypeVariant> for protobuf::CompressionTypeVariant {
}
}

impl From<protobuf::CsvQuoteStyle> for datafusion_common::parsers::CsvQuoteStyle {
fn from(value: protobuf::CsvQuoteStyle) -> Self {
match value {
protobuf::CsvQuoteStyle::Necessary => Self::Necessary,
protobuf::CsvQuoteStyle::Always => Self::Always,
protobuf::CsvQuoteStyle::NonNumeric => Self::NonNumeric,
protobuf::CsvQuoteStyle::Never => Self::Never,
}
}
}

impl TryFrom<&protobuf::CsvWriterOptions> for CsvWriterOptions {
type Error = DataFusionError;

Expand Down Expand Up @@ -1003,6 +1014,15 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {
.then(|| proto_opts.null_regex.clone()),
comment: proto_opts.comment.first().copied(),
truncated_rows: proto_opts.truncated_rows.first().map(|h| *h != 0),
quote_style: proto_opts.quote_style().into(),
ignore_leading_whitespace: proto_opts
.ignore_leading_whitespace
.first()
.map(|h| *h != 0),
ignore_trailing_whitespace: proto_opts
.ignore_trailing_whitespace
.first()
.map(|h| *h != 0),
})
}
}
Expand Down Expand Up @@ -1253,12 +1273,25 @@ pub(crate) fn csv_writer_options_from_proto(
return Err(proto_error("Error parsing CSV Escape"));
}
}
let quote_style = match protobuf::CsvQuoteStyle::try_from(writer_options.quote_style)
{
Ok(protobuf::CsvQuoteStyle::Always) => QuoteStyle::Always,
Ok(protobuf::CsvQuoteStyle::NonNumeric) => QuoteStyle::NonNumeric,
Ok(protobuf::CsvQuoteStyle::Never) => QuoteStyle::Never,
Ok(protobuf::CsvQuoteStyle::Necessary) => QuoteStyle::Necessary,
_ => Err(proto_error(
"Unknown quote style, must be one of: 'Always', 'NonNumeric', 'Never', 'Necessary'",
))?,
};
Ok(builder
.with_header(writer_options.has_header)
.with_date_format(writer_options.date_format.clone())
.with_datetime_format(writer_options.datetime_format.clone())
.with_timestamp_format(writer_options.timestamp_format.clone())
.with_time_format(writer_options.time_format.clone())
.with_null(writer_options.null_value.clone())
.with_double_quote(writer_options.double_quote))
.with_double_quote(writer_options.double_quote)
.with_quote_style(quote_style)
.with_ignore_leading_whitespace(writer_options.ignore_leading_whitespace)
.with_ignore_trailing_whitespace(writer_options.ignore_trailing_whitespace))
}
Loading