Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/probe_data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,14 @@ def get_remote_file_date(url: str) -> datetime | None:
result = parsedate_to_datetime(last_modified)
if isinstance(result, datetime):
return result
except (ValueError, TypeError):
pass
except Exception:
pass
except (ValueError, TypeError) as e:
# Invalid date format - log and continue
import logging
logging.debug(f"Could not parse Last-Modified header '{last_modified}': {e}")
except Exception as e:
# Network or other error - log and continue
import logging
logging.debug(f"Error getting remote file date from {url}: {e}")
return None


Expand Down Expand Up @@ -417,7 +421,8 @@ def main():
updates_available = []

for name, config in DATA_SOURCES.items():
assert isinstance(config, dict)
if not isinstance(config, dict):
raise TypeError(f"Config for {name} must be a dict, got {type(config)}")
is_available, message, metadata = probe_data_source(name, config)
results[name] = {"available": is_available, "message": message, "metadata": metadata}

Expand Down
4 changes: 2 additions & 2 deletions src/process_updated_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import logging
import subprocess
import subprocess # nosec B404 - subprocess is used safely with list arguments, not shell=True
import sys
from pathlib import Path

Expand Down Expand Up @@ -67,7 +67,7 @@ def process_shapefile_with_centroids(
output_suffix,
]

result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path.cwd())
result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path.cwd()) # nosec B603

if result.returncode == 0:
logging.info(f"Successfully processed {shapefile}")
Expand Down
6 changes: 4 additions & 2 deletions src/update_data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,8 @@ def update_all_sources(force: bool = False) -> dict[str, bool]:
source_metadata = metadata.get(name, {})

if force or source_metadata.get("update_available", False):
assert isinstance(config, dict)
if not isinstance(config, dict):
raise TypeError(f"Config for {name} must be a dict, got {type(config)}")
success = update_data_source(name, config)
results[name] = success
else:
Expand Down Expand Up @@ -317,7 +318,8 @@ def main():
# Update single source
if args.source in DATA_SOURCES:
config = DATA_SOURCES[args.source]
assert isinstance(config, dict)
if not isinstance(config, dict):
raise TypeError(f"Config for {args.source} must be a dict, got {type(config)}")
success = update_data_source(args.source, config)
sys.exit(0 if success else 1)
else:
Expand Down
41 changes: 11 additions & 30 deletions src/validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,7 @@ def compare_schemas(old_schema: dict, new_schema: dict) -> dict[str, Any]:
old_type = old_dtypes.get(col)
new_type = new_dtypes.get(col)
if old_type and new_type and old_type != new_type:
type_changes = changes["type_changes"]
assert isinstance(type_changes, list)
type_changes.append({"column": col, "old_type": old_type, "new_type": new_type})
changes["type_changes"].append({"column": col, "old_type": old_type, "new_type": new_type})

# Check row count change
old_count = old_schema.get("row_count", 0)
Expand Down Expand Up @@ -195,9 +193,7 @@ def validate_data_quality(
if col != "geometry":
missing = gdf[col].isna().sum()
if missing > 0:
missing_values = quality_metrics["missing_values"]
assert isinstance(missing_values, dict)
missing_values[col] = {
quality_metrics["missing_values"][col] = {
"count": int(missing),
"percent": (missing / len(gdf)) * 100,
}
Expand All @@ -219,9 +215,7 @@ def validate_data_quality(
upper_bound = q3 + 1.5 * iqr
outliers = ((values < lower_bound) | (values > upper_bound)).sum()
if outliers > 0:
outliers_dict = quality_metrics["outliers"]
assert isinstance(outliers_dict, dict)
outliers_dict[col] = {
quality_metrics["outliers"][col] = {
"count": int(outliers),
"percent": (outliers / len(values)) * 100,
}
Expand All @@ -230,9 +224,7 @@ def validate_data_quality(
for col in df.columns:
missing = df[col].isna().sum()
if missing > 0:
missing_values = quality_metrics["missing_values"]
assert isinstance(missing_values, dict)
missing_values[col] = {
quality_metrics["missing_values"][col] = {
"count": int(missing),
"percent": (missing / len(df)) * 100,
}
Expand All @@ -252,9 +244,7 @@ def validate_data_quality(
upper_bound = q3 + 1.5 * iqr
outliers = ((values < lower_bound) | (values > upper_bound)).sum()
if outliers > 0:
outliers_dict = quality_metrics["outliers"]
assert isinstance(outliers_dict, dict)
outliers_dict[col] = {
quality_metrics["outliers"][col] = {
"count": int(outliers),
"percent": (outliers / len(values)) * 100,
}
Expand Down Expand Up @@ -317,9 +307,7 @@ def validate_data_file(file_path: Path, source_name: str | None = None) -> dict[

if not current_schema:
validation_result["valid"] = False
errors = validation_result["errors"]
assert isinstance(errors, list)
errors.append("Could not read schema from file")
validation_result["errors"].append("Could not read schema from file")
return validation_result

# Check for schema changes
Expand All @@ -338,22 +326,17 @@ def validate_data_file(file_path: Path, source_name: str | None = None) -> dict[
validation_result["schema_changes"] = schema_changes

# Check for breaking changes
warnings = validation_result["warnings"]
assert isinstance(warnings, list)
if schema_changes["removed_columns"]:
removed_cols = schema_changes["removed_columns"]
assert isinstance(removed_cols, list)
warnings.append(f"Removed columns: {', '.join(removed_cols)}")
validation_result["warnings"].append(f"Removed columns: {', '.join(removed_cols)}")

if schema_changes["type_changes"]:
type_changes = schema_changes["type_changes"]
assert isinstance(type_changes, list)
warnings.append(f"Type changes: {len(type_changes)} columns")
validation_result["warnings"].append(f"Type changes: {len(type_changes)} columns")

if schema_changes["crs_change"]:
crs_change = schema_changes["crs_change"]
assert isinstance(crs_change, dict)
warnings.append(f"CRS changed: {crs_change['old']} -> {crs_change['new']}")
validation_result["warnings"].append(f"CRS changed: {crs_change['old']} -> {crs_change['new']}")

# Save current schema version
if source_name not in schema_versions:
Expand All @@ -371,13 +354,11 @@ def validate_data_file(file_path: Path, source_name: str | None = None) -> dict[
validation_result["quality_metrics"] = quality_metrics

# Check for quality issues
warnings = validation_result["warnings"]
assert isinstance(warnings, list)
if quality_metrics["invalid_geometries"] > 0:
warnings.append(f"{quality_metrics['invalid_geometries']} invalid geometries")
validation_result["warnings"].append(f"{quality_metrics['invalid_geometries']} invalid geometries")

if quality_metrics["empty_geometries"] > 0:
warnings.append(f"{quality_metrics['empty_geometries']} empty geometries")
validation_result["warnings"].append(f"{quality_metrics['empty_geometries']} empty geometries")

high_missing = {
col: metrics
Expand Down
Loading