Skip to content

Commit cf7532d

Browse files
authored
Use USER_AGENT to try to bypass being blocked. (#8486)
* Use USER_AGENT to try to bypass being blocked.
* Use proxy
* Updating test
1 parent f8d24ae commit cf7532d

File tree

2 files changed

+95
-29
lines changed

2 files changed

+95
-29
lines changed

electricitymap/contrib/parsers/CNDC.py

Lines changed: 84 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
TotalProductionList,
1818
)
1919
from electricitymap.contrib.lib.types import ZoneKey
20+
from electricitymap.contrib.parsers.lib.config import use_proxy
21+
from electricitymap.contrib.parsers.lib.exceptions import ParserException
2022

2123
tz_bo = ZoneInfo("America/La_Paz")
2224

@@ -28,6 +30,9 @@
2830
INDEX_URL = "https://www.cndc.bo/gene/index.php"
2931
DATA_URL = "https://www.cndc.bo/gene/dat/gene.php?fechag={0}"
3032
SOURCE = "cndc.bo"
33+
# User-Agent to avoid being blocked as a bot
34+
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
35+
REQUEST_TIMEOUT = 30 # seconds
3136

3237

3338
def extract_xsrf_token(html):
@@ -45,27 +50,84 @@ def get_datetime(query_date: datetime, hour: int) -> datetime:
4550
)
4651

4752

53+
def _check_response(response, context: str = ""):
54+
"""Check HTTP response and raise appropriate ParserException if needed."""
55+
if response.status_code == 403:
56+
raise ParserException(
57+
"CNDC.py",
58+
f"Access forbidden (403){context}. The server may be blocking requests.",
59+
"BO",
60+
)
61+
elif response.status_code == 429:
62+
raise ParserException("CNDC.py", f"Rate limit exceeded (429){context}.", "BO")
63+
elif response.status_code >= 500:
64+
raise ParserException(
65+
"CNDC.py",
66+
f"Server error ({response.status_code}){context}. The CNDC server may be down.",
67+
"BO",
68+
)
69+
elif not response.ok:
70+
raise ParserException("CNDC.py", f"HTTP {response.status_code}{context}", "BO")
71+
72+
4873
def fetch_data(
4974
session: Session | None = None, target_datetime: datetime | None = None
5075
) -> tuple[list[dict], datetime]:
51-
if session is None:
52-
session = Session()
53-
54-
if target_datetime is None:
55-
target_datetime = datetime.now()
56-
target_datetime = target_datetime.astimezone(tz_bo)
57-
# Define actual and previous day (for midnight data).
76+
session = session or Session()
77+
target_datetime = (target_datetime or datetime.now()).astimezone(tz_bo)
5878
formatted_dt = target_datetime.strftime("%Y-%m-%d")
5979

60-
# XSRF token for the initial request
61-
xsrf_token = extract_xsrf_token(session.get(INDEX_URL).text)
62-
63-
resp = session.get(
64-
DATA_URL.format(formatted_dt), headers={"x-csrf-token": xsrf_token}
65-
)
66-
67-
hour_rows = json.loads(resp.text.replace("\ufeff", ""))["data"]
68-
return hour_rows, target_datetime
80+
# Headers to mimic a browser and avoid being blocked
81+
headers = {
82+
"User-Agent": USER_AGENT,
83+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
84+
"Accept-Language": "en-US,en;q=0.5",
85+
"Connection": "keep-alive",
86+
}
87+
88+
try:
89+
# Get XSRF token from index page
90+
index_response = session.get(
91+
INDEX_URL, headers=headers, timeout=REQUEST_TIMEOUT
92+
)
93+
_check_response(index_response)
94+
95+
try:
96+
xsrf_token = extract_xsrf_token(index_response.text)
97+
except (AttributeError, IndexError) as e:
98+
raise ParserException(
99+
"CNDC.py",
100+
"Failed to extract XSRF token. Website structure may have changed.",
101+
"BO",
102+
) from e
103+
104+
# Fetch data with XSRF token
105+
headers["x-csrf-token"] = xsrf_token
106+
data_response = session.get(
107+
DATA_URL.format(formatted_dt), headers=headers, timeout=REQUEST_TIMEOUT
108+
)
109+
_check_response(data_response, " when fetching data")
110+
111+
# Parse JSON response
112+
try:
113+
hour_rows = json.loads(data_response.text.replace("\ufeff", ""))["data"]
114+
except (json.JSONDecodeError, KeyError) as e:
115+
raise ParserException(
116+
"CNDC.py",
117+
f"Failed to parse JSON response. API format may have changed: {e}",
118+
"BO",
119+
) from e
120+
121+
return hour_rows, target_datetime
122+
123+
except ParserException:
124+
raise
125+
except Exception as e:
126+
raise ParserException(
127+
"CNDC.py",
128+
f"Unexpected error: {type(e).__name__}: {e}",
129+
"BO",
130+
) from e
69131

70132

71133
def parse_generation_forecast(
@@ -105,6 +167,9 @@ def parser_production_breakdown(
105167
if total is None or None in modes_extracted:
106168
continue
107169

170+
unknown_value = round(total - thermo - hydro - solar - wind - bagasse, 3)
171+
unknown_value = None if abs(unknown_value) < 0.05 else unknown_value
172+
108173
result.append(
109174
zoneKey=zone_key,
110175
datetime=timestamp,
@@ -115,14 +180,15 @@ def parser_production_breakdown(
115180
biomass=bagasse,
116181
gas=round(thermo * gas_oil_ratio, 3),
117182
oil=round(thermo * (1 - gas_oil_ratio), 3),
118-
unknown=round(total - thermo - hydro - solar - wind - bagasse, 3),
183+
unknown=unknown_value,
119184
),
120185
source=SOURCE,
121186
)
122187

123188
return result
124189

125190

191+
@use_proxy(country_code="BO")
126192
def fetch_production(
127193
zone_key: ZoneKey = ZoneKey("BO"),
128194
session: Session | None = None,
@@ -152,8 +218,8 @@ def fetch_generation_forecast(
152218

153219
if __name__ == "__main__":
154220
"""Main method, never used by the Electricity Map backend, but handy for testing."""
155-
print("fetch_production() ->")
156221
print(fetch_production())
222+
print("fetch_production() ->")
157223

158224
# print("fetch_generation_forecast() ->")
159225
# print(fetch_generation_forecast())

electricitymap/contrib/parsers/tests/__snapshots__/test_CNDC.ambr

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@
183183
'hydro': 384.86,
184184
'oil': 11.074,
185185
'solar': 0,
186-
'unknown': -0.0,
186+
'unknown': None,
187187
'wind': 45.39,
188188
}),
189189
'source': 'cndc.bo',
@@ -202,7 +202,7 @@
202202
'hydro': 402.78,
203203
'oil': 10.355,
204204
'solar': 0,
205-
'unknown': -0.0,
205+
'unknown': None,
206206
'wind': 47.16,
207207
}),
208208
'source': 'cndc.bo',
@@ -221,7 +221,7 @@
221221
'hydro': 406.37,
222222
'oil': 9.835,
223223
'solar': 0,
224-
'unknown': -0.0,
224+
'unknown': None,
225225
'wind': 48.76,
226226
}),
227227
'source': 'cndc.bo',
@@ -240,7 +240,7 @@
240240
'hydro': 380.18,
241241
'oil': 9.855,
242242
'solar': 0,
243-
'unknown': -0.0,
243+
'unknown': None,
244244
'wind': 47.33,
245245
}),
246246
'source': 'cndc.bo',
@@ -259,7 +259,7 @@
259259
'hydro': 379.26,
260260
'oil': 9.831,
261261
'solar': 0,
262-
'unknown': 0.0,
262+
'unknown': None,
263263
'wind': 51.32,
264264
}),
265265
'source': 'cndc.bo',
@@ -278,7 +278,7 @@
278278
'hydro': 350.42,
279279
'oil': 9.628,
280280
'solar': 0.39,
281-
'unknown': -0.0,
281+
'unknown': None,
282282
'wind': 50.26,
283283
}),
284284
'source': 'cndc.bo',
@@ -297,7 +297,7 @@
297297
'hydro': 422.66,
298298
'oil': 9.968,
299299
'solar': 13.45,
300-
'unknown': -0.0,
300+
'unknown': None,
301301
'wind': 41.49,
302302
}),
303303
'source': 'cndc.bo',
@@ -316,7 +316,7 @@
316316
'hydro': 500.35,
317317
'oil': 10.666,
318318
'solar': 47.85,
319-
'unknown': 0.0,
319+
'unknown': None,
320320
'wind': 45.95,
321321
}),
322322
'source': 'cndc.bo',
@@ -335,7 +335,7 @@
335335
'hydro': 505.62,
336336
'oil': 11.446,
337337
'solar': 90.01,
338-
'unknown': -0.0,
338+
'unknown': None,
339339
'wind': 46.62,
340340
}),
341341
'source': 'cndc.bo',
@@ -354,7 +354,7 @@
354354
'hydro': 523.88,
355355
'oil': 12.567,
356356
'solar': 113.07,
357-
'unknown': -0.0,
357+
'unknown': None,
358358
'wind': 32.52,
359359
}),
360360
'source': 'cndc.bo',
@@ -373,7 +373,7 @@
373373
'hydro': 511.72,
374374
'oil': 13.695,
375375
'solar': 124.97,
376-
'unknown': 0.0,
376+
'unknown': None,
377377
'wind': 24.76,
378378
}),
379379
'source': 'cndc.bo',

0 commit comments

Comments
 (0)