Skip to content

Commit e3e539c

Browse files
feat: Update URL extraction to preserve case sensitivity (#2550)
Fixes bug 2491, which reported that extracted URLs were being converted to lowercase, potentially breaking URLs that contain uppercase characters. Co-authored-by: Sai Prathik R <[email protected]>
1 parent 9b6e0f0 commit e3e539c

File tree

3 files changed

+6
-5
lines changed

3 files changed

+6
-5
lines changed

mobsf/DynamicAnalyzer/views/common/shared.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,18 @@
2727
def extract_urls_domains_emails(checksum, data):
2828
"""Extract URLs, Domains and Emails."""
2929
# URL Extraction
30-
urls = re.findall(URL_REGEX, data.lower())
30+
urls = re.findall(URL_REGEX, data)
3131
if urls:
3232
urls = list(set(urls))
3333
else:
3434
urls = []
3535
# Domain Extraction and Malware Check
3636
logger.info('Performing Malware check on extracted domains')
37+
# For domain extraction, use lowercased URLs
3738
domains = MalwareDomainCheck().scan(
3839
checksum,
39-
urls)
40-
# Email Etraction Regex
40+
[u.lower() for u in urls if isinstance(u, str)])
41+
# Email Extraction Regex
4142
emails = set()
4243
for email in EMAIL_REGEX.findall(data.lower()):
4344
if email.startswith('//'):

mobsf/MobSF/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
r'file://|javascript:|data:|www\d{0,3}[.])'
5656
r'[\w().=/;,#:@?&~*+!$%\'{}-]+)'
5757
),
58-
re.UNICODE)
58+
re.UNICODE | re.IGNORECASE)
5959
EMAIL_REGEX = re.compile(r'[\w+.-]{1,20}@[\w-]{1,20}\.[\w]{2,10}')
6060
USERNAME_REGEX = re.compile(r'^\w[\w\-\@\.]{1,35}$')
6161
GOOGLE_API_KEY_REGEX = re.compile(r'AIza[0-9A-Za-z-_]{35}$')

mobsf/StaticAnalyzer/views/common/shared_func.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ def url_n_email_extract(dat, relative_path):
358358
url_n_file = []
359359
email_n_file = []
360360
# URL Extraction
361-
urllist = URL_REGEX.findall(dat.lower())
361+
urllist = URL_REGEX.findall(dat)
362362
for url in urllist:
363363
urls.add(url)
364364
if urls:

0 commit comments

Comments
 (0)