Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/api/src/trigger/policies/update-policy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { logger, metadata, queue, schemaTask } from '@trigger.dev/sdk';
import { z } from 'zod';
import { processPolicyUpdate } from './update-policy-helpers';

export const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 });
const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 });

export const updatePolicy = schemaTask({
id: 'update-policy',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import Firecrawl from '@mendable/firecrawl-js';
import { logger } from '@trigger.dev/sdk';
import { vendorRiskAssessmentAgentSchema } from './agent-schema';
import type { VendorRiskAssessmentDataV1 } from './agent-types';
import { extractVendorDomain, validateVendorUrl } from './url-validation';

function normalizeUrl(url: string | null | undefined): string | null {
if (!url) return null;
Expand Down Expand Up @@ -58,19 +59,30 @@ export async function firecrawlAgentVendorRiskAssessment(params: {

const firecrawlClient = new Firecrawl({ apiKey });

// Extract vendor domain for URL validation
const vendorDomain = extractVendorDomain(vendorWebsite);
if (!vendorDomain) {
logger.warn('Could not extract vendor domain for URL validation', {
vendorWebsite,
});
return null;
}

const prompt = `Complete cyber security research on the vendor "${vendorName}" with website ${vendorWebsite}.

CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field rather than a URL from another website.

Extract the following information:
1. **Certifications**: Find any security certifications they have (SOC 2 Type I, SOC 2 Type II, ISO 27001 etc). For each certification found, determine:
- The type of certification
- Whether it's verified/current, expired, or not certified
- Any issue or expiry dates mentioned
- Link to the compliance/trust page or report if available
- Link to the compliance/trust page or report if available (must be on ${vendorDomain})

2. **Legal & Security Documents**: Find the direct URLs to:
2. **Legal & Security Documents**: Find the direct URLs on ${vendorDomain} to:
- Privacy Policy page (usually at /privacy, /privacy-policy, or linked in the footer)
- Terms of Service page (usually at /terms, /tos, /terms-of-service, or linked in the footer)
- Trust Center or Security page (typically could be at /trust, /security or trust.website.com or security.website.com)
- Trust Center or Security page (typically could be at /trust, /security or trust.${vendorDomain} or security.${vendorDomain})

3. **Recent News**: Find recent news articles (last 12 months) about the company, especially:
- Security incidents or data breaches
Expand All @@ -81,13 +93,26 @@ Extract the following information:

4. **Summary**: Provide an overall assessment of the vendor's security posture.

Focus on their official website (especially trust/security/compliance pages), press releases, and reputable news sources.`;
Focus on their official website ${vendorWebsite} (especially trust/security/compliance pages), press releases, and reputable news sources.`;

// Provide seed URLs covering common legal/security paths so the agent
// stays on the vendor's domain instead of wandering to unrelated sites.
const seedUrls = [
origin,
`${origin}/privacy`,
`${origin}/privacy-policy`,
`${origin}/terms`,
`${origin}/terms-of-service`,
`${origin}/security`,
`${origin}/trust`,
`${origin}/legal`,
`${origin}/compliance`,
];

// Using SDK (no maxCredits override, no explicit polling here)
// Important: avoid crawling huge sites with a wildcard (e.g. workspace.google.com).
const agentResponse = await firecrawlClient.agent({
prompt,
urls: [origin],
urls: seedUrls,
strictConstrainToURLs: false, // allow following links from seed URLs, but seeds anchor it to the right domain
schema: {
type: 'object',
properties: {
Expand Down Expand Up @@ -173,7 +198,10 @@ Focus on their official website (especially trust/security/compliance pages), pr
});

const normalizedLinks = linkPairs
.map((l) => ({ ...l, url: normalizeUrl(l.url) }))
.map((l) => ({
...l,
url: validateVendorUrl(l.url, vendorDomain, l.label),
}))
.filter((l): l is { label: string; url: string } => Boolean(l.url));

const certifications =
Expand All @@ -182,7 +210,7 @@ Focus on their official website (especially trust/security/compliance pages), pr
status: c.status ?? 'unknown',
issuedAt: normalizeIso(c.issued_at ?? null),
expiresAt: normalizeIso(c.expires_at ?? null),
url: normalizeUrl(c.url ?? null),
url: validateVendorUrl(c.url ?? null, vendorDomain, `cert:${c.type}`),
})) ?? [];

const news =
Expand Down
58 changes: 51 additions & 7 deletions apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { logger } from '@trigger.dev/sdk';
import { firecrawlVendorDataSchema, type FirecrawlVendorData } from './schema';
import { extractVendorDomain, isUrlFromVendorDomain } from './url-validation';

type FirecrawlStartResponse = {
success: boolean;
Expand Down Expand Up @@ -48,6 +49,14 @@ function normalizeUrl(url: string | null | undefined): string | null {
export async function firecrawlExtractVendorData(
website: string,
): Promise<FirecrawlVendorData | null> {
// Extract vendor domain for URL validation
const vendorDomain = extractVendorDomain(website);
if (!vendorDomain) {
logger.warn('Could not extract vendor domain for URL validation', {
website,
});
return null;
}
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
logger.warn(
Expand Down Expand Up @@ -77,9 +86,11 @@ export async function firecrawlExtractVendorData(

Goal: return the MOST SPECIFIC, DIRECT URL for each document type below. Do not return general category pages.

You may crawl the site (including subdomains) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA".
CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field.

Return ONLY absolute https URLs. If you cannot find a dedicated page that matches the definition, return an empty string.
You may crawl the site (including subdomains of ${vendorDomain}) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA".

Return ONLY absolute https URLs on ${vendorDomain}. If you cannot find a dedicated page that matches the definition, return an empty string.

DEFINITIONS (be strict):
1) trust_portal_url:
Expand Down Expand Up @@ -193,13 +204,46 @@ When multiple candidates exist, choose the most direct URL that best matches the
return null;
}

// Per-field URL gate: run the value through normalizeUrl, then keep it only
// when its host is the vendor's domain (or one of its subdomains). Anything
// else is logged with its field label and replaced by null.
const validateVendorUrl = (
  url: string | null | undefined,
  label: string,
): string | null => {
  const cleanedUrl = normalizeUrl(url);
  if (!cleanedUrl) return null;

  if (isUrlFromVendorDomain(cleanedUrl, vendorDomain)) {
    return cleanedUrl;
  }

  // Off-domain result: record which field was dropped and why.
  logger.warn('Filtered out URL from wrong domain', {
    vendorDomain,
    label,
    url: cleanedUrl,
  });
  return null;
};

const normalized = {
...parsed.data,
privacy_policy_url: normalizeUrl(parsed.data.privacy_policy_url),
terms_of_service_url: normalizeUrl(parsed.data.terms_of_service_url),
security_overview_url: normalizeUrl(parsed.data.security_overview_url),
trust_portal_url: normalizeUrl(parsed.data.trust_portal_url),
soc2_report_url: normalizeUrl(parsed.data.soc2_report_url),
privacy_policy_url: validateVendorUrl(
parsed.data.privacy_policy_url,
'privacy_policy',
),
terms_of_service_url: validateVendorUrl(
parsed.data.terms_of_service_url,
'terms_of_service',
),
security_overview_url: validateVendorUrl(
parsed.data.security_overview_url,
'security_overview',
),
trust_portal_url: validateVendorUrl(
parsed.data.trust_portal_url,
'trust_portal',
),
soc2_report_url: validateVendorUrl(
parsed.data.soc2_report_url,
'soc2_report',
),
};

logger.info('Firecrawl extraction completed', {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import {
isUrlFromVendorDomain,
extractVendorDomain,
validateVendorUrl,
} from './url-validation';

// Replace @trigger.dev/sdk with a stub that exposes only `logger`, so this
// spec runs without the real SDK. Each logger method is a jest.fn() no-op,
// which means the warning validateVendorUrl emits for off-domain URLs is
// silently absorbed instead of being printed.
jest.mock('@trigger.dev/sdk', () => ({
logger: { warn: jest.fn(), info: jest.fn(), debug: jest.fn() },
}));

// Host-matching rules: exact domain and subdomains pass, look-alike and
// unrelated hosts fail, comparison ignores case, unparsable input is false.
describe('isUrlFromVendorDomain', () => {
  const vendor = 'wix.com';

  it('accepts exact domain match', () => {
    const matched = isUrlFromVendorDomain('https://wix.com/privacy', vendor);
    expect(matched).toBe(true);
  });

  it('accepts www subdomain', () => {
    const matched = isUrlFromVendorDomain('https://www.wix.com/terms', vendor);
    expect(matched).toBe(true);
  });

  it('accepts other subdomains', () => {
    const subdomainUrls = [
      'https://trust.wix.com',
      'https://security.wix.com/page',
    ];
    for (const candidate of subdomainUrls) {
      expect(isUrlFromVendorDomain(candidate, vendor)).toBe(true);
    }
  });

  it('rejects completely different domains', () => {
    const foreignUrls = ['https://x.com/privacy', 'https://twitter.com/wix'];
    for (const candidate of foreignUrls) {
      expect(isUrlFromVendorDomain(candidate, vendor)).toBe(false);
    }
  });

  it('rejects domains that end with vendor domain but are different', () => {
    // As plain strings "notwix.com" has the suffix "wix.com", yet it is a
    // separate registrable domain and must not pass.
    const matched = isUrlFromVendorDomain('https://notwix.com/privacy', vendor);
    expect(matched).toBe(false);
  });

  it('is case-insensitive', () => {
    const upperCaseUrl = isUrlFromVendorDomain(
      'https://WWW.WIX.COM/privacy',
      'wix.com',
    );
    expect(upperCaseUrl).toBe(true);

    const upperCaseDomain = isUrlFromVendorDomain(
      'https://wix.com/privacy',
      'WIX.COM',
    );
    expect(upperCaseDomain).toBe(true);
  });

  it('returns false for invalid URLs', () => {
    const matched = isUrlFromVendorDomain('not-a-url', vendor);
    expect(matched).toBe(false);
  });
});

// Domain extraction: hostname is pulled from the website string, a leading
// "www." is dropped, other subdomains are kept, and bad input yields null.
describe('extractVendorDomain', () => {
  it('extracts domain from full URL', () => {
    const domain = extractVendorDomain('https://www.wix.com');
    expect(domain).toBe('wix.com');
  });

  it('strips www prefix', () => {
    const domain = extractVendorDomain('https://www.example.com/path');
    expect(domain).toBe('example.com');
  });

  it('handles URLs without protocol', () => {
    for (const bareInput of ['wix.com', 'www.wix.com']) {
      expect(extractVendorDomain(bareInput)).toBe('wix.com');
    }
  });

  it('returns null for invalid input', () => {
    const domain = extractVendorDomain('');
    expect(domain).toBe(null);
  });

  it('preserves subdomains other than www', () => {
    const domain = extractVendorDomain('https://trust.wix.com');
    expect(domain).toBe('trust.wix.com');
  });
});

// End-to-end filter: on-domain URLs come back normalized, everything else
// (foreign hosts, empty values) comes back as null.
describe('validateVendorUrl', () => {
  const vendor = 'wix.com';

  it('returns normalized URL for valid vendor URLs', () => {
    const result = validateVendorUrl('https://wix.com/privacy', vendor, 'privacy');
    expect(result).toBe('https://wix.com/privacy');
  });

  it('returns null for URLs from wrong domain', () => {
    const result = validateVendorUrl('https://x.com/privacy', vendor, 'privacy');
    expect(result).toBe(null);
  });

  it('returns null for empty/null input', () => {
    const emptyInputs: Array<string | null | undefined> = [null, undefined, ''];
    for (const emptyInput of emptyInputs) {
      expect(validateVendorUrl(emptyInput, vendor, 'test')).toBe(null);
    }
  });

  it('normalizes bare domains by adding https', () => {
    const result = validateVendorUrl('wix.com/terms', vendor, 'terms');
    expect(result).toBe('https://wix.com/terms');
  });

  it('accepts subdomain URLs', () => {
    // URL serialization appends the root path, hence the trailing slash.
    const result = validateVendorUrl('https://trust.wix.com', vendor, 'trust');
    expect(result).toBe('https://trust.wix.com/');
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import { logger } from '@trigger.dev/sdk';

/**
 * Checks whether a URL belongs to the given vendor domain (including subdomains).
 * For example, if vendorDomain is "wix.com", accepts "wix.com", "www.wix.com",
 * "trust.wix.com", but rejects "x.com" or "notwix.com".
 *
 * Both the URL's hostname and `vendorDomain` are compared in a canonical form:
 * surrounding whitespace removed, lowercased, and without the optional
 * trailing root dot ("wix.com." is the same host as "wix.com").
 *
 * @param url - Absolute URL to test. Anything `new URL()` cannot parse is
 *   treated as not belonging to the vendor.
 * @param vendorDomain - Bare hostname of the vendor, e.g. "wix.com".
 * @returns `true` when the URL's hostname equals `vendorDomain` or is a
 *   subdomain of it; `false` otherwise, including when `vendorDomain` is
 *   empty or the URL is invalid.
 */
export function isUrlFromVendorDomain(
  url: string,
  vendorDomain: string,
): boolean {
  const canonicalHost = (host: string): string =>
    host.trim().toLowerCase().replace(/\.$/, '');

  const domain = canonicalHost(vendorDomain);
  // An empty domain would reduce the subdomain check below to
  // `hostname.endsWith('.')`, which matches any fully-qualified hostname
  // written with a trailing dot. Nothing can belong to "no domain".
  if (!domain) return false;

  try {
    const hostname = canonicalHost(new URL(url).hostname);
    // Exact match or subdomain match (e.g., trust.wix.com for wix.com).
    // The leading dot keeps look-alikes such as notwix.com out.
    return hostname === domain || hostname.endsWith(`.${domain}`);
  } catch {
    // Unparsable URL: cannot be attributed to any domain.
    return false;
  }
}

/**
 * Extracts the vendor domain from a website URL, stripping www. prefix.
 *
 * Accepts either a full http(s) URL or a bare host ("wix.com",
 * "www.wix.com/path"); bare input is parsed as if it were prefixed with
 * "https://". Surrounding whitespace is ignored and the optional trailing
 * root dot of the hostname is removed. Subdomains other than "www" are kept.
 *
 * @param website - Vendor website as entered, with or without a scheme.
 * @returns The lowercased hostname without a leading "www.", or `null` when
 *   the input is blank, uses a scheme other than http/https, or cannot be
 *   parsed as a URL.
 */
export function extractVendorDomain(
  website: string,
): string | null {
  const trimmed = website.trim();
  if (!trimmed) return null;

  const isHttp = /^https?:\/\//i.test(trimmed);
  // An explicit non-http(s) scheme (ftp://, file://, ...) is not a website.
  // Prefixing it with "https://" would parse the scheme name itself as the
  // host (e.g. "ftp") and produce a bogus vendor domain.
  const hasOtherScheme = !isHttp && /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  if (hasOtherScheme) return null;

  try {
    const { hostname } = new URL(isHttp ? trimmed : `https://${trimmed}`);
    // Drop the root-label dot first so "www.wix.com." and "wix.com" agree.
    const host = hostname.toLowerCase().replace(/\.$/, '');
    if (!host) return null;
    return host.replace(/^www\./, '');
  } catch {
    return null;
  }
}

/**
 * Normalizes a candidate URL and keeps it only if it points at the vendor.
 *
 * Blank input, unparsable input and non-http(s) URLs yield `null` silently.
 * A well-formed URL on some other host also yields `null`, and additionally
 * emits a warning that names the field (`label`) it was dropped from.
 *
 * @param url - Raw value to check; may be missing or a bare "host/path".
 * @param vendorDomain - Bare hostname the URL must belong to.
 * @param label - Field name included in the warning log for context.
 * @returns The serialized URL when it is on `vendorDomain` or a subdomain,
 *   otherwise `null`.
 */
export function validateVendorUrl(
  url: string | null | undefined,
  vendorDomain: string,
  label: string,
): string | null {
  const input = url?.trim();
  if (!input) return null;

  // A scheme-less "host.tld[/path]" value is promoted to https.
  const hasHttpScheme = /^https?:\/\//i.test(input);
  const isBareDomain =
    !hasHttpScheme && /^[a-z0-9.-]+\.[a-z]{2,}([/].*)?$/i.test(input);

  try {
    const parsed = new URL(isBareDomain ? `https://${input}` : input);
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
      return null;
    }

    const href = parsed.toString();
    if (isUrlFromVendorDomain(href, vendorDomain)) {
      return href;
    }

    logger.warn('Filtered out URL from wrong domain', {
      vendorDomain,
      label,
      url: href,
    });
    return null;
  } catch {
    return null;
  }
}
Loading