diff --git a/apps/api/src/trigger/policies/update-policy.ts b/apps/api/src/trigger/policies/update-policy.ts index 2187256d1c..0ddb3e5b32 100644 --- a/apps/api/src/trigger/policies/update-policy.ts +++ b/apps/api/src/trigger/policies/update-policy.ts @@ -2,7 +2,7 @@ import { logger, metadata, queue, schemaTask } from '@trigger.dev/sdk'; import { z } from 'zod'; import { processPolicyUpdate } from './update-policy-helpers'; -export const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 }); +const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 }); export const updatePolicy = schemaTask({ id: 'update-policy', diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent.ts index 7b1cc6e2d3..5605f920a7 100644 --- a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent.ts +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent.ts @@ -2,6 +2,7 @@ import Firecrawl from '@mendable/firecrawl-js'; import { logger } from '@trigger.dev/sdk'; import { vendorRiskAssessmentAgentSchema } from './agent-schema'; import type { VendorRiskAssessmentDataV1 } from './agent-types'; +import { extractVendorDomain, validateVendorUrl } from './url-validation'; function normalizeUrl(url: string | null | undefined): string | null { if (!url) return null; @@ -58,19 +59,30 @@ export async function firecrawlAgentVendorRiskAssessment(params: { const firecrawlClient = new Firecrawl({ apiKey }); + // Extract vendor domain for URL validation + const vendorDomain = extractVendorDomain(vendorWebsite); + if (!vendorDomain) { + logger.warn('Could not extract vendor domain for URL validation', { + vendorWebsite, + }); + return null; + } + const prompt = `Complete cyber security research on the vendor "${vendorName}" with website ${vendorWebsite}. +CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field rather than a URL from another website. + Extract the following information: 1. **Certifications**: Find any security certifications they have (SOC 2 Type I, SOC 2 Type II, ISO 27001 etc). For each certification found, determine: - The type of certification - Whether it's verified/current, expired, or not certified - Any issue or expiry dates mentioned - - Link to the compliance/trust page or report if available + - Link to the compliance/trust page or report if available (must be on ${vendorDomain}) -2. **Legal & Security Documents**: Find the direct URLs to: +2. **Legal & Security Documents**: Find the direct URLs on ${vendorDomain} to: - Privacy Policy page (usually at /privacy, /privacy-policy, or linked in the footer) - Terms of Service page (usually at /terms, /tos, /terms-of-service, or linked in the footer) - - Trust Center or Security page (typically could be at /trust, /security or trust.website.com or security.website.com) + - Trust Center or Security page (typically could be at /trust, /security or trust.${vendorDomain} or security.${vendorDomain}) 3. **Recent News**: Find recent news articles (last 12 months) about the company, especially: - Security incidents or data breaches @@ -81,13 +93,26 @@ Extract the following information: 4. **Summary**: Provide an overall assessment of the vendor's security posture. -Focus on their official website (especially trust/security/compliance pages), press releases, and reputable news sources.`; +Focus on their official website ${vendorWebsite} (especially trust/security/compliance pages), press releases, and reputable news sources.`; + + // Provide seed URLs covering common legal/security paths so the agent + // stays on the vendor's domain instead of wandering to unrelated sites. + const seedUrls = [ + origin, + `${origin}/privacy`, + `${origin}/privacy-policy`, + `${origin}/terms`, + `${origin}/terms-of-service`, + `${origin}/security`, + `${origin}/trust`, + `${origin}/legal`, + `${origin}/compliance`, + ]; - // Using SDK (no maxCredits override, no explicit polling here) - // Important: avoid crawling huge sites with a wildcard (e.g. workspace.google.com). const agentResponse = await firecrawlClient.agent({ prompt, - urls: [origin], + urls: seedUrls, + strictConstrainToURLs: false, // allow following links from seed URLs, but seeds anchor it to the right domain schema: { type: 'object', properties: { @@ -173,7 +198,10 @@ Focus on their official website (especially trust/security/compliance pages), pr }); const normalizedLinks = linkPairs - .map((l) => ({ ...l, url: normalizeUrl(l.url) })) + .map((l) => ({ + ...l, + url: validateVendorUrl(l.url, vendorDomain, l.label), + })) .filter((l): l is { label: string; url: string } => Boolean(l.url)); const certifications = @@ -182,7 +210,7 @@ Focus on their official website (especially trust/security/compliance pages), pr status: c.status ?? 'unknown', issuedAt: normalizeIso(c.issued_at ?? null), expiresAt: normalizeIso(c.expires_at ?? null), - url: normalizeUrl(c.url ?? null), + url: validateVendorUrl(c.url ?? null, vendorDomain, `cert:${c.type}`), })) ?? []; const news = diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts index 4ce68c7c5b..a4a1b46ab1 100644 --- a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts @@ -1,5 +1,6 @@ import { logger } from '@trigger.dev/sdk'; import { firecrawlVendorDataSchema, type FirecrawlVendorData } from './schema'; +import { extractVendorDomain, isUrlFromVendorDomain } from './url-validation'; type FirecrawlStartResponse = { success: boolean; @@ -48,6 +49,14 @@ function normalizeUrl(url: string | null | undefined): string | null { export async function firecrawlExtractVendorData( website: string, ): Promise { + // Extract vendor domain for URL validation + const vendorDomain = extractVendorDomain(website); + if (!vendorDomain) { + logger.warn('Could not extract vendor domain for URL validation', { + website, + }); + return null; + } const apiKey = process.env.FIRECRAWL_API_KEY; if (!apiKey) { logger.warn( @@ -77,9 +86,11 @@ export async function firecrawlExtractVendorData( Goal: return the MOST SPECIFIC, DIRECT URL for each document type below. Do not return general category pages. -You may crawl the site (including subdomains) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA". +CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field. -Return ONLY absolute https URLs. If you cannot find a dedicated page that matches the definition, return an empty string. +You may crawl the site (including subdomains of ${vendorDomain}) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA". + +Return ONLY absolute https URLs on ${vendorDomain}. If you cannot find a dedicated page that matches the definition, return an empty string. DEFINITIONS (be strict): 1) trust_portal_url: @@ -193,13 +204,46 @@ When multiple candidates exist, choose the most direct URL that best matches the return null; } + // Normalize URLs and filter out any that don't belong to the vendor's domain + const validateVendorUrl = ( + url: string | null | undefined, + label: string, + ): string | null => { + const normalized = normalizeUrl(url); + if (!normalized) return null; + if (!isUrlFromVendorDomain(normalized, vendorDomain)) { + logger.warn('Filtered out URL from wrong domain', { + vendorDomain, + label, + url: normalized, + }); + return null; + } + return normalized; + }; + const normalized = { ...parsed.data, - privacy_policy_url: normalizeUrl(parsed.data.privacy_policy_url), - terms_of_service_url: normalizeUrl(parsed.data.terms_of_service_url), - security_overview_url: normalizeUrl(parsed.data.security_overview_url), - trust_portal_url: normalizeUrl(parsed.data.trust_portal_url), - soc2_report_url: normalizeUrl(parsed.data.soc2_report_url), + privacy_policy_url: validateVendorUrl( + parsed.data.privacy_policy_url, + 'privacy_policy', + ), + terms_of_service_url: validateVendorUrl( + parsed.data.terms_of_service_url, + 'terms_of_service', + ), + security_overview_url: validateVendorUrl( + parsed.data.security_overview_url, + 'security_overview', + ), + trust_portal_url: validateVendorUrl( + parsed.data.trust_portal_url, + 'trust_portal', + ), + soc2_report_url: validateVendorUrl( + parsed.data.soc2_report_url, + 'soc2_report', + ), }; logger.info('Firecrawl extraction completed', { diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.spec.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.spec.ts new file mode 100644 index 0000000000..129901c633 --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.spec.ts @@ -0,0 +1,119 @@ +import { + isUrlFromVendorDomain, + extractVendorDomain, + validateVendorUrl, +} from './url-validation'; + +// Mock the logger so tests don't need @trigger.dev/sdk +jest.mock('@trigger.dev/sdk', () => ({ + logger: { warn: jest.fn(), info: jest.fn(), debug: jest.fn() }, +})); + +describe('isUrlFromVendorDomain', () => { + it('accepts exact domain match', () => { + expect(isUrlFromVendorDomain('https://wix.com/privacy', 'wix.com')).toBe( + true, + ); + }); + + it('accepts www subdomain', () => { + expect( + isUrlFromVendorDomain('https://www.wix.com/terms', 'wix.com'), + ).toBe(true); + }); + + it('accepts other subdomains', () => { + expect( + isUrlFromVendorDomain('https://trust.wix.com', 'wix.com'), + ).toBe(true); + expect( + isUrlFromVendorDomain('https://security.wix.com/page', 'wix.com'), + ).toBe(true); + }); + + it('rejects completely different domains', () => { + expect(isUrlFromVendorDomain('https://x.com/privacy', 'wix.com')).toBe( + false, + ); + expect( + isUrlFromVendorDomain('https://twitter.com/wix', 'wix.com'), + ).toBe(false); + }); + + it('rejects domains that end with vendor domain but are different', () => { + // "notwix.com" ends with "wix.com" as a string, but is a different domain + expect( + isUrlFromVendorDomain('https://notwix.com/privacy', 'wix.com'), + ).toBe(false); + }); + + it('is case-insensitive', () => { + expect( + isUrlFromVendorDomain('https://WWW.WIX.COM/privacy', 'wix.com'), + ).toBe(true); + expect( + isUrlFromVendorDomain('https://wix.com/privacy', 'WIX.COM'), + ).toBe(true); + }); + + it('returns false for invalid URLs', () => { + expect(isUrlFromVendorDomain('not-a-url', 'wix.com')).toBe(false); + }); +}); + +describe('extractVendorDomain', () => { + it('extracts domain from full URL', () => { + expect(extractVendorDomain('https://www.wix.com')).toBe('wix.com'); + }); + + it('strips www prefix', () => { + expect(extractVendorDomain('https://www.example.com/path')).toBe( + 'example.com', + ); + }); + + it('handles URLs without protocol', () => { + expect(extractVendorDomain('wix.com')).toBe('wix.com'); + expect(extractVendorDomain('www.wix.com')).toBe('wix.com'); + }); + + it('returns null for invalid input', () => { + expect(extractVendorDomain('')).toBe(null); + }); + + it('preserves subdomains other than www', () => { + expect(extractVendorDomain('https://trust.wix.com')).toBe('trust.wix.com'); + }); +}); + +describe('validateVendorUrl', () => { + it('returns normalized URL for valid vendor URLs', () => { + expect(validateVendorUrl('https://wix.com/privacy', 'wix.com', 'privacy')).toBe( + 'https://wix.com/privacy', + ); + }); + + it('returns null for URLs from wrong domain', () => { + expect( + validateVendorUrl('https://x.com/privacy', 'wix.com', 'privacy'), + ).toBe(null); + }); + + it('returns null for empty/null input', () => { + expect(validateVendorUrl(null, 'wix.com', 'test')).toBe(null); + expect(validateVendorUrl(undefined, 'wix.com', 'test')).toBe(null); + expect(validateVendorUrl('', 'wix.com', 'test')).toBe(null); + }); + + it('normalizes bare domains by adding https', () => { + expect(validateVendorUrl('wix.com/terms', 'wix.com', 'terms')).toBe( + 'https://wix.com/terms', + ); + }); + + it('accepts subdomain URLs', () => { + expect( + validateVendorUrl('https://trust.wix.com', 'wix.com', 'trust'), + ).toBe('https://trust.wix.com/'); + }); +}); diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.ts new file mode 100644 index 0000000000..c49f2224ae --- /dev/null +++ b/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.ts @@ -0,0 +1,76 @@ +import { logger } from '@trigger.dev/sdk'; + +/** + * Checks whether a URL belongs to the given vendor domain (including subdomains). + * For example, if vendorDomain is "wix.com", accepts "wix.com", "www.wix.com", + * "trust.wix.com", but rejects "x.com" or "notwix.com". + */ +export function isUrlFromVendorDomain( + url: string, + vendorDomain: string, +): boolean { + try { + const hostname = new URL(url).hostname.toLowerCase(); + const domain = vendorDomain.toLowerCase(); + // Exact match or subdomain match (e.g., trust.wix.com for wix.com) + return hostname === domain || hostname.endsWith(`.${domain}`); + } catch { + return false; + } +} + +/** + * Extracts the vendor domain from a website URL, stripping www. prefix. + * Returns null if the URL is invalid. + */ +export function extractVendorDomain( + website: string, +): string | null { + try { + const urlObj = new URL( + /^https?:\/\//i.test(website) ? website : `https://${website}`, + ); + return urlObj.hostname.toLowerCase().replace(/^www\./, ''); + } catch { + return null; + } +} + +/** + * Validates and filters a URL, ensuring it belongs to the vendor domain. + * Returns null (with a warning log) if the URL is from a different domain. + */ +export function validateVendorUrl( + url: string | null | undefined, + vendorDomain: string, + label: string, +): string | null { + if (!url) return null; + const trimmed = url.trim(); + if (!trimmed) return null; + + // Normalize: add https if looks like a bare domain + const looksLikeDomain = + !/^https?:\/\//i.test(trimmed) && + /^[a-z0-9.-]+\.[a-z]{2,}([/].*)?$/i.test(trimmed); + const candidate = looksLikeDomain ? `https://${trimmed}` : trimmed; + + try { + const u = new URL(candidate); + if (!['http:', 'https:'].includes(u.protocol)) return null; + const normalized = u.toString(); + + if (!isUrlFromVendorDomain(normalized, vendorDomain)) { + logger.warn('Filtered out URL from wrong domain', { + vendorDomain, + label, + url: normalized, + }); + return null; + } + + return normalized; + } catch { + return null; + } +}