trycompai · Marfuen · Mar 18, 2026 · Mar 18, 2026
diff --git a/apps/api/src/trigger/policies/update-policy.ts b/apps/api/src/trigger/policies/update-policy.ts
@@ -2,7 +2,7 @@ import { logger, metadata, queue, schemaTask } from '@trigger.dev/sdk';
 import { z } from 'zod';
 import { processPolicyUpdate } from './update-policy-helpers';
 
-export const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 });
+const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 });
 
 export const updatePolicy = schemaTask({
   id: 'update-policy',

diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl-agent.ts
@@ -2,6 +2,7 @@ import Firecrawl from '@mendable/firecrawl-js';
 import { logger } from '@trigger.dev/sdk';
 import { vendorRiskAssessmentAgentSchema } from './agent-schema';
 import type { VendorRiskAssessmentDataV1 } from './agent-types';
+import { extractVendorDomain, validateVendorUrl } from './url-validation';
 
 function normalizeUrl(url: string | null | undefined): string | null {
   if (!url) return null;
@@ -58,19 +59,30 @@ export async function firecrawlAgentVendorRiskAssessment(params: {
 
   const firecrawlClient = new Firecrawl({ apiKey });
 
+  // Extract vendor domain for URL validation
+  const vendorDomain = extractVendorDomain(vendorWebsite);
+  if (!vendorDomain) {
+    logger.warn('Could not extract vendor domain for URL validation', {
+      vendorWebsite,
+    });
+    return null;
+  }
+
   const prompt = `Complete cyber security research on the vendor "${vendorName}" with website ${vendorWebsite}.
 
+CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field rather than a URL from another website.
+
 Extract the following information:
 1. **Certifications**: Find any security certifications they have (SOC 2 Type I, SOC 2 Type II, ISO 27001 etc). For each certification found, determine:
    - The type of certification
    - Whether it's verified/current, expired, or not certified
    - Any issue or expiry dates mentioned
-   - Link to the compliance/trust page or report if available
+   - Link to the compliance/trust page or report if available (must be on ${vendorDomain})
 
-2. **Legal & Security Documents**: Find the direct URLs to:
+2. **Legal & Security Documents**: Find the direct URLs on ${vendorDomain} to:
    - Privacy Policy page (usually at /privacy, /privacy-policy, or linked in the footer)
    - Terms of Service page (usually at /terms, /tos, /terms-of-service, or linked in the footer)
-   - Trust Center or Security page (typically could be at /trust, /security or trust.website.com or security.website.com)
+   - Trust Center or Security page (typically could be at /trust, /security or trust.${vendorDomain} or security.${vendorDomain})
 
 3. **Recent News**: Find recent news articles (last 12 months) about the company, especially:
    - Security incidents or data breaches
@@ -81,13 +93,26 @@ Extract the following information:
 
 4. **Summary**: Provide an overall assessment of the vendor's security posture.
 
-Focus on their official website (especially trust/security/compliance pages), press releases, and reputable news sources.`;
+Focus on their official website ${vendorWebsite} (especially trust/security/compliance pages), press releases, and reputable news sources.`;
+
+  // Provide seed URLs covering common legal/security paths so the agent
+  // stays on the vendor's domain instead of wandering to unrelated sites.
+  const seedUrls = [
+    origin,
+    `${origin}/privacy`,
+    `${origin}/privacy-policy`,
+    `${origin}/terms`,
+    `${origin}/terms-of-service`,
+    `${origin}/security`,
+    `${origin}/trust`,
+    `${origin}/legal`,
+    `${origin}/compliance`,
+  ];
 
-  // Using SDK (no maxCredits override, no explicit polling here)
-  // Important: avoid crawling huge sites with a wildcard (e.g. workspace.google.com).
   const agentResponse = await firecrawlClient.agent({
     prompt,
-    urls: [origin],
+    urls: seedUrls,
+    strictConstrainToURLs: false, // allow following links from seed URLs, but seeds anchor it to the right domain
     schema: {
       type: 'object',
       properties: {
@@ -173,7 +198,10 @@ Focus on their official website (especially trust/security/compliance pages), pr
     });
 
   const normalizedLinks = linkPairs
-    .map((l) => ({ ...l, url: normalizeUrl(l.url) }))
+    .map((l) => ({
+      ...l,
+      url: validateVendorUrl(l.url, vendorDomain, l.label),
+    }))
     .filter((l): l is { label: string; url: string } => Boolean(l.url));
 
   const certifications =
@@ -182,7 +210,7 @@ Focus on their official website (especially trust/security/compliance pages), pr
       status: c.status ?? 'unknown',
       issuedAt: normalizeIso(c.issued_at ?? null),
       expiresAt: normalizeIso(c.expires_at ?? null),
-      url: normalizeUrl(c.url ?? null),
+      url: validateVendorUrl(c.url ?? null, vendorDomain, `cert:${c.type}`),
     })) ?? [];
 
   const news =

diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts
@@ -1,5 +1,6 @@
 import { logger } from '@trigger.dev/sdk';
 import { firecrawlVendorDataSchema, type FirecrawlVendorData } from './schema';
+import { extractVendorDomain, isUrlFromVendorDomain } from './url-validation';
 
 type FirecrawlStartResponse = {
   success: boolean;
@@ -48,6 +49,14 @@ function normalizeUrl(url: string | null | undefined): string | null {
 export async function firecrawlExtractVendorData(
   website: string,
 ): Promise<FirecrawlVendorData | null> {
+  // Extract vendor domain for URL validation
+  const vendorDomain = extractVendorDomain(website);
+  if (!vendorDomain) {
+    logger.warn('Could not extract vendor domain for URL validation', {
+      website,
+    });
+    return null;
+  }
   const apiKey = process.env.FIRECRAWL_API_KEY;
   if (!apiKey) {
     logger.warn(
@@ -77,9 +86,11 @@ export async function firecrawlExtractVendorData(
 
 Goal: return the MOST SPECIFIC, DIRECT URL for each document type below. Do not return general category pages.
 
-You may crawl the site (including subdomains) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA".
+CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field.
 
-Return ONLY absolute https URLs. If you cannot find a dedicated page that matches the definition, return an empty string.
+You may crawl the site (including subdomains of ${vendorDomain}) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA".
+
+Return ONLY absolute https URLs on ${vendorDomain}. If you cannot find a dedicated page that matches the definition, return an empty string.
 
 DEFINITIONS (be strict):
 1) trust_portal_url:
@@ -193,13 +204,46 @@ When multiple candidates exist, choose the most direct URL that best matches the
         return null;
       }
 
+      // Normalize each extracted URL via normalizeUrl, then keep it only when
+      // isUrlFromVendorDomain accepts it for this vendor; rejected URLs are
+      // logged as warnings and replaced with null.
+      const validateVendorUrl = (
+        url: string | null | undefined,
+        label: string,
+      ): string | null => {
+        const candidate = normalizeUrl(url);
+        if (!candidate) return null;
+        if (isUrlFromVendorDomain(candidate, vendorDomain)) return candidate;
+        logger.warn('Filtered out URL from wrong domain', {
+          vendorDomain,
+          label,
+          url: candidate,
+        });
+        return null;
+      };
+
       const normalized = {
         ...parsed.data,
-        privacy_policy_url: normalizeUrl(parsed.data.privacy_policy_url),
-        terms_of_service_url: normalizeUrl(parsed.data.terms_of_service_url),
-        security_overview_url: normalizeUrl(parsed.data.security_overview_url),
-        trust_portal_url: normalizeUrl(parsed.data.trust_portal_url),
-        soc2_report_url: normalizeUrl(parsed.data.soc2_report_url),
+        privacy_policy_url: validateVendorUrl(
+          parsed.data.privacy_policy_url,
+          'privacy_policy',
+        ),
+        terms_of_service_url: validateVendorUrl(
+          parsed.data.terms_of_service_url,
+          'terms_of_service',
+        ),
+        security_overview_url: validateVendorUrl(
+          parsed.data.security_overview_url,
+          'security_overview',
+        ),
+        trust_portal_url: validateVendorUrl(
+          parsed.data.trust_portal_url,
+          'trust_portal',
+        ),
+        soc2_report_url: validateVendorUrl(
+          parsed.data.soc2_report_url,
+          'soc2_report',
+        ),
       };
 
       logger.info('Firecrawl extraction completed', {

diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.spec.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.spec.ts
@@ -0,0 +1,108 @@
+import {
+  isUrlFromVendorDomain,
+  extractVendorDomain,
+  validateVendorUrl,
+} from './url-validation';
+
+// Stub the @trigger.dev/sdk logger so these tests run without the real SDK.
+jest.mock('@trigger.dev/sdk', () => ({
+  logger: { warn: jest.fn(), info: jest.fn(), debug: jest.fn() },
+}));
+
+describe('isUrlFromVendorDomain', () => {
+  // Every case but one checks against the same vendor domain.
+  const onWix = (url: string): boolean => isUrlFromVendorDomain(url, 'wix.com');
+
+  it('accepts exact domain match', () => {
+    expect(onWix('https://wix.com/privacy')).toBe(true);
+  });
+
+  it('accepts www subdomain', () => {
+    expect(onWix('https://www.wix.com/terms')).toBe(true);
+  });
+
+  it('accepts other subdomains', () => {
+    for (const url of ['https://trust.wix.com', 'https://security.wix.com/page']) {
+      expect(onWix(url)).toBe(true);
+    }
+  });
+
+  it('rejects completely different domains', () => {
+    for (const url of ['https://x.com/privacy', 'https://twitter.com/wix']) {
+      expect(onWix(url)).toBe(false);
+    }
+  });
+
+  it('rejects domains that end with vendor domain but are different', () => {
+    // "notwix.com" has "wix.com" as a string suffix yet is an unrelated host.
+    expect(onWix('https://notwix.com/privacy')).toBe(false);
+  });
+
+  it('is case-insensitive', () => {
+    // Upper-case on either side of the comparison must still match.
+    expect(onWix('https://WWW.WIX.COM/privacy')).toBe(true);
+    expect(isUrlFromVendorDomain('https://wix.com/privacy', 'WIX.COM')).toBe(true);
+  });
+
+  it('returns false for invalid URLs', () => {
+    expect(onWix('not-a-url')).toBe(false);
+  });
+});
+
+describe('extractVendorDomain', () => {
+  it('extracts domain from full URL', () => {
+    const domain = extractVendorDomain('https://www.wix.com');
+    expect(domain).toBe('wix.com');
+  });
+
+  it('strips www prefix', () => {
+    const domain = extractVendorDomain('https://www.example.com/path');
+    expect(domain).toBe('example.com');
+  });
+
+  it('handles URLs without protocol', () => {
+    for (const website of ['wix.com', 'www.wix.com']) {
+      expect(extractVendorDomain(website)).toBe('wix.com');
+    }
+  });
+
+  it('returns null for invalid input', () => {
+    expect(extractVendorDomain('')).toBe(null);
+  });
+
+  it('preserves subdomains other than www', () => {
+    const domain = extractVendorDomain('https://trust.wix.com');
+    expect(domain).toBe('trust.wix.com');
+  });
+});
+
+describe('validateVendorUrl', () => {
+  const VENDOR = 'wix.com';
+
+  it('returns normalized URL for valid vendor URLs', () => {
+    const result = validateVendorUrl('https://wix.com/privacy', VENDOR, 'privacy');
+    expect(result).toBe('https://wix.com/privacy');
+  });
+
+  it('returns null for URLs from wrong domain', () => {
+    const result = validateVendorUrl('https://x.com/privacy', VENDOR, 'privacy');
+    expect(result).toBe(null);
+  });
+
+  it('returns null for empty/null input', () => {
+    for (const blank of [null, undefined, '']) {
+      expect(validateVendorUrl(blank, VENDOR, 'test')).toBe(null);
+    }
+  });
+
+  it('normalizes bare domains by adding https', () => {
+    const result = validateVendorUrl('wix.com/terms', VENDOR, 'terms');
+    expect(result).toBe('https://wix.com/terms');
+  });
+
+  it('accepts subdomain URLs', () => {
+    // URL serialization appends the root path, hence the trailing slash.
+    const result = validateVendorUrl('https://trust.wix.com', VENDOR, 'trust');
+    expect(result).toBe('https://trust.wix.com/');
+  });
+});
diff --git a/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.ts b/apps/api/src/trigger/vendor/vendor-risk-assessment/url-validation.ts
@@ -0,0 +1,76 @@
+import { logger } from '@trigger.dev/sdk';
+
+/**
+ * Reports whether `url` is hosted on `vendorDomain` or one of its subdomains. Hosts compare
+ * case-insensitively, ignoring a trailing root dot: "wix.com" accepts "trust.wix.com." but
+ * rejects "x.com" and "notwix.com". Returns false for bad URLs or an empty `vendorDomain`.
+ */
+export function isUrlFromVendorDomain(
+  url: string,
+  vendorDomain: string,
+): boolean {
+  try {
+    const domain = vendorDomain.trim().toLowerCase().replace(/\.$/, '');
+    const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
+    // Exact host or "<sub>.<domain>"; an empty domain must never count as a match.
+    return domain !== '' && (hostname === domain || hostname.endsWith(`.${domain}`));
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Derives the vendor's bare hostname from a website string: trims whitespace, assumes https
+ * when no http(s) scheme is present, lowercases, and strips a leading "www." plus a trailing
+ * root dot (other subdomains are kept). Returns null when no non-empty host can be parsed.
+ */
+export function extractVendorDomain(website: string): string | null {
+  try {
+    const input = website.trim();
+    const { hostname } = new URL(/^https?:\/\//i.test(input) ? input : `https://${input}`);
+    const bare = hostname.toLowerCase().replace(/^www\./, '').replace(/\.$/, '');
+    // A host that was only "www." is empty by now; report it as null rather than "".
+    return bare || null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Normalizes a candidate URL and keeps it only when it lives on the vendor's domain.
+ *
+ * @param url - Raw value to check; null, undefined and blank strings yield null.
+ * @param vendorDomain - Domain the URL's host must equal or be a subdomain of.
+ * @param label - Name of the field being checked; only used in the warning log.
+ * @returns The URL as serialized by `URL#toString()`, or null when the input is empty,
+ *   unparseable, not http(s), or on another domain (that last case logs a warning).
+ */
+export function validateVendorUrl(
+  url: string | null | undefined,
+  vendorDomain: string,
+  label: string,
+): string | null {
+  const raw = url ? url.trim() : '';
+  if (raw === '') return null;
+
+  // Scheme-less input shaped like "host.tld" or "host.tld/path" is assumed to be https.
+  const hasHttpScheme = /^https?:\/\//i.test(raw);
+  const isBareDomain = !hasHttpScheme && /^[a-z0-9.-]+\.[a-z]{2,}([/].*)?$/i.test(raw);
+
+  try {
+    const parsed = new URL(isBareDomain ? `https://${raw}` : raw);
+    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;
+
+    const href = parsed.toString();
+    if (isUrlFromVendorDomain(href, vendorDomain)) return href;
+
+    logger.warn('Filtered out URL from wrong domain', {
+      vendorDomain,
+      label,
+      url: href,
+    });
+  } catch {
+    // Anything thrown above (e.g. an unparseable URL) falls through to null.
+  }
+  return null;
+}