Skip to content

Commit 9a60adc

Browse files
committed
feat: add topic dedup check to ingest cron
- Add isTopicAlreadyCovered() with GROQ title match + topic overlap
- Add isSlugTaken() for slug collision detection
- Walk ranked trends list, skip already-covered topics
- Add dedupWindowDays field to contentConfig (default 90, 0 to disable)
- Pass single selectedTrend to buildPrompt and createSanityDocuments
- Graceful degradation: dedup failures don't block pipeline
1 parent 9811a6b commit 9a60adc

File tree

2 files changed

+197
-11
lines changed

2 files changed

+197
-11
lines changed

app/api/cron/ingest/route.ts

Lines changed: 189 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,140 @@ const FALLBACK_TRENDS: TrendResult[] = [
108108
},
109109
];
110110

111+
// ---------------------------------------------------------------------------
112+
// Common stop words stripped when extracting search terms for dedup
113+
// ---------------------------------------------------------------------------
114+
115+
const STOP_WORDS = new Set([
  "the", "a", "an", "is", "of", "in", "for", "and", "to", "how", "why",
  "what", "with", "new", "your", "are", "was", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "must", "shall", "can", "need", "dare",
  "ought", "used", "every", "all", "both", "few", "more", "most",
  "other", "some", "such", "no", "nor", "not", "only", "own", "same",
  "so", "than", "too", "very", "just", "because", "as", "until",
  "while", "about", "between", "through", "during", "before", "after",
  "above", "below", "from", "up", "down", "out", "on", "off", "over",
  "under", "again", "further", "then", "once", "that", "this", "these",
  "those", "it", "its", "we", "you", "they", "them", "their", "our",
  "my", "he", "she", "him", "her", "his", "who", "which", "when",
  "where", "there", "here",
]);

// ---------------------------------------------------------------------------
// Topic Dedup — check if a topic has already been covered recently
// ---------------------------------------------------------------------------

/**
 * Extract up to 3 meaningful search terms from a topic title.
 *
 * The title is lowercased, every character other than `a-z`, `0-9`,
 * whitespace, `.` and `-` is replaced by a space, and the result is split on
 * whitespace. Each token then has leading/trailing `.` and `-` trimmed, so
 * interior punctuation survives ("next.js", "ai-powered") while sentence
 * punctuation does not ("is." -> "is", "---" -> ""). Tokens of 2 characters
 * or fewer, stop words and repeats are dropped.
 *
 * @param title Free-text topic title.
 * @returns Zero to three distinct lowercase terms, in title order.
 */
function extractSearchTerms(title: string): string[] {
  // First terms tend to be the most specific, so keep only the leading few.
  const maxTerms = 3;
  const terms: string[] = [];
  const seen = new Set<string>();

  const tokens = title
    .toLowerCase()
    .replace(/[^a-z0-9\s.-]/g, " ")
    .split(/\s+/);

  for (const token of tokens) {
    // Trim edge punctuation BEFORE the length / stop-word checks; otherwise
    // "is." (3 chars, not in STOP_WORDS) would be accepted as a search term.
    const term = token.replace(/^[.-]+|[.-]+$/g, "");
    if (term.length <= 2 || STOP_WORDS.has(term) || seen.has(term)) {
      continue;
    }

    seen.add(term);
    terms.push(term);
    if (terms.length === maxTerms) {
      break;
    }
  }

  return terms;
}
149+
150+
/**
 * Check whether a topic has already been covered within the configured dedup
 * window. The window comes from `dedupWindowDays` in `content_config`
 * (fallback 90); a value <= 0 disables the check.
 *
 * Two lookups run in order and the first hit wins:
 *  1. Title match — `contentIdea` and `automatedVideo` documents created after
 *     the cutoff whose `title` matches the prefix patterns derived from
 *     `extractSearchTerms(topic)`.
 *  2. Tag overlap — `contentIdea` documents created after the cutoff whose
 *     `topics` share at least two entries with `topics` (case-insensitive).
 *
 * Dedup is best-effort: any failure (config read, date math, query) is logged
 * and reported as "not covered" so the ingest pipeline is never blocked.
 *
 * @param topic  Trend title to look for.
 * @param topics Keyword tags describing the trend, used for the overlap check.
 * @returns `true` if the topic should be skipped (already covered).
 */
async function isTopicAlreadyCovered(topic: string, topics: string[]): Promise<boolean> {
  try {
    // Read inside the try block so a config failure degrades the same way a
    // query failure does instead of aborting the cron run.
    const dedupWindowDays = await getConfigValue("content_config", "dedupWindowDays", 90);

    // Dedup disabled when window is 0
    if (dedupWindowDays <= 0) {
      return false;
    }

    const cutoff = new Date();
    cutoff.setDate(cutoff.getDate() - dedupWindowDays);
    // Throws RangeError for an invalid date (e.g. non-numeric window); caught below.
    const cutoffISO = cutoff.toISOString();

    const searchTerms = extractSearchTerms(topic);
    if (searchTerms.length === 0) {
      return false;
    }

    // Each term becomes a wildcard prefix pattern such as "react*".
    // NOTE(review): per the GROQ spec, an array on the right-hand side of
    // `match` requires EVERY pattern to match, so a title must contain all
    // of the terms to count as a hit — confirm this is the intended strictness.
    const matchPatterns = searchTerms.map((t) => `${t}*`);

    // Query 1: Title-based match on contentIdea and automatedVideo
    const titleQuery = `{
      "ideas": *[_type == "contentIdea" && _createdAt > $cutoff && title match $patterns] { _id, title, topics },
      "videos": *[_type == "automatedVideo" && _createdAt > $cutoff && title match $patterns] { _id, title }
    }`;

    const titleResults = await writeClient.fetch(titleQuery, {
      cutoff: cutoffISO,
      patterns: matchPatterns,
    });

    const ideaMatches: Array<{ _id: string; title: string; topics?: string[] }> = titleResults.ideas ?? [];
    const videoMatches: Array<{ _id: string; title: string }> = titleResults.videos ?? [];

    // If any title match is found, topic is covered
    if (ideaMatches.length > 0 || videoMatches.length > 0) {
      console.log(
        `[CRON/ingest] Dedup: title match found for "${topic}" — ${ideaMatches.length} ideas, ${videoMatches.length} videos`,
      );
      return true;
    }

    // Query 2: Check topic tag overlap on contentIdea documents.
    // A topic counts as covered when 2+ tags overlap, so the query is only
    // worth issuing when at least two distinct tags were supplied.
    const topicTags = [
      ...new Set(topics.map((t) => t.trim().toLowerCase()).filter((t) => t.length > 0)),
    ];

    if (topicTags.length >= 2) {
      // lower(@) makes the stored side case-insensitive too; the input tags
      // are already lowercased above.
      const overlapQuery = `*[_type == "contentIdea" && _createdAt > $cutoff && count((topics[])[lower(@) in $topicTags]) >= 2] { _id, title, topics }`;

      const overlapResults = await writeClient.fetch(overlapQuery, {
        cutoff: cutoffISO,
        topicTags,
      });

      if (overlapResults.length > 0) {
        console.log(
          `[CRON/ingest] Dedup: topic overlap found for "${topic}" — ${overlapResults.length} matching ideas`,
        );
        return true;
      }
    }

    return false;
  } catch (err) {
    // If the dedup check fails, don't block the pipeline — log and continue
    console.warn("[CRON/ingest] Dedup query failed, allowing topic:", err);
    return false;
  }
}
228+
229+
/**
 * Report whether any `automatedVideo` document already uses the given slug.
 *
 * The lookup is best-effort: if the query fails, the failure is logged and
 * the slug is reported as free so the caller can proceed.
 *
 * @param slug Slug value compared against `slug.current`.
 * @returns `true` when at least one matching document exists.
 */
async function isSlugTaken(slug: string): Promise<boolean> {
  // The [0..0] slice caps the result at a single document; only existence matters.
  const query = `*[_type == "automatedVideo" && slug.current == $slug][0..0] { _id }`;
  let taken = false;

  try {
    const matches = await writeClient.fetch(query, { slug });
    taken = matches.length > 0;
  } catch (err) {
    console.warn("[CRON/ingest] Slug check failed, allowing:", err);
  }

  return taken;
}
244+
111245
// ---------------------------------------------------------------------------
112246
// Gemini Script Generation
113247
// ---------------------------------------------------------------------------
@@ -332,7 +466,7 @@ Respond with ONLY the JSON object.`,
332466
async function createSanityDocuments(
333467
script: GeneratedScript,
334468
criticResult: CriticResult,
335-
trends: TrendResult[],
469+
selectedTrend: TrendResult,
336470
qualityThreshold: number,
337471
research?: ResearchPayload,
338472
researchMeta?: { notebookId: string; taskId: string },
@@ -374,8 +508,8 @@ async function createSanityDocuments(
374508
...(isFlagged && {
375509
flaggedReason: `Quality score ${criticResult.score}/100. Issues: ${criticResult.issues.join("; ") || "Low quality score"}`,
376510
}),
377-
trendScore: trends[0]?.score,
378-
trendSources: trends[0]?.signals.map(s => s.source).join(", "),
511+
trendScore: selectedTrend.score,
512+
trendSources: selectedTrend.signals.map(s => s.source).join(", "),
379513
researchNotebookId: researchMeta?.notebookId ?? research?.notebookId,
380514
...(researchMeta?.taskId && { researchTaskId: researchMeta.taskId }),
381515
});
@@ -442,23 +576,66 @@ export async function GET(request: NextRequest) {
442576
trends = FALLBACK_TRENDS;
443577
}
444578

445-
// Step 2: Optional deep research on top topic (fire-and-forget)
579+
// Step 1.5: Dedup — walk trends list, skip already-covered topics
580+
console.log("[CRON/ingest] Dedup: checking trends for already-covered topics...");
581+
let selectedTrend: TrendResult | undefined;
582+
let skippedCount = 0;
583+
584+
for (const trend of trends) {
585+
// Extract keyword-style topics from the trend title for tag overlap check
586+
const topicKeywords = extractSearchTerms(trend.topic);
587+
588+
const covered = await isTopicAlreadyCovered(trend.topic, topicKeywords);
589+
if (covered) {
590+
console.log(`[CRON/ingest] Dedup: skipping "${trend.topic}" (score: ${trend.score}) — already covered`);
591+
skippedCount++;
592+
continue;
593+
}
594+
595+
// Also check for slug collision
596+
if (trend.slug) {
597+
const slugTaken = await isSlugTaken(trend.slug);
598+
if (slugTaken) {
599+
console.log(`[CRON/ingest] Dedup: skipping "${trend.topic}" — slug "${trend.slug}" already exists`);
600+
skippedCount++;
601+
continue;
602+
}
603+
}
604+
605+
selectedTrend = trend;
606+
break;
607+
}
608+
609+
if (!selectedTrend) {
610+
console.log(`[CRON/ingest] Dedup: all ${trends.length} trending topics already covered. Skipping ingestion.`);
611+
return Response.json({
612+
success: true,
613+
skipped: true,
614+
message: "All trending topics already covered",
615+
trendCount: trends.length,
616+
skippedCount,
617+
});
618+
}
619+
620+
console.log(`[CRON/ingest] Dedup: selected "${selectedTrend.topic}" (score: ${selectedTrend.score}, skipped ${skippedCount} topics)`);
621+
622+
// Step 2: Optional deep research on selected topic (fire-and-forget)
446623
// When research is enabled, we create a notebook and start research
447624
// but DON'T wait for it — the check-research cron will poll and enrich later
448625
let researchMeta: { notebookId: string; taskId: string } | undefined;
449626
if (enableNotebookLmResearch) {
450-
console.log(`[CRON/ingest] Starting fire-and-forget research on: "${trends[0].topic}"...`);
627+
console.log(`[CRON/ingest] Starting fire-and-forget research on: "${selectedTrend.topic}"...`);
451628
try {
452629
const auth = await initAuth();
453630
const nbClient = new NotebookLMClient(auth);
454631

455632
// Create notebook
456-
const notebook = await nbClient.createNotebook(trends[0].topic);
633+
const notebook = await nbClient.createNotebook(selectedTrend.topic);
457634
const notebookId = notebook.id;
458635
console.log(`[CRON/ingest] Created notebook: ${notebookId}`);
459636

460637
// Add source URLs from trend signals
461-
const sourceUrls = (trends[0].signals ?? [])
638+
const sourceUrls = (selectedTrend.signals ?? [])
462639
.map((s: { url?: string }) => s.url)
463640
.filter((u): u is string => !!u && u.startsWith("http"))
464641
.slice(0, 5);
@@ -471,7 +648,7 @@ export async function GET(request: NextRequest) {
471648
console.log(`[CRON/ingest] Added ${sourceUrls.length} source URLs to notebook`);
472649

473650
// Start deep research (fire-and-forget — don't poll!)
474-
const researchTask = await nbClient.startResearch(notebookId, trends[0].topic, "deep");
651+
const researchTask = await nbClient.startResearch(notebookId, selectedTrend.topic, "deep");
475652
const researchTaskId = researchTask?.taskId ?? "";
476653
console.log(`[CRON/ingest] Research started — taskId: ${researchTaskId}. check-research cron will poll.`);
477654

@@ -484,7 +661,7 @@ export async function GET(request: NextRequest) {
484661
// Step 3: Generate script with Gemini (basic — without research data)
485662
// When research is enabled, check-research will re-generate an enriched script later
486663
console.log("[CRON/ingest] Generating script with Gemini...");
487-
const prompt = buildPrompt(trends);
664+
const prompt = buildPrompt([selectedTrend]);
488665
const rawResponse = await generateWithGemini(prompt, SYSTEM_INSTRUCTION);
489666

490667
let script: GeneratedScript;
@@ -515,7 +692,7 @@ export async function GET(request: NextRequest) {
515692
);
516693

517694
console.log("[CRON/ingest] Creating Sanity documents...");
518-
const result = await createSanityDocuments(script, criticResult, trends, qualityThreshold, undefined, researchMeta);
695+
const result = await createSanityDocuments(script, criticResult, selectedTrend, qualityThreshold, undefined, researchMeta);
519696

520697
console.log("[CRON/ingest] Done!", result);
521698

@@ -525,7 +702,8 @@ export async function GET(request: NextRequest) {
525702
title: script.title,
526703
criticScore: criticResult.score,
527704
trendCount: trends.length,
528-
trendScore: trends[0]?.score,
705+
trendScore: selectedTrend.score,
706+
skippedCount,
529707
researchStarted: !!researchMeta,
530708
researchNotebookId: researchMeta?.notebookId,
531709
});

sanity/schemas/singletons/contentConfig.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,14 @@ export default defineType({
105105
description: "Maximum number of scenes per video. Keeps videos focused and within duration targets",
106106
initialValue: 5,
107107
}),
108+
defineField({
109+
name: "dedupWindowDays",
110+
title: "Dedup Window (days)",
111+
type: "number",
112+
description: "Number of days to look back when checking for duplicate topics. Topics covered within this window will be skipped during daily ingestion. Set to 0 to disable dedup.",
113+
initialValue: 90,
114+
validation: (Rule) => Rule.min(0).max(365),
115+
}),
108116
],
109117
preview: {
110118
prepare() {

0 commit comments

Comments
 (0)