@@ -108,6 +108,140 @@ const FALLBACK_TRENDS: TrendResult[] = [
108108 } ,
109109] ;
110110
111+ // ---------------------------------------------------------------------------
112+ // Common stop words stripped when extracting search terms for dedup
113+ // ---------------------------------------------------------------------------
114+
/**
 * Lowercase words that are discarded when a topic title is reduced to search
 * terms. Kept as one whitespace-separated list so entries are easy to scan and
 * extend; the resulting value is a `Set<string>` for constant-time lookups.
 */
const STOP_WORDS = new Set(
  `
  the a an is of in for and to how why
  what with new your are was be been being
  have has had do does did will would could
  should may might must shall can need dare
  ought used every all both few more most
  other some such no nor not only own same
  so than too very just because as until
  while about between through during before after
  above below from up down out on off over
  under again further then once that this these
  those it its we you they them their our
  my he she him her his who which when
  where there here
  `
    .trim()
    .split(/\s+/),
);
130+
131+ // ---------------------------------------------------------------------------
132+ // Topic Dedup — check if a topic has already been covered recently
133+ // ---------------------------------------------------------------------------
134+
/**
 * Extract up to three meaningful search terms from a topic title.
 *
 * The title is lowercased, every character other than `a-z`, `0-9`,
 * whitespace, `.` and `-` is replaced by a space, and the result is split on
 * whitespace. Dots and hyphens are kept inside a token (so "node.js" and
 * "server-side" survive) but stripped from its edges, so sentence punctuation
 * such as "react." or a free-standing "---" never ends up in a term. Tokens of
 * two characters or fewer, stop words, and repeated terms are dropped.
 *
 * @param title - Raw topic title.
 * @returns Zero to three distinct lowercase terms, in order of first appearance.
 */
function extractSearchTerms(title: string): string[] {
  const tokens = title
    .toLowerCase()
    .replace(/[^a-z0-9\s.-]/g, " ")
    .split(/\s+/)
    // Trim edge punctuation only; interior dots/hyphens are part of the term.
    .map((w) => w.replace(/^[.-]+|[.-]+$/g, ""))
    .filter((w) => w.length > 2 && !STOP_WORDS.has(w));

  // Earlier words tend to be the most specific. De-duplicate first so a
  // repeated word does not use up one of the three slots.
  return [...new Set(tokens)].slice(0, 3);
}
149+
/**
 * Decide whether a trending topic has already been covered inside the
 * configured dedup window, by querying Sanity.
 *
 * Two checks run in order, both limited to documents whose `_createdAt` is
 * newer than the cutoff:
 *  1. Title match: `contentIdea` and `automatedVideo` documents whose `title`
 *     matches the prefix patterns built from the topic's search terms.
 *  2. Tag overlap: `contentIdea` documents that share at least two `topics`
 *     entries (compared case-insensitively) with the supplied tags.
 *
 * The window length is read from `content_config.dedupWindowDays` (default
 * 90). A value of 0 or less disables dedup. A failed query is logged and
 * treated as "not covered" so dedup never blocks the pipeline.
 *
 * @param topic - Topic title used to derive the title search terms.
 * @param topics - Tags/keywords used for the overlap check.
 * @returns `true` if the topic should be skipped (already covered).
 */
async function isTopicAlreadyCovered(topic: string, topics: string[]): Promise<boolean> {
  const configuredDays = await getConfigValue("content_config", "dedupWindowDays", 90);

  // Guard against a non-numeric config value: it would otherwise produce an
  // Invalid Date below and make `toISOString()` throw outside the try block.
  const dedupWindowDays = Number(configuredDays);
  if (!Number.isFinite(dedupWindowDays)) {
    console.warn("[CRON/ingest] Dedup: invalid dedupWindowDays config, allowing topic:", configuredDays);
    return false;
  }

  // Dedup disabled when window is 0 (or negative)
  if (dedupWindowDays <= 0) {
    return false;
  }

  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - dedupWindowDays);
  const cutoffISO = cutoff.toISOString();

  const searchTerms = extractSearchTerms(topic);
  if (searchTerms.length === 0) {
    return false;
  }

  // Each term becomes a wildcard prefix pattern, e.g. "react*".
  // NOTE(review): per the GROQ spec an array of patterns appears to require
  // every pattern to match; confirm that this strictness is intended.
  const matchPatterns = searchTerms.map((t) => `${t}*`);

  // Query 1: title-based match on contentIdea and automatedVideo
  const titleQuery = `{
    "ideas": *[_type == "contentIdea" && _createdAt > $cutoff && title match $patterns] { _id, title, topics },
    "videos": *[_type == "automatedVideo" && _createdAt > $cutoff && title match $patterns] { _id, title }
  }`;

  try {
    const titleResults = await writeClient.fetch(titleQuery, {
      cutoff: cutoffISO,
      patterns: matchPatterns,
    });

    const ideaMatches: Array<{ _id: string; title: string; topics?: string[] }> = titleResults.ideas ?? [];
    const videoMatches: Array<{ _id: string; title: string }> = titleResults.videos ?? [];

    // If any title match is found, topic is covered
    if (ideaMatches.length > 0 || videoMatches.length > 0) {
      console.log(
        `[CRON/ingest] Dedup: title match found for "${topic}" — ${ideaMatches.length} ideas, ${videoMatches.length} videos`,
      );
      return true;
    }

    // Query 2: tag overlap on contentIdea documents (covered if 2+ tags overlap).
    // With fewer than two distinct tags a 2+ overlap is impossible, so the
    // query is skipped entirely.
    const topicTags = [...new Set(topics.map((t) => t.toLowerCase()))];
    if (topicTags.length >= 2) {
      // `lower(@)` makes the stored tags case-insensitive too; the input tags
      // are already lowercased above.
      const overlapQuery = `*[_type == "contentIdea" && _createdAt > $cutoff && count((topics[])[lower(@) in $topicTags]) >= 2] { _id, title, topics }`;

      const overlapResults = await writeClient.fetch(overlapQuery, {
        cutoff: cutoffISO,
        topicTags,
      });

      if (overlapResults.length > 0) {
        console.log(
          `[CRON/ingest] Dedup: topic overlap found for "${topic}" — ${overlapResults.length} matching ideas`,
        );
        return true;
      }
    }

    return false;
  } catch (err) {
    // If dedup query fails, don't block the pipeline — log and continue
    console.warn("[CRON/ingest] Dedup query failed, allowing topic:", err);
    return false;
  }
}
228+
/**
 * Report whether any `automatedVideo` document already uses the given slug.
 *
 * A failed lookup is logged and reported as "not taken", so a query failure
 * never blocks the caller.
 *
 * @param slug - Slug compared against `slug.current`.
 * @returns `true` when at least one matching document exists.
 */
async function isSlugTaken(slug: string): Promise<boolean> {
  const query = `*[_type == "automatedVideo" && slug.current == $slug][0..0] { _id }`;
  let taken = false;

  try {
    const hits = await writeClient.fetch(query, { slug });
    taken = hits.length > 0;
  } catch (err) {
    console.warn("[CRON/ingest] Slug check failed, allowing:", err);
  }

  return taken;
}
244+
111245// ---------------------------------------------------------------------------
112246// Gemini Script Generation
113247// ---------------------------------------------------------------------------
@@ -332,7 +466,7 @@ Respond with ONLY the JSON object.`,
332466async function createSanityDocuments (
333467 script : GeneratedScript ,
334468 criticResult : CriticResult ,
335- trends : TrendResult [ ] ,
469+ selectedTrend : TrendResult ,
336470 qualityThreshold : number ,
337471 research ?: ResearchPayload ,
338472 researchMeta ?: { notebookId : string ; taskId : string } ,
@@ -374,8 +508,8 @@ async function createSanityDocuments(
374508 ...( isFlagged && {
375509 flaggedReason : `Quality score ${ criticResult . score } /100. Issues: ${ criticResult . issues . join ( "; " ) || "Low quality score" } ` ,
376510 } ) ,
377- trendScore : trends [ 0 ] ? .score ,
378- trendSources : trends [ 0 ] ? .signals . map ( s => s . source ) . join ( ", " ) ,
511+ trendScore : selectedTrend . score ,
512+ trendSources : selectedTrend . signals . map ( s => s . source ) . join ( ", " ) ,
379513 researchNotebookId : researchMeta ?. notebookId ?? research ?. notebookId ,
380514 ...( researchMeta ?. taskId && { researchTaskId : researchMeta . taskId } ) ,
381515 } ) ;
@@ -442,23 +576,66 @@ export async function GET(request: NextRequest) {
442576 trends = FALLBACK_TRENDS ;
443577 }
444578
445- // Step 2: Optional deep research on top topic (fire-and-forget)
579+ // Step 1.5: Dedup — walk trends list, skip already-covered topics
580+ console . log ( "[CRON/ingest] Dedup: checking trends for already-covered topics..." ) ;
581+ let selectedTrend : TrendResult | undefined ;
582+ let skippedCount = 0 ;
583+
584+ for ( const trend of trends ) {
585+ // Extract keyword-style topics from the trend title for tag overlap check
586+ const topicKeywords = extractSearchTerms ( trend . topic ) ;
587+
588+ const covered = await isTopicAlreadyCovered ( trend . topic , topicKeywords ) ;
589+ if ( covered ) {
590+ console . log ( `[CRON/ingest] Dedup: skipping "${ trend . topic } " (score: ${ trend . score } ) — already covered` ) ;
591+ skippedCount ++ ;
592+ continue ;
593+ }
594+
595+ // Also check for slug collision
596+ if ( trend . slug ) {
597+ const slugTaken = await isSlugTaken ( trend . slug ) ;
598+ if ( slugTaken ) {
599+ console . log ( `[CRON/ingest] Dedup: skipping "${ trend . topic } " — slug "${ trend . slug } " already exists` ) ;
600+ skippedCount ++ ;
601+ continue ;
602+ }
603+ }
604+
605+ selectedTrend = trend ;
606+ break ;
607+ }
608+
609+ if ( ! selectedTrend ) {
610+ console . log ( `[CRON/ingest] Dedup: all ${ trends . length } trending topics already covered. Skipping ingestion.` ) ;
611+ return Response . json ( {
612+ success : true ,
613+ skipped : true ,
614+ message : "All trending topics already covered" ,
615+ trendCount : trends . length ,
616+ skippedCount,
617+ } ) ;
618+ }
619+
620+ console . log ( `[CRON/ingest] Dedup: selected "${ selectedTrend . topic } " (score: ${ selectedTrend . score } , skipped ${ skippedCount } topics)` ) ;
621+
622+ // Step 2: Optional deep research on selected topic (fire-and-forget)
446623 // When research is enabled, we create a notebook and start research
447624 // but DON'T wait for it — the check-research cron will poll and enrich later
448625 let researchMeta : { notebookId : string ; taskId : string } | undefined ;
449626 if ( enableNotebookLmResearch ) {
450- console . log ( `[CRON/ingest] Starting fire-and-forget research on: "${ trends [ 0 ] . topic } "...` ) ;
627+ console . log ( `[CRON/ingest] Starting fire-and-forget research on: "${ selectedTrend . topic } "...` ) ;
451628 try {
452629 const auth = await initAuth ( ) ;
453630 const nbClient = new NotebookLMClient ( auth ) ;
454631
455632 // Create notebook
456- const notebook = await nbClient . createNotebook ( trends [ 0 ] . topic ) ;
633+ const notebook = await nbClient . createNotebook ( selectedTrend . topic ) ;
457634 const notebookId = notebook . id ;
458635 console . log ( `[CRON/ingest] Created notebook: ${ notebookId } ` ) ;
459636
460637 // Add source URLs from trend signals
461- const sourceUrls = ( trends [ 0 ] . signals ?? [ ] )
638+ const sourceUrls = ( selectedTrend . signals ?? [ ] )
462639 . map ( ( s : { url ?: string } ) => s . url )
463640 . filter ( ( u ) : u is string => ! ! u && u . startsWith ( "http" ) )
464641 . slice ( 0 , 5 ) ;
@@ -471,7 +648,7 @@ export async function GET(request: NextRequest) {
471648 console . log ( `[CRON/ingest] Added ${ sourceUrls . length } source URLs to notebook` ) ;
472649
473650 // Start deep research (fire-and-forget — don't poll!)
474- const researchTask = await nbClient . startResearch ( notebookId , trends [ 0 ] . topic , "deep" ) ;
651+ const researchTask = await nbClient . startResearch ( notebookId , selectedTrend . topic , "deep" ) ;
475652 const researchTaskId = researchTask ?. taskId ?? "" ;
476653 console . log ( `[CRON/ingest] Research started — taskId: ${ researchTaskId } . check-research cron will poll.` ) ;
477654
@@ -484,7 +661,7 @@ export async function GET(request: NextRequest) {
484661 // Step 3: Generate script with Gemini (basic — without research data)
485662 // When research is enabled, check-research will re-generate an enriched script later
486663 console . log ( "[CRON/ingest] Generating script with Gemini..." ) ;
487- const prompt = buildPrompt ( trends ) ;
664+ const prompt = buildPrompt ( [ selectedTrend ] ) ;
488665 const rawResponse = await generateWithGemini ( prompt , SYSTEM_INSTRUCTION ) ;
489666
490667 let script : GeneratedScript ;
@@ -515,7 +692,7 @@ export async function GET(request: NextRequest) {
515692 ) ;
516693
517694 console . log ( "[CRON/ingest] Creating Sanity documents..." ) ;
518- const result = await createSanityDocuments ( script , criticResult , trends , qualityThreshold , undefined , researchMeta ) ;
695+ const result = await createSanityDocuments ( script , criticResult , selectedTrend , qualityThreshold , undefined , researchMeta ) ;
519696
520697 console . log ( "[CRON/ingest] Done!" , result ) ;
521698
@@ -525,7 +702,8 @@ export async function GET(request: NextRequest) {
525702 title : script . title ,
526703 criticScore : criticResult . score ,
527704 trendCount : trends . length ,
528- trendScore : trends [ 0 ] ?. score ,
705+ trendScore : selectedTrend . score ,
706+ skippedCount,
529707 researchStarted : ! ! researchMeta ,
530708 researchNotebookId : researchMeta ?. notebookId ,
531709 } ) ;
0 commit comments