%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#1E1E28', 'primaryTextColor': '#fff', 'primaryBorderColor': '#953EF1', 'lineColor': '#555', 'secondaryColor': '#16161D', 'tertiaryColor': '#111'}}}%%
flowchart TD
%% Styles
%% Dark-theme node classes. A class is attached to a node with the triple-colon suffix.
%% default: base look for nodes that carry no explicit class
classDef default fill:#1E1E28,stroke:#2A2A35,stroke-width:1px,color:#fff;
%% api: dashed outline; used in this diagram for external services and models
%% such as the search backends, the Apify scrapers, GPT-5 Nano, FalconsAI and Gemini
classDef api fill:#1A1A22,stroke:#953EF1,stroke-width:1px,stroke-dasharray: 5 5,color:#d8b4fe;
%% config: amber outline; used for the Setting nodes that feed a step through dotted links
classDef config fill:#16161D,stroke:#F59E0B,stroke-width:2px,color:#FCD34D;
%% reject: red outline; used for every Skip and Reject terminal node
classDef reject fill:#2A1515,stroke:#EF4444,stroke-width:1px,color:#FCA5A5;
%% success: green outline; used for the Pre-Validated and Qualified milestone nodes
classDef success fill:#102A20,stroke:#10B981,stroke-width:1px,color:#6EE7B7;
%% db: muted grey text; used for nodes that write to or represent stored state
%% - Save to DB, Update DB, Candidates Queue
classDef db fill:#111,stroke:#2A2A35,stroke-width:2px,color:#9CA3AF;
%% parallel: blue outline; used for the batch NSFW image scan step
classDef parallel fill:#1a1a2e,stroke:#3B82F6,stroke-width:2px,color:#93C5FD;
%% --- START ---
%% Entry point: the user action that kicks off the pipeline and feeds Phase 1
Start(["User Clicks Start"]) --> SelectHashtag["Select Hashtag by Category Rotation"]
%% --- PHASE 1: SEARCH ---
%% Builds a search query, runs it through the chosen provider, then filters
%% and de-duplicates the raw results before persisting new handles.
subgraph Phase1 ["Phase 1: Discovery Search"]
    %% Query construction and provider selection
    SelectHashtag --> SelectTemplate["Select Query Template"]
    Config_Template["Setting: 5 Templates - Standard, Coaching, Engagement, Expertise, Small Creator"]:::config -.-> SelectTemplate
    SelectTemplate --> CheckProvider{"Check Provider"}
    Config_Provider["Setting: Search Provider - Google vs DDG"]:::config -.-> CheckProvider
    Config_Geo["Setting: Target Countries - Adds location keywords"]:::config -.-> GoogleSearch

    %% DuckDuckGo branch: rotates across several search backends
    CheckProvider -->|DuckDuckGo| DDGBackend["Backend Rotation"]
    DDGBackend --> DDGStrategy["Strategies: Natural + Dork"]
    DDGStrategy --> DDGSearch["python ddgs library"]
    DDGSearch --> DDGAPI("DDG/Brave/Yahoo/Mojeek/Yandex"):::api
    Config_DDG["Anti-Detection: Adaptive delays, blocked backend tracking"]:::config -.-> DDGBackend

    %% Google branch: location-filtered query executed through Apify
    CheckProvider -->|Google| GoogleSearch["Build Location-Filtered Query"]
    GoogleSearch --> ApifyGoogle["Apify: google-search-scraper"]
    ApifyGoogle --> GoogleAPI("Google Search"):::api

    %% Both branches converge here for filtering and de-duplication
    DDGAPI --> FilterResults["Filter Raw Results"]
    GoogleAPI --> FilterResults
    FilterResults --> PlatformFilter{"Platform Handle Check"}
    PlatformFilter -->|Official Account| RejectPlatform("Skip: Platform Account"):::reject
    PlatformFilter -->|Creator| DupeCheck{"In-Memory Dupe Check"}
    DupeCheck -->|Duplicate| SkipDupe("Skip: Already Seen"):::reject
    DupeCheck -->|New| SaveFound["Save to DB: status=found"]:::db
end
%% --- PHASE 2: PRE-VALIDATION ---
%% Snippet-level checks, external link probing and an AI gate that run
%% before a candidate is handed to the Phase 3 scraper.
subgraph Phase2 ["Phase 2: Pre-Validation (Parallel Workers)"]
    %% Snippet mining
    SaveFound --> ExtractSnippetEmail["Extract Emails from Snippet"]
    ExtractSnippetEmail --> ParseFollowers["Parse Follower Count from Snippet"]

    %% Follower gate based on the count parsed from the snippet
    ParseFollowers --> CheckSnippetCount{"Snippet Follower Check"}
    Config_Snippet["Setting: Snippet Filtering - Min Followers"]:::config -.-> CheckSnippetCount
    CheckSnippetCount -->|Too Low| RejectSnippet("Reject: Low Followers"):::reject

    %% External link probe, with a headless-browser path for JS-rendered pages
    CheckSnippetCount -->|Pass/Unknown| LinkProbe["HTTP Probe External Link"]
    Config_Probe["Setting: Link Probing Enabled"]:::config -.-> LinkProbe
    LinkProbe --> ProbeResult{"Probe Result"}
    ProbeResult -->|200 OK| ExtractLinkEmail["Extract Emails from HTML"]
    ProbeResult -->|JS Rendered| PlaywrightFallback["Playwright Headless Browser"]
    PlaywrightFallback --> JSONMine["Mine __NEXT_DATA__ / __NUXT__"]
    JSONMine --> ExtractLinkEmail
    ProbeResult -->|Hard Block Keywords| RejectNSFWLink("Reject: NSFW Domain"):::reject

    %% AI gatekeeper: three rejection flags, otherwise the candidate passes
    ExtractLinkEmail --> AIClassify["AI Gatekeeper Analysis"]
    Config_Prompt["Setting: Business Prompt - Customize System Prompt"]:::config -.-> AIClassify
    AIClassify --> GPT5Nano("GPT-5 Nano"):::api
    GPT5Nano --> AIResult{"AI Decision"}
    AIResult -->|is_business=true| RejectBiz("Reject: Business/Agency"):::reject
    AIResult -->|is_target_geo=false| RejectGeo("Reject: Wrong Geography"):::reject
    AIResult -->|is_nsfw=true| RejectNSFW("Reject: NSFW Content"):::reject
    AIResult -->|Pass| PreValPass(["Mark: Pre-Validated"]):::success
end
%% --- PHASE 3: ENRICHMENT ---
%% Scrapes the Instagram profile of each pre-validated candidate and
%% records the result in the database.
subgraph Phase3 ["Phase 3: Data Enrichment"]
    %% Profile scrape through Apify
    PreValPass --> ApifyScrape["Apify: instagram-profile-scraper"]
    ApifyScrape --> IGScraper("Instagram Scraper"):::api

    %% Field extraction from the scraped profile
    IGScraper --> ExtractProfile["Extract Profile Data"]
    ExtractProfile --> ExtractBizInfo["Extract Business Flag + Category"]
    ExtractBizInfo --> DetectCountry["Detect Country from Bio/Location"]

    %% Bio email extraction is optional and controlled by a setting
    DetectCountry --> EmailToggle{"Email Extraction?"}
    Config_Email["Setting: Skip Email Extraction"]:::config -.-> EmailToggle
    EmailToggle -->|Enabled| ExtractBioEmail["Extract Emails from Bio"]
    EmailToggle -->|Disabled| SkipEmail["Skip"]

    %% Both email paths converge on the profile-found check
    ExtractBioEmail --> CheckMissing{"Profile Found?"}
    SkipEmail --> CheckMissing
    CheckMissing -->|Not Found| RejectMissing("Reject: Profile Not Found"):::reject
    CheckMissing -->|Found| DBUpdate["Update DB: status=scraped"]:::db
end
%% --- PHASE 4: INTELLIGENCE ---
%% Final local checks on scraped profiles: follower minimum, batch NSFW
%% image scan, individual-vs-business scoring and the country filter.
subgraph Phase4 ["Phase 4: Local Intelligence (Batch)"]
    DBUpdate --> LocalFilters{Follower Check}
    %% Settings
    Config_MinFollow["Setting: Min Followers"]:::config -.-> LocalFilters
    LocalFilters -- Below Min Followers --> RejectLocal("Reject: Low Followers"):::reject

    %% Batch Image Analysis (Text NSFW handled by AI in Phase 2)
    LocalFilters -- Pass --> BatchNSFW["Batch NSFW Image Scan"]:::parallel
    Config_NSFW["Setting: NSFW Threshold"]:::config -.-> BatchNSFW
    Config_Workers["Setting: NSFW Workers Count"]:::config -.-> BatchNSFW
    BatchNSFW --> FalconsAI("FalconsAI NSFW Model - Local"):::api
    FalconsAI -- Unsafe Score Above Threshold --> RejectImage("Reject: NSFW Image"):::reject

    %% Individual Score Calculation
    %% The direct reject and pass cut-offs share the -0.3 and 0.3 bounds of the
    %% borderline band, so the three branches below no longer overlap.
    FalconsAI -- Safe --> CalcScore[Calculate Individual Score]
    CalcScore --> ScoreResult{Score Result}
    ScoreResult -- "Score < -0.3" --> RejectBiz2("Reject: Business Account"):::reject
    ScoreResult -- "Score > 0.3" --> PassDirect[Individual - Pass]

    %% Gemini for Borderline Only
    ScoreResult -- "-0.3 to 0.3 Borderline" --> GeminiClassify[Gemini Classification]
    Config_Gemini["Setting: Use Gemini + Prefer Individuals"]:::config -.-> GeminiClassify
    GeminiClassify --> GeminiFlash("Gemini 2.0 Flash"):::api
    GeminiFlash -- Business --> RejectBiz2
    GeminiFlash -- Individual --> PassGemini[Pass]

    %% Both pass paths converge on the final geography check
    PassDirect --> CountryFilter{Country Filter}
    PassGemini --> CountryFilter
    Config_Country["Setting: Target Countries - US/Canada/Any"]:::config -.-> CountryFilter
    CountryFilter -- Wrong Country --> RejectCountry("Reject: Wrong Country"):::reject
    CountryFilter -- Pass --> Qualified(["Mark: Qualified"]):::success
end
%% --- END STATE ---
%% Qualified candidates land in a queue; the node label states they await manual review
ManualReview["Candidates Queue - Awaiting Manual Review"]:::db
Qualified --> ManualReview
Pipeline Legend
Phase 1: Discovery Search
Performs intelligent searches using 5 rotating query templates (Standard, Coaching, Engagement, Expertise, Small Creator) to find diverse creators.
- Templates: Each targets different creator signals (e.g., "book a call" for coaches, "DM for" for engagement).
- DDG Mode: Rotates through 5 backends (DDG, Brave, Yahoo, Mojeek, Yandex) with adaptive delays.
- Google Mode: Uses Apify google-search-scraper (~$0.25/1k results).
- Filtering: Automatically excludes platform accounts (@whop, @stanstore, etc.).
Phase 2: Pre-Validation
Runs cheap checks before paying for full scraping. Uses parallel workers for speed.
- Email Mining: Extracts emails from search snippets and probed links (free!).
- Playwright Fallback: For JS-rendered pages (Stan Store, Linktree), uses headless browser + JSON mining.
- GPT-5 Nano: AI gatekeeper that analyzes content for business indicators (rejects only definitive businesses), geography (defaults to US/Canada unless clearly foreign), and NSFW content (text-based detection).
Phase 3: Enrichment
Only candidates who passed the Gatekeeper get enriched via Apify Instagram scraper.
- Business Detection: Extracts Instagram's business account flag and category.
- Country Detection: Infers location from bio text, location field, and full name.
- Email Option: Can skip email extraction here (curation pipeline handles it better).
Phase 4: Intelligence
Final quality assurance with batch processing for efficiency.
- Batch NSFW: Parallel download and classification of profile pictures using local FalconsAI model. Text-based NSFW is handled by the AI check in Phase 2.
- Individual Score: Calculates a score from -1 to 1 using bio keywords and Instagram flags.
- Gemini 2.0 Flash: Called only for borderline cases (score between -0.3 and 0.3). Defaults to "individual" when uncertain.
- Country Filter: Final check against target geography setting.
End State: Manual Review
All qualified candidates are placed in the Candidates Queue for manual review. There is no auto-approve: you review and approve each candidate before it is imported into your Creators list.