213 changes: 213 additions & 0 deletions python-scripts/yt-flag-script.py
@@ -0,0 +1,213 @@
import re

import pandas as pd

ST_FILE = "st_homepage_tutorials.csv"
YT_FILE = "spoken_tutorial.csv"
OUTPUT_FILE = "st_homepage_with_youtube_flag.csv"
TOKEN_MATCH_THRESHOLD = 0.5
STOP_WORDS = {
"spoken",
"tutorial",
"tutorials",
}
STOP_PHRASES = (
"spoken tutorial",
)
# FOSS names shorter than this use word-boundary matching instead of plain
# substring matching, to avoid false positives with short names like "C" or "R".
MIN_FOSS_LENGTH_FOR_SUBSTRING = 3
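# For example, foss "c" (length 1) is matched with the regex r"\bc\b", so a
# playlist like "Advance C - English" matches while "Scilab" does not; longer
# names such as "python" fall through to plain substring matching.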

# Enable more lenient matching for edge cases
ENABLE_FALLBACK_MATCHING = True


def normalize(text):
if pd.isna(text):
return ""
return str(text).lower().strip()


def remove_punctuation(text):
return re.sub(r"[^\w\s]", " ", text)


def normalize_text_field(text):
base = normalize(text)
for phrase in STOP_PHRASES:
base = base.replace(phrase, " ")
no_punct = remove_punctuation(base)
return " ".join(no_punct.split())


def tokenize_for_match(text):
cleaned = normalize_text_field(text)
tokens = [tok for tok in cleaned.split() if tok not in STOP_WORDS]
if tokens:
return tokens
return cleaned.split()


def tokens_match(source_tokens, target_tokens):
if not source_tokens or not target_tokens:
return False
overlap = set(source_tokens) & set(target_tokens)
ratio = len(overlap) / len(set(source_tokens))
return ratio >= TOKEN_MATCH_THRESHOLD
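# Worked example: tokens_match(["advance", "c"], ["advance", "c", "functions"])
# gives overlap {"advance", "c"}, ratio 2/2 = 1.0 >= 0.5, so it returns True.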


def build_language_patterns(language_value):
tokens = [tok for tok in language_value.split() if tok]
return [re.compile(rf"\b{re.escape(token)}\b") for token in tokens]


def extract_trailing_language(text):
    """Extract a trailing language suffix (e.g., 'Advance C - English' -> 'english')."""
    if pd.isna(text):
        return ""

    match = re.search(r'-\s*([a-zA-Z]+)\s*$', str(text))
    if match:
        return match.group(1).lower().strip()
    return ""


def main():
st_df = pd.read_csv(ST_FILE)
yt_df = pd.read_csv(YT_FILE)

print(f"Loaded {len(st_df)} ST homepage tutorials")
print(f"Loaded {len(yt_df)} YouTube videos")

# Normalize YouTube data
yt_df["playlist_norm"] = yt_df["playlist_name"].apply(normalize_text_field)
yt_df["title_tokens"] = yt_df["video_name"].apply(tokenize_for_match)
yt_df["title_lang_text"] = yt_df["video_name"].apply(normalize_text_field)
yt_df["playlist_language"] = yt_df["playlist_name"].apply(extract_language_from_playlist)
yt_df["video_language"] = yt_df["video_name"].apply(extract_language_from_video_title)
if "description" in yt_df.columns:
yt_df["description_lang_text"] = yt_df["description"].apply(normalize_text_field)
else:
yt_df["description_lang_text"] = ""

# Normalize ST homepage data
st_df["foss_norm"] = st_df["foss_name"].apply(normalize_text_field)
st_df["tutorial_tokens"] = st_df["tutorial"].apply(tokenize_for_match)
st_df["language_norm"] = st_df["language"].apply(normalize_text_field)
st_df["language_patterns"] = st_df["language_norm"].apply(build_language_patterns)

print("Processing tutorials...")

def is_available(row):
foss = row["foss_norm"]
tutorial_tokens = row["tutorial_tokens"]
language_patterns = row["language_patterns"]
language_norm = row["language_norm"]

if not foss or not tutorial_tokens or not language_patterns:
return "No"

# Filter YouTube videos by FOSS name
if len(foss) < MIN_FOSS_LENGTH_FOR_SUBSTRING:
# Use word boundary matching for short names
pattern = rf"\b{re.escape(foss)}\b"
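            # e.g. foss "r" yields r"\br\b", so a "R - English" playlist matches
            # while "Ruby" or "LibreOffice Writer" playlists do not.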
candidates = yt_df[
yt_df["playlist_norm"].str.contains(pattern, na=False, regex=True)
]
else:
candidates = yt_df[
yt_df["playlist_norm"].str.contains(foss, na=False, regex=False)
]

if candidates.empty:
return "No"

for _, video_row in candidates.iterrows():
# Step 1: Check if tutorial tokens match video title tokens
if not tokens_match(tutorial_tokens, video_row["title_tokens"]):
continue

            # Step 2: Check language match.
            # Method 1: Look for the language name in the video title or description.
title_text = video_row["title_lang_text"]
description_text = video_row["description_lang_text"]

language_in_content = any(
pattern.search(title_text) or pattern.search(description_text)
for pattern in language_patterns
)

# Method 2: Check extracted language from playlist/video name
playlist_lang = video_row["playlist_language"]
video_lang = video_row["video_language"]

# Match if:
# a) Language found in title/description, OR
# b) Language matches playlist language, OR
# c) Language matches video language
language_matches = (
language_in_content or
(playlist_lang and playlist_lang == language_norm) or
(video_lang and video_lang == language_norm)
)

if language_matches:
return "Yes"

        # Fallback: if no strict language match was found, check whether the
        # same tutorial exists in ANY language and treat that as available.
        if ENABLE_FALLBACK_MATCHING:
            tutorial_set = set(tutorial_tokens)
            for _, video_row in candidates.iterrows():
                video_set = set(video_row["title_tokens"])

                if not tutorial_set or not video_set:
                    continue

                overlap = tutorial_set & video_set
                # Use a higher threshold for the fallback to reduce false positives
                ratio = len(overlap) / len(tutorial_set)

                if ratio >= 0.7:
                    return "Yes"

        return "No"

st_df["available_on_youTube"] = st_df.apply(is_available, axis=1)

st_df.drop(
columns=["foss_norm", "tutorial_tokens", "language_norm", "language_patterns"],
inplace=True,
)

st_df.to_csv(OUTPUT_FILE, index=False)

# Print summary statistics
yes_count = (st_df["available_on_youTube"] == "Yes").sum()
no_count = (st_df["available_on_youTube"] == "No").sum()

print(f"\n{'='*60}")
print(f"Done. Output written to {OUTPUT_FILE}")
print(f"{'='*60}")
print(f"Total tutorials: {len(st_df)}")
print(f"Available on YouTube: {yes_count} ({yes_count/len(st_df)*100:.1f}%)")
print(f"Not available: {no_count} ({no_count/len(st_df)*100:.1f}%)")
print(f"{'='*60}")


if __name__ == "__main__":
main()
113 changes: 113 additions & 0 deletions python-scripts/yt_playlists.py
@@ -0,0 +1,113 @@
# YouTube Data API Key Setup Instructions


# To run this script, you must generate a YouTube Data API key.
#
# Steps to obtain the key:
#
# 1. Go to Google Cloud Console:
# https://console.cloud.google.com/
#
# 2. Create a new project (or select an existing one).
#
# 3. Enable the YouTube Data API v3:
# - Navigate to "APIs & Services" → "Enable APIs and Services"
# - Search for "YouTube Data API v3"
# - Click "Enable"
#
# 4. Create an API key:
# - Go to "APIs & Services" → "Credentials"
# - Click "Create Credentials" → "API key"
#
# 5. Put the key in a .env file in the same directory as this script:
# YOUTUBE_API_KEY = "YOUR_API_KEY_HERE"
#
# ============================================================
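# With the key in place, the script below issues requests such as (key elided):
#   GET https://www.googleapis.com/youtube/v3/playlists?part=snippet
#       &channelId=UCcLQJOfR-MCcI5RtIHFl6Ww&maxResults=50&key=...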



import os
import csv

import requests
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv("YOUTUBE_API_KEY")
if not API_KEY:
raise RuntimeError("YOUTUBE_API_KEY is not set in the environment or .env file")
CHANNEL_ID = "UCcLQJOfR-MCcI5RtIHFl6Ww"
BASE_URL = "https://www.googleapis.com/youtube/v3"

def get_playlists():
url = f"{BASE_URL}/playlists"
params = {
"part": "snippet",
"channelId": CHANNEL_ID,
"maxResults": 50,
"key": API_KEY
}
playlists = []

    # The API returns at most 50 results per page; follow nextPageToken
    # until the last page.
    while True:
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
for item in data.get("items", []):
playlists.append({
"id": item["id"],
"title": item["snippet"]["title"]
})

if "nextPageToken" not in data:
break
params["pageToken"] = data["nextPageToken"]

return playlists


def get_videos(playlist_id):
url = f"{BASE_URL}/playlistItems"
params = {
"part": "snippet",
"playlistId": playlist_id,
"maxResults": 50,
"key": API_KEY
}
videos = []

    while True:
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
for item in data.get("items", []):
videos.append(item["snippet"]["title"])

if "nextPageToken" not in data:
break
params["pageToken"] = data["nextPageToken"]

return videos


def main():
playlists = get_playlists()
rows = []

for p in playlists:
print(f"Fetching playlist: {p['title']}")
videos = get_videos(p["id"])

for v in videos:
rows.append({
"playlist_name": p["title"],
"video_name": v
})

with open("spoken_tutorial.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=["playlist_name", "video_name"])
writer.writeheader()
writer.writerows(rows)

print("CSV created: spoken_tutorial.csv")


if __name__ == "__main__":
main()
35 changes: 28 additions & 7 deletions spoken/helpers.py
@@ -10,6 +10,7 @@
from events.models import Testimonials
from cms.models import Notification, Event
from .config import CACHE_RANDOM_TUTORIALS, CACHE_TR_REC, CACHE_TESTIMONIALS, CACHE_NOTIFICATIONS, CACHE_EVENTS, CACHE_TUTORIALS
import hashlib

def get_key(identifier, key_val):
return f"{identifier}:{key_val.lower().strip().replace(' ','_')}"
@@ -122,23 +123,43 @@ def get_tutorials_list(foss, lang):

# ---- Foss Choice For Search Bar ----
def get_foss_choice(show_on_homepage=1, lang=None):
if lang and len(lang) > 50:
lang = lang[:50]

if lang:
cache_key = get_key("tutorial_search_foss", f"{show_on_homepage}:{lang}")
raw_key = f"{show_on_homepage}:{lang}"
else:
cache_key = f"tutorial_search_foss:{show_on_homepage}:all"
raw_key = f"{show_on_homepage}:all"

hashed_key = hashlib.md5(raw_key.encode("utf-8")).hexdigest()
cache_key = get_key("tutorial_search_foss", hashed_key)
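    # Hashing gives a fixed-length 32-character hex key, so long or unusual
    # language strings cannot produce over-long or invalid cache keys.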

foss_list_choices = cache.get(cache_key)
if foss_list_choices is not None:
return foss_list_choices

foss_list_choices = [('', '-- All Courses --'), ]
foss_qs = TutorialResource.objects.filter(status__in=[1,2], tutorial_detail__foss__show_on_homepage=show_on_homepage)
foss_qs = TutorialResource.objects.filter(
status__in=[1,2],
tutorial_detail__foss__show_on_homepage=show_on_homepage
)
if lang:
foss_qs = foss_qs.filter(language__name=lang)
foss_list = foss_qs.values('tutorial_detail__foss__foss').annotate(
Count('id')).order_by('tutorial_detail__foss__foss').values_list('tutorial_detail__foss__foss', 'id__count').distinct()

foss_list = foss_qs.values(
'tutorial_detail__foss__foss'
).annotate(
Count('id')
).order_by(
'tutorial_detail__foss__foss'
).values_list(
'tutorial_detail__foss__foss', 'id__count'
).distinct()

for foss_row in foss_list:
foss_list_choices.append((str(foss_row[0]), str(foss_row[0]) + ' (' + str(foss_row[1]) + ')'))
foss_list_choices.append(
(str(foss_row[0]), str(foss_row[0]) + ' (' + str(foss_row[1]) + ')')
)

cache.set(cache_key, foss_list_choices, timeout=CACHE_TUTORIALS)
return foss_list_choices