213 changes: 213 additions & 0 deletions python-scripts/yt-flag-script.py
@@ -0,0 +1,213 @@
import re

import pandas as pd

ST_FILE = "st_homepage_tutorials.csv"
YT_FILE = "spoken_tutorial.csv"
OUTPUT_FILE = "st_homepage_with_youtube_flag.csv"
TOKEN_MATCH_THRESHOLD = 0.5
STOP_WORDS = {
"spoken",
"tutorial",
"tutorials",
}
STOP_PHRASES = (
"spoken tutorial",
)
# FOSS names shorter than this use word-boundary matching instead of plain
# substring matching, to avoid false positives with short names like "C" or "R".
MIN_FOSS_LENGTH_FOR_SUBSTRING = 3
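# For example, foss "c" (length 1) is matched with the regex r"\bc\b", so a
# playlist like "Advance C - English" matches while "Scilab" does not; longer
# names such as "python" fall through to plain substring matching.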

# Enable more lenient matching for edge cases
ENABLE_FALLBACK_MATCHING = True


def normalize(text):
if pd.isna(text):
return ""
return str(text).lower().strip()


def remove_punctuation(text):
return re.sub(r"[^\w\s]", " ", text)


def normalize_text_field(text):
base = normalize(text)
for phrase in STOP_PHRASES:
base = base.replace(phrase, " ")
no_punct = remove_punctuation(base)
return " ".join(no_punct.split())


def tokenize_for_match(text):
cleaned = normalize_text_field(text)
tokens = [tok for tok in cleaned.split() if tok not in STOP_WORDS]
if tokens:
return tokens
return cleaned.split()


def tokens_match(source_tokens, target_tokens):
if not source_tokens or not target_tokens:
return False
overlap = set(source_tokens) & set(target_tokens)
ratio = len(overlap) / len(set(source_tokens))
return ratio >= TOKEN_MATCH_THRESHOLD
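# Worked example: tokens_match(["advance", "c"], ["advance", "c", "functions"])
# gives overlap {"advance", "c"}, ratio 2/2 = 1.0 >= 0.5, so it returns True.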


def build_language_patterns(language_value):
tokens = [tok for tok in language_value.split() if tok]
return [re.compile(rf"\b{re.escape(token)}\b") for token in tokens]


def extract_trailing_language(text):
    """Extract a trailing language suffix (e.g., 'Advance C - English' -> 'english')."""
    if pd.isna(text):
        return ""

    match = re.search(r'-\s*([a-zA-Z]+)\s*$', str(text))
    if match:
        return match.group(1).lower().strip()
    return ""


def main():
st_df = pd.read_csv(ST_FILE)
yt_df = pd.read_csv(YT_FILE)

print(f"Loaded {len(st_df)} ST homepage tutorials")
print(f"Loaded {len(yt_df)} YouTube videos")

# Normalize YouTube data
yt_df["playlist_norm"] = yt_df["playlist_name"].apply(normalize_text_field)
yt_df["title_tokens"] = yt_df["video_name"].apply(tokenize_for_match)
yt_df["title_lang_text"] = yt_df["video_name"].apply(normalize_text_field)
yt_df["playlist_language"] = yt_df["playlist_name"].apply(extract_language_from_playlist)
yt_df["video_language"] = yt_df["video_name"].apply(extract_language_from_video_title)
if "description" in yt_df.columns:
yt_df["description_lang_text"] = yt_df["description"].apply(normalize_text_field)
else:
yt_df["description_lang_text"] = ""

# Normalize ST homepage data
st_df["foss_norm"] = st_df["foss_name"].apply(normalize_text_field)
st_df["tutorial_tokens"] = st_df["tutorial"].apply(tokenize_for_match)
st_df["language_norm"] = st_df["language"].apply(normalize_text_field)
st_df["language_patterns"] = st_df["language_norm"].apply(build_language_patterns)

print("Processing tutorials...")

def is_available(row):
foss = row["foss_norm"]
tutorial_tokens = row["tutorial_tokens"]
language_patterns = row["language_patterns"]
language_norm = row["language_norm"]

if not foss or not tutorial_tokens or not language_patterns:
return "No"

# Filter YouTube videos by FOSS name
if len(foss) < MIN_FOSS_LENGTH_FOR_SUBSTRING:
# Use word boundary matching for short names
pattern = rf"\b{re.escape(foss)}\b"
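            # e.g. foss "r" yields r"\br\b", so a "R - English" playlist matches
            # while "Ruby" or "LibreOffice Writer" playlists do not.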
candidates = yt_df[
yt_df["playlist_norm"].str.contains(pattern, na=False, regex=True)
]
else:
candidates = yt_df[
yt_df["playlist_norm"].str.contains(foss, na=False, regex=False)
]

if candidates.empty:
return "No"

for _, video_row in candidates.iterrows():
# Step 1: Check if tutorial tokens match video title tokens
if not tokens_match(tutorial_tokens, video_row["title_tokens"]):
continue

            # Step 2: Check language match.
            # Method 1: Look for the language name in the video title or description.
title_text = video_row["title_lang_text"]
description_text = video_row["description_lang_text"]

language_in_content = any(
pattern.search(title_text) or pattern.search(description_text)
for pattern in language_patterns
)

# Method 2: Check extracted language from playlist/video name
playlist_lang = video_row["playlist_language"]
video_lang = video_row["video_language"]

# Match if:
# a) Language found in title/description, OR
# b) Language matches playlist language, OR
# c) Language matches video language
language_matches = (
language_in_content or
(playlist_lang and playlist_lang == language_norm) or
(video_lang and video_lang == language_norm)
)

if language_matches:
return "Yes"

        # Fallback: if no strict language match was found, check whether the
        # same tutorial exists in ANY language and treat that as available.
        if ENABLE_FALLBACK_MATCHING:
            tutorial_set = set(tutorial_tokens)
            for _, video_row in candidates.iterrows():
                video_set = set(video_row["title_tokens"])

                if not tutorial_set or not video_set:
                    continue

                overlap = tutorial_set & video_set
                # Use a higher threshold for the fallback to reduce false positives
                ratio = len(overlap) / len(tutorial_set)

                if ratio >= 0.7:
                    return "Yes"

        return "No"

st_df["available_on_youTube"] = st_df.apply(is_available, axis=1)

st_df.drop(
columns=["foss_norm", "tutorial_tokens", "language_norm", "language_patterns"],
inplace=True,
)

st_df.to_csv(OUTPUT_FILE, index=False)

# Print summary statistics
yes_count = (st_df["available_on_youTube"] == "Yes").sum()
no_count = (st_df["available_on_youTube"] == "No").sum()

print(f"\n{'='*60}")
print(f"Done. Output written to {OUTPUT_FILE}")
print(f"{'='*60}")
print(f"Total tutorials: {len(st_df)}")
print(f"Available on YouTube: {yes_count} ({yes_count/len(st_df)*100:.1f}%)")
print(f"Not available: {no_count} ({no_count/len(st_df)*100:.1f}%)")
print(f"{'='*60}")


if __name__ == "__main__":
main()
113 changes: 113 additions & 0 deletions python-scripts/yt_playlists.py
@@ -0,0 +1,113 @@
# YouTube Data API Key Setup Instructions


# To run this script, you must generate a YouTube Data API key.
#
# Steps to obtain the key:
#
# 1. Go to Google Cloud Console:
# https://console.cloud.google.com/
#
# 2. Create a new project (or select an existing one).
#
# 3. Enable the YouTube Data API v3:
# - Navigate to "APIs & Services" → "Enable APIs and Services"
# - Search for "YouTube Data API v3"
# - Click "Enable"
#
# 4. Create an API key:
# - Go to "APIs & Services" → "Credentials"
# - Click "Create Credentials" → "API key"
#
# 5. Put the key in a .env file in the same directory as this script:
# YOUTUBE_API_KEY = "YOUR_API_KEY_HERE"
#
# ============================================================
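# With the key in place, the script below issues requests such as (key elided):
#   GET https://www.googleapis.com/youtube/v3/playlists?part=snippet
#       &channelId=UCcLQJOfR-MCcI5RtIHFl6Ww&maxResults=50&key=...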



import os
import csv

import requests
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv("YOUTUBE_API_KEY")
if not API_KEY:
raise RuntimeError("YOUTUBE_API_KEY is not set in the environment or .env file")
CHANNEL_ID = "UCcLQJOfR-MCcI5RtIHFl6Ww"
BASE_URL = "https://www.googleapis.com/youtube/v3"

def get_playlists():
url = f"{BASE_URL}/playlists"
params = {
"part": "snippet",
"channelId": CHANNEL_ID,
"maxResults": 50,
"key": API_KEY
}
playlists = []

    # The API returns at most 50 results per page; follow nextPageToken
    # until the last page.
    while True:
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
for item in data.get("items", []):
playlists.append({
"id": item["id"],
"title": item["snippet"]["title"]
})

if "nextPageToken" not in data:
break
params["pageToken"] = data["nextPageToken"]

return playlists


def get_videos(playlist_id):
url = f"{BASE_URL}/playlistItems"
params = {
"part": "snippet",
"playlistId": playlist_id,
"maxResults": 50,
"key": API_KEY
}
videos = []

    while True:
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
for item in data.get("items", []):
videos.append(item["snippet"]["title"])

if "nextPageToken" not in data:
break
params["pageToken"] = data["nextPageToken"]

return videos


def main():
playlists = get_playlists()
rows = []

for p in playlists:
print(f"Fetching playlist: {p['title']}")
videos = get_videos(p["id"])

for v in videos:
rows.append({
"playlist_name": p["title"],
"video_name": v
})

with open("spoken_tutorial.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=["playlist_name", "video_name"])
writer.writeheader()
writer.writerows(rows)

print("CSV created: spoken_tutorial.csv")


if __name__ == "__main__":
main()
35 changes: 28 additions & 7 deletions spoken/helpers.py
@@ -10,6 +10,7 @@
from events.models import Testimonials
from cms.models import Notification, Event
from .config import CACHE_RANDOM_TUTORIALS, CACHE_TR_REC, CACHE_TESTIMONIALS, CACHE_NOTIFICATIONS, CACHE_EVENTS, CACHE_TUTORIALS
import hashlib

def get_key(identifier, key_val):
return f"{identifier}:{key_val.lower().strip().replace(' ','_')}"
@@ -122,23 +123,43 @@ def get_tutorials_list(foss, lang):

# ---- Foss Choice For Search Bar ----
def get_foss_choice(show_on_homepage=1, lang=None):
if lang and len(lang) > 50:
lang = lang[:50]

if lang:
cache_key = get_key("tutorial_search_foss", f"{show_on_homepage}:{lang}")
raw_key = f"{show_on_homepage}:{lang}"
else:
cache_key = f"tutorial_search_foss:{show_on_homepage}:all"
raw_key = f"{show_on_homepage}:all"

hashed_key = hashlib.md5(raw_key.encode("utf-8")).hexdigest()
cache_key = get_key("tutorial_search_foss", hashed_key)
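    # Hashing gives a fixed-length 32-character hex key, so long or unusual
    # language strings cannot produce over-long or invalid cache keys.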

foss_list_choices = cache.get(cache_key)
if foss_list_choices is not None:
return foss_list_choices

foss_list_choices = [('', '-- All Courses --'), ]
foss_qs = TutorialResource.objects.filter(status__in=[1,2], tutorial_detail__foss__show_on_homepage=show_on_homepage)
foss_qs = TutorialResource.objects.filter(
status__in=[1,2],
tutorial_detail__foss__show_on_homepage=show_on_homepage
)
if lang:
foss_qs = foss_qs.filter(language__name=lang)
foss_list = foss_qs.values('tutorial_detail__foss__foss').annotate(
Count('id')).order_by('tutorial_detail__foss__foss').values_list('tutorial_detail__foss__foss', 'id__count').distinct()

foss_list = foss_qs.values(
'tutorial_detail__foss__foss'
).annotate(
Count('id')
).order_by(
'tutorial_detail__foss__foss'
).values_list(
'tutorial_detail__foss__foss', 'id__count'
).distinct()

for foss_row in foss_list:
foss_list_choices.append((str(foss_row[0]), str(foss_row[0]) + ' (' + str(foss_row[1]) + ')'))
foss_list_choices.append(
(str(foss_row[0]), str(foss_row[0]) + ' (' + str(foss_row[1]) + ')')
)

cache.set(cache_key, foss_list_choices, timeout=CACHE_TUTORIALS)
return foss_list_choices