Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8bc38cc
test push
Carokay87 May 7, 2022
40e150e
[FEAT] - add button on delta.py show category
Carokay87 May 7, 2022
d604ad2
[FEAT] - README to read data
Carokay87 May 7, 2022
d47ef1e
[FIX] - change file name
Carokay87 May 7, 2022
628785d
[FEAT] - add image in case of pb with dataset
Carokay87 May 7, 2022
cb976d9
add class
mattmaxXXX May 7, 2022
06c2643
Merge branch 'main' of https://github.com/Carokay87/delta
mattmaxXXX May 7, 2022
84b3570
[FIX] - delta should be able to acess with button
Carokay87 May 7, 2022
3686403
[FEAT] - music trend evaluation
Carokay87 May 7, 2022
ac2283b
[FEAT] - add image in case data error
Carokay87 May 7, 2022
17991af
[FIX] - Add second graph on display
Carokay87 May 8, 2022
595c3ec
style
mattmaxXXX May 8, 2022
33755e3
[FIX] - set database in archive catergory
Carokay87 May 8, 2022
3e2f706
add with name
Carokay87 May 9, 2022
9691a26
[FIX] - Change repository name
Carokay87 Jun 3, 2022
b457ac2
[FEAT] - loading data in get_data.py instead of ytb_data.py
Carokay87 Jun 3, 2022
f17e4f0
no lib bootstrap
mattmaxXXX Jun 5, 2022
51199c2
architecture
mattmaxXXX Jun 5, 2022
a13ec45
Create README.md
mattmaxXXX Jun 5, 2022
bde7a10
Update README.md
mattmaxXXX Jun 5, 2022
bb5057d
Update get_data.py
mattmaxXXX Jun 5, 2022
c8710c6
Update youtube.py
mattmaxXXX Jun 5, 2022
6252cac
Update README.md
mattmaxXXX Jun 5, 2022
5497963
Merge branch 'main' into main
Carokay87 Jun 23, 2022
3637aa8
delta
mattmaxXXX Jul 1, 2022
ef121b5
fix
mattmaxXXX Jul 1, 2022
8a692ed
mkdir data
mattmaxXXX Jul 1, 2022
812deee
readme
mattmaxXXX Jul 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CDMS_trending_youtube/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
## The required dataset could not be pushed to the repository because it is too large

- Here is the link to the Kaggle dataset:
https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset/download
- Run the `get_data.py` file to download and extract the data


87 changes: 87 additions & 0 deletions CDMS_trending_youtube/get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import pandas as pd
from zipfile import ZipFile
import os

# SECURITY: Kaggle credentials must never be hard-coded in a tracked file.
# Provide them through the KAGGLE_USERNAME / KAGGLE_KEY environment variables
# or through ~/.kaggle/kaggle.json before running this script. The key that was
# previously committed here is exposed in the git history and must be revoked
# from the Kaggle account settings.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
api.dataset_download_files('rsrishav/youtube-trending-video-dataset', path="./tmp")
with ZipFile('./tmp/youtube-trending-video-dataset.zip', 'r') as zipObj:
    # Extract every member of the archive next to the zip file, i.e. into ./tmp
    zipObj.extractall('./tmp')

# Country labels (French display names) attached to the per-country CSV files.
# The order matters: load_data() pairs them by position with the file list
# built in create_dataframe().
list_country = [
    "Brésil",
    "Canada",
    "Allemagne",
    "France",
    "Royaume-Uni",
    "Inde",
    "Japon",
    "Corée",
    "Méxique",
    "Russie",
    "US",
]

# Same labels, in the same order, without "Corée".
list_country_kor = [label for label in list_country if label != "Corée"]

def load_data(list_file, list_country):
    """Read one CSV per country and stack them into a single DataFrame.

    Args:
        list_file: paths of the comma-separated files to read.
        list_country: label written in the ``country`` column for each file,
            in the same order as ``list_file``.

    Returns:
        A DataFrame with the rows of every file, each tagged with its
        ``country``. Row indexes are kept as read from each file (not reset).
        An empty DataFrame is returned when ``list_file`` is empty.

    Raises:
        IndexError: if ``list_country`` has fewer entries than ``list_file``.
    """
    if len(list_country) < len(list_file):
        raise IndexError("list_country has fewer entries than list_file")

    frames = []
    for path, country in zip(list_file, list_country):
        frame = pd.read_csv(path, sep=',')
        frame['country'] = country
        frames.append(frame)

    # pd.concat([]) raises, so keep the "empty in, empty out" behaviour explicit.
    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated rows once per file.
    return pd.concat(frames)

'''def create_dataframe():
list_file = ["tmp/archive/BR_youtube_trending_data.csv",
"tmp/archive/CA_youtube_trending_data.csv",
"tmp/archive/DE_youtube_trending_data.csv",
"tmp/archive/FR_youtube_trending_data.csv",
"tmp/archive/GB_youtube_trending_data.csv",
"tmp/archive/IN_youtube_trending_data.csv",
"tmp/archive/JP_youtube_trending_data.csv",
"tmp/archive/KR_youtube_trending_data.csv",
"tmp/archive/MX_youtube_trending_data.csv",
"tmp/archive/RU_youtube_trending_data.csv",
"tmp/archive/US_youtube_trending_data.csv"]
'''
def create_dataframe():
    """Build the cleaned trending-videos DataFrame from the extracted CSVs.

    Reads the eleven per-country files under ``tmp/``, tags every row with its
    country label, drops the columns that are not used afterwards, parses
    ``publishedAt`` as datetimes and replaces the numeric ``categoryId`` values
    listed below with their category names (other ids are left untouched).

    Returns:
        The resulting pandas DataFrame.
    """
    # Must stay in the same order as the module-level ``list_country``:
    # load_data() pairs files and labels by position.
    country_codes = ["BR", "CA", "DE", "FR", "GB", "IN", "JP", "KR", "MX", "RU", "US"]
    list_file = ["tmp/{}_youtube_trending_data.csv".format(code) for code in country_codes]

    data = load_data(list_file, list_country)
    data = data.drop(columns=['channelId', 'description', 'thumbnail_link', 'video_id'])

    # No explicit ``format``: a date-only format ('%Y-%m-%d') makes pandas >= 2
    # reject values that carry a time part.
    # NOTE(review): the Kaggle CSVs appear to store full ISO-8601 timestamps
    # in this column - confirm against the downloaded files.
    data['publishedAt'] = pd.to_datetime(data['publishedAt'])

    replace_categories = {
        2: 'Autos & Vehicles',
        1: 'Film & Animation',
        10: 'Music',
        15: 'Pets & Animals',
        17: 'Sports',
        18: 'Short Movies',
        19: 'Travel & Events',
        20: 'Gaming',
        21: 'Videoblogging',
        22: 'People & Blogs',
        23: 'Comedy',
        24: 'Entertainment',
        25: 'News & Politics',
        26: 'Howto & Style',
        27: 'Education',
        28: 'Science & Technology',
        29: 'Nonprofits & Activism',
    }

    return data.replace({"categoryId": replace_categories})


youtube_df = create_dataframe()

# exist_ok=True: re-running the script must not fail because ./data is
# already there (os.mkdir raised FileExistsError in that case).
os.makedirs("./data", exist_ok=True)

# Pickle read back by youtube.py.
youtube_df.to_pickle("./data/df_youtube.pkl")
265 changes: 265 additions & 0 deletions CDMS_trending_youtube/youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
# import required packages
import dash
from dash import dcc
from dash import html
import plotly.graph_objs as go
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px

def ratio(list_country_kor, datas_year):
    """Compute, per country, the share of Music titles flagged as Korean.

    Args:
        list_country_kor: country labels to evaluate, matched against the
            ``country`` column.
        datas_year: DataFrame with at least the ``country``, ``categoryId``
            and ``title`` columns.

    Returns:
        A list of percentages, one per entry of ``list_country_kor`` and in
        the same order: ``100 * len(is_korean(titles)) / number of Music rows``.
        A country without any Music row yields ``0.0``.
    """
    percentages = []
    for country in list_country_kor:
        country_data = datas_year[datas_year.country == country]
        country_music = country_data[country_data.categoryId == 'Music']
        total = len(country_music)
        if total == 0:
            # Nothing to compare against: report 0 % instead of dividing by zero.
            percentages.append(0.0)
            continue
        country_korean = is_korean(country_music.title)
        percentages.append(len(country_korean) * 100 / total)
    return percentages

# Country labels, as stored in the ``country`` column, for which
# create_figure2() computes the Korean-music ratio.
list_country_kor = ["Brésil", "Canada", "Allemagne", "France",
"Royaume-Uni", "Inde", "Japon", "Méxique", "Russie", "US"]

# Legacy code-point bounds. They are kept only because they are module-level
# names; is_korean() no longer reads them. The d2..f5 pairs left gaps inside
# the Hangul Syllables block and d6/f6 were never used.
d1 = "\u3131"
f1 = "\u3163"

d2 = "\uAC00"
f2 = "\uAF00"

d3 = "\uB000"
f3 = "\uBFE1"

d4 = "\uC058"
f4 = "\uCFFC"

d5 = "\uD018"
f5 = "\uD79D"

d6 = "\u3181"
f6 = "\uCB4C"

# Inclusive code-point ranges treated as Korean script.
_KOREAN_RANGES = (
    (0x3131, 0x3163),  # Hangul Compatibility Jamo: consonant and vowel letters
    (0xAC00, 0xD7A3),  # Hangul Syllables: every precomposed syllable
)


def _has_korean(text):
    """Return True when *text* contains at least one character in _KOREAN_RANGES."""
    return any(low <= ord(char) <= high
               for char in text
               for low, high in _KOREAN_RANGES)


def is_korean(word):
    """Return the titles that contain at least one Korean character.

    Args:
        word: iterable of titles (a list or a pandas Series of strings).

    Returns:
        A list with the matching titles, in input order. Each title appears at
        most once per occurrence in the input, however many Korean characters
        it holds, so ``len()`` of the result counts titles rather than
        characters. Entries that are not strings are skipped.
    """
    return [title for title in word
            if isinstance(title, str) and _has_korean(title)]


class YoutubeTrendsStats():
    """Dash page with two figures built from the pickled YouTube trending data.

    * ``figure``: animated donut charts, one per country, showing the
      distribution of ``categoryId`` values over successive date windows.
    * ``other_figure``: scatter plot of the values returned by ``ratio()``
      for each country of ``list_country_kor`` and each year 2020-2022.

    The assembled page is exposed as ``main_layout`` so that a host
    application can embed it.
    """

    def __init__(self, application = None):
        """Load the data, build both figures and assemble the page layout.

        Args:
            application: existing Dash application to attach to. When omitted,
                a standalone ``dash.Dash()`` application is created and the
                page becomes its layout.
        """
        # The pickle location depends on the working directory the app is
        # started from (package folder or repository root).
        try:
            self.df = pd.read_pickle('./data/df_youtube.pkl')
        except FileNotFoundError:
            self.df = pd.read_pickle('./CDMS_trending_youtube/data/df_youtube.pkl')
        self.figure = self.create_figure(self.df)
        self.other_figure = self.create_figure2(self.df)

        # page layout
        div_content = html.Div(children=[
            html.Div(dcc.Graph(id = 'main-graph',
                               figure = self.figure)),
            html.Div(dcc.Graph(id = 'second-graph',
                               figure = self.other_figure)),
            html.Br(),
            dcc.Markdown("""
            Le graphique est interactif. En passant la souris sur les courbes vous avez une infobulle.

            Notes :
            * La catégorie Entertainment est la plus en tendance dans la plupart des pays.
            * Entre le 13/04/2021 et le 01/06/2021, la catégorie Gaming représentait 13,3% des tendances en France
            * On peut voir que la musique Coréenne représente la majorité des musiques de plusieurs pays.


            #### À propos

            * Sources :
            * https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset?select=FR_youtube_trending_data.csv
            """)
        ])
        self.main_layout = div_content

        if application:
            # A host application owns its own layout and embeds
            # self.main_layout as a page or component: do not overwrite it.
            self.app = application
        else:
            self.app = dash.Dash()
            self.app.layout = div_content

    def create_figure2(self, datas):
        """Build the scatter plot of Korean-music ratios per country and year.

        Args:
            datas: DataFrame with ``trending_date``, ``country``,
                ``categoryId`` and ``title`` columns. It is not modified.

        Returns:
            A plotly figure with one series per year (2020, 2021, 2022) over
            the countries of ``list_country_kor``; marker sizes follow the
            2020 values.
        """
        # Derive the year on the side instead of writing new columns into the
        # caller's DataFrame.
        years = pd.to_datetime(datas["trending_date"]).dt.year

        df = pd.DataFrame(index = list_country_kor)
        for year in (2020, 2021, 2022):
            df[str(year)] = ratio(list_country_kor, datas[years == year])

        y0 = np.array(df['2020'])
        fig = px.scatter(df, size=y0*5, hover_name=df.index,
                         title='Pourcentage de musique coréenne en tendance sur le total de musique en tendance pour chaque pays',
                         opacity = 0.6, labels={
                             "value": "Ratio in %",
                             "index": "Country",
                             "variable": "Year"
                         })
        return fig

    @staticmethod
    def _date_windows(dates):
        """Pair each boundary with the next one: [a, b, c] -> [(a, b), (b, c)]."""
        return list(zip(dates, dates[1:]))

    def _country_pies(self, window, countries, domains):
        """Build one donut trace per country from the rows of *window*."""
        return [
            go.Pie(labels=window[window["country"] == country]["categoryId"],
                   domain = domains[position], title=country,
                   textinfo='none', hole=.6)
            for position, country in enumerate(countries)
        ]

    # define figure creation function
    def create_figure(self,result):
        """Build the animated figure of category proportions per country.

        The period 2020-08-11 .. 2022-03-22 is cut into 12 windows. Every
        window gets one animation frame and one slider step, named after the
        window start date; the figure initially shows the first window.

        Args:
            result: DataFrame with ``trending_date``, ``country`` and
                ``categoryId`` columns.

        Returns:
            A ``plotly.graph_objs.Figure`` with one donut per country laid
            out on a 4 x 3 grid.
        """
        dates = self.divide_dates("2020-08-11", "2022-03-22", 12)
        countries = result['country'].unique()

        # 4 columns x 3 rows of plotting areas, filled column by column.
        x_spans = [(0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]
        y_spans = [(0.0, 0.33), (0.33, 0.66), (0.66, 1.0)]
        domains = [{'x': list(x_span), 'y': list(y_span)}
                   for x_span in x_spans
                   for y_span in y_spans]

        # make figure
        fig_dict = {
            "data": [],
            "layout": {},
            "frames": []
        }

        # fill in most of layout
        fig_dict["layout"]["title"] = "Évolution des proportions des catégories youtube en tendances dans le monde"
        fig_dict["layout"]["height"] = 700
        fig_dict["layout"]["hovermode"] = "closest"
        fig_dict["layout"]["updatemenus"] = [
            {
                "buttons": [
                    {
                        "args": [None, {"frame": {"duration": 1000, "redraw": True},
                                        "fromcurrent": True, "transition": {"duration": 300,
                                                                            "easing": "quadratic-in-out"}}],
                        "label": "Play",
                        "method": "animate"
                    },
                    {
                        "args": [[None], {"frame": {"duration": 0, "redraw": True},
                                          "mode": "immediate",
                                          "transition": {"duration": 0}}],
                        "label": "Pause",
                        "method": "animate"
                    }
                ],
                "direction": "left",
                "pad": {"r": 10, "t": 87},
                "showactive": False,
                "type": "buttons",
                "x": 0.1,
                "xanchor": "right",
                "y": 0,
                "yanchor": "top"
            }
        ]

        sliders_dict = {
            "active": 0,
            "yanchor": "top",
            "xanchor": "left",
            "currentvalue": {
                "font": {"size": 20},
                "visible": True,
                "xanchor": "right"
            },
            "transition": {"duration": 300, "easing": "cubic-in-out"},
            "pad": {"b": 10, "t": 50},
            "len": 0.9,
            "x": 0.1,
            "y": 0,
            "steps": []
        }

        # make data, frames and slider steps
        for start, end in self._date_windows(dates):
            # Filter once per window; the result does not depend on the country.
            window = self.filter_dates(result, start, end)

            if not fig_dict["data"]:
                # Initial traces: the first window, which is also the first
                # frame so the slider's active step matches what is displayed.
                fig_dict["data"] = self._country_pies(window, countries, domains)

            fig_dict["frames"].append({
                "data": self._country_pies(window, countries, domains),
                "name": str(start)
            })
            sliders_dict["steps"].append({
                "args": [
                    [start],
                    {"frame": {"duration": 300, "redraw": True},
                     "mode": "immediate",
                     "transition": {"duration": 300}}
                ],
                "label": start,
                "method": "animate"
            })

        fig_dict["layout"]["sliders"] = [sliders_dict]

        fig = go.Figure(fig_dict)
        return fig

    def divide_dates(self, start, end, N):
        """Split the period [start, end] into N equal steps.

        Args:
            start: first date, formatted ``YYYY-MM-DD``.
            end: last date, formatted ``YYYY-MM-DD``.
            N: number of steps.

        Returns:
            A list of ``N + 1`` ``YYYY-MM-DD`` strings beginning at ``start``.
            The step is the floor division of the period by N, so the last
            entry equals ``end`` only when the period divides evenly.
        """
        first = datetime.strptime(start, '%Y-%m-%d')
        last = datetime.strptime(end, '%Y-%m-%d')
        step = (last - first) // N
        return [(first + idx * step).strftime("%Y-%m-%d") for idx in range(N + 1)]

    def filter_dates(self, result, start_date, end_date):
        """Return the rows whose ``trending_date`` lies in [start_date, end_date].

        Both bounds are inclusive and compared directly against the column.
        NOTE(review): create_figure() passes ``YYYY-MM-DD`` strings; this
        presumably relies on the column holding ISO-formatted strings so that
        the comparison is chronological - confirm against the pickle.
        """
        after_start_date = result["trending_date"] >= start_date
        before_end_date = result["trending_date"] <= end_date
        return result.loc[after_start_date & before_end_date]

    def run(self, debug=False, port=8050):
        """Serve the application on every network interface.

        NOTE(review): newer Dash releases expose ``app.run()`` - confirm that
        ``run_server`` is still available in the deployed Dash version.
        """
        self.app.run_server(host="0.0.0.0", debug=debug, port=port)

if __name__ == "__main__":
    # Standalone launch: build the page and serve it on port 8055.
    YoutubeTrendsStats().run(port=8055)
Loading