# Download the Kaggle "YouTube trending video" dataset and unpack it in ./tmp.
#
# Credentials are deliberately NOT hard-coded: an API key committed to the
# repository is readable by anyone who can read the repository.
# KaggleApi.authenticate() takes them from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables or from ~/.kaggle/kaggle.json, so set one of those
# before running this script.
# NOTE(review): the key that used to be committed here should be revoked in
# the Kaggle account settings.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
api.dataset_download_files('rsrishav/youtube-trending-video-dataset', path="./tmp")

# Extract every CSV of the archive next to the downloaded zip.
with ZipFile('./tmp/youtube-trending-video-dataset.zip', 'r') as zipObj:
    zipObj.extractall('./tmp')
# Build the combined dataframe and cache it as a pickle for the dashboard.
youtube_df = create_dataframe()

# exist_ok=True: running the script a second time must not fail with
# FileExistsError once the dataset has been downloaded and parsed above.
os.makedirs("./data", exist_ok=True)

youtube_df.to_pickle("./data/df_youtube.pkl")
def is_korean(word):
    """Return the titles of *word* that contain at least one Hangul character.

    Args:
        word: iterable of titles (for example a pandas Series of strings).
            Entries that are not strings, such as NaN for a missing title,
            are skipped.

    Returns:
        list: the matching titles in their original order, one entry per
        matching title.
    """
    # Hangul Compatibility Jamo (U+3131..U+3163) and the whole Hangul
    # Syllables block (U+AC00..U+D7A3). The bounds are local so the function
    # no longer depends on the module-level d1..f5 constants, which left gaps
    # inside the syllable block (e.g. U+AF01..U+AFFF was not recognised).
    hangul_ranges = ((0x3131, 0x3163), (0xAC00, 0xD7A3))

    def has_hangul(title):
        return any(low <= ord(char) <= high
                   for char in title
                   for low, high in hangul_ranges)

    # One entry per title: appending once per matching character inflated
    # the count that ratio() turns into a percentage.
    return [title for title in word
            if isinstance(title, str) and has_hangul(title)]
def create_figure2(self, datas):
    """Build the scatter figure of the Korean-titled music share per country.

    ``trending_date`` is converted to datetimes and a ``year`` column is
    added on *datas* itself, so the caller's dataframe is modified. For each
    of 2020, 2021 and 2022 the percentages returned by ``ratio`` are stored
    in one column of a table indexed by ``list_country_kor``.

    Args:
        datas: dataframe with a ``trending_date`` column plus the columns
            read by ``ratio``.

    Returns:
        The figure returned by ``px.scatter``; marker sizes are five times
        the 2020 percentages.
    """
    datas.trending_date = pd.to_datetime(datas.trending_date)
    datas['year'] = pd.DatetimeIndex(datas['trending_date']).year

    # One column of percentages per year, one row per country.
    shares = pd.DataFrame(index=list_country_kor)
    for year in (2020, 2021, 2022):
        shares[str(year)] = ratio(list_country_kor, datas[datas.year == year])

    marker_sizes = np.array(shares['2020']) * 5
    axis_labels = {
        "value": "Ratio in %",
        "index": "Country",
        "variable": "Year",
    }
    return px.scatter(
        shares,
        size=marker_sizes,
        hover_name=shares.index,
        title='Pourcentage de musique coréenne en tendance sur le total de musique en tendance pour chaque pays',
        opacity=0.6,
        labels=axis_labels,
    )
[0.75, 1.0], 'y': [0.33, 0.66]}, + {'x': [0.75, 1.0], 'y': [0.66, 1.0]} + ] + # make figure + fig_dict = { + "data": [], + "layout": {}, + "frames": [] + } + + # fill in most of layout + fig_dict["layout"]["title"] = "Évolution des proportions des catégories youtube en tendances dans le monde" + fig_dict["layout"]["height"] = 700 + fig_dict["layout"]["hovermode"] = "closest" + fig_dict["layout"]["updatemenus"] = [ + { + "buttons": [ + { + "args": [None, {"frame": {"duration": 1000, "redraw": True}, + "fromcurrent": True, "transition": {"duration": 300, + "easing": "quadratic-in-out"}}], + "label": "Play", + "method": "animate" + }, + { + "args": [[None], {"frame": {"duration": 0, "redraw": True}, + "mode": "immediate", + "transition": {"duration": 0}}], + "label": "Pause", + "method": "animate" + } + ], + "direction": "left", + "pad": {"r": 10, "t": 87}, + "showactive": False, + "type": "buttons", + "x": 0.1, + "xanchor": "right", + "y": 0, + "yanchor": "top" + } + ] + + sliders_dict = { + "active": 0, + "yanchor": "top", + "xanchor": "left", + "currentvalue": { + "font": {"size": 20}, + "visible": True, + "xanchor": "right" + }, + "transition": {"duration": 300, "easing": "cubic-in-out"}, + "pad": {"b": 10, "t": 50}, + "len": 0.9, + "x": 0.1, + "y": 0, + "steps": [] + } + + # make data + i = 0 + for country in countries: + res_filtered = self.filter_dates(result, dates[0], dates[1]) + pie = go.Pie(labels=res_filtered[res_filtered["country"] == country]["categoryId"], domain = domains[i], title=country, textinfo='none', hole=.6) + fig_dict["data"].append(pie) + i+=1 + + # make frames + for x in range(1,len(dates)-1): + frame = {"data": [], "name": str(dates[x])} + i = 0 + for country in countries: + res_filtered = self.filter_dates(result, dates[x], dates[x+1]) + pie = go.Pie(labels=res_filtered[res_filtered["country"] == country]["categoryId"], domain = domains[i], title=country, textinfo='none', hole=.6) + frame["data"].append(pie) + i+=1 + 
def divide_dates(self, start, end, N):
    """Return the N+1 boundary dates that cut [start, end] into N equal steps.

    Args:
        start: first date, formatted ``YYYY-MM-DD``.
        end: last date, formatted ``YYYY-MM-DD``.
        N: number of steps; the step is the floor division of the whole
            span by N, so the last boundary can fall before *end*.

    Returns:
        list of ``YYYY-MM-DD`` strings, starting with *start*.
    """
    first_day = datetime.strptime(start, '%Y-%m-%d')
    last_day = datetime.strptime(end, '%Y-%m-%d')
    step = (last_day - first_day) // N
    return [(first_day + k * step).strftime("%Y-%m-%d") for k in range(N + 1)]
def load_data(list_file, list_country):
    """Read every CSV of *list_file* and stack them, tagging rows by country.

    Args:
        list_file: paths (or file-like objects) accepted by
            ``pandas.read_csv``.
        list_country: label written in the ``country`` column for each entry
            of *list_file*, in the same order.

    Returns:
        pandas.DataFrame: the rows of all files with an extra ``country``
        column; each file keeps its own row index. An empty DataFrame when
        *list_file* is empty.

    Raises:
        IndexError: if *list_country* has fewer entries than *list_file*.
    """
    # Fail before any file is read instead of part-way through the loop.
    if len(list_country) < len(list_file):
        raise IndexError("list_country must provide one label per file")

    frames = []
    for path, country in zip(list_file, list_country):
        frame = pd.read_csv(path, sep=',')
        frame['country'] = country
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated rows for every file
    # and does not rely on how pandas treats an empty seed frame when it
    # works out the column dtypes.
    return pd.concat(frames)
def create_figure2(self, datas):
    """Build the scatter figure of the Korean-titled music share per country.

    ``trending_date`` is converted to datetimes and a ``year`` column is
    added on *datas* itself, so the caller's dataframe is modified. For each
    of 2020, 2021 and 2022 the percentages returned by ``ratio`` are stored
    in one column of a table indexed by ``list_country_kor``.

    Args:
        datas: dataframe with a ``trending_date`` column plus the columns
            read by ``ratio``.

    Returns:
        The figure returned by ``px.scatter``; marker sizes are five times
        the 2020 percentages.
    """
    datas.trending_date = pd.to_datetime(datas.trending_date)
    datas['year'] = pd.DatetimeIndex(datas['trending_date']).year

    # One column of percentages per year, one row per country.
    shares = pd.DataFrame(index=list_country_kor)
    for year in (2020, 2021, 2022):
        shares[str(year)] = ratio(list_country_kor, datas[datas.year == year])

    marker_sizes = np.array(shares['2020']) * 5
    axis_labels = {
        "value": "Ratio in %",
        "index": "Country",
        "variable": "Year",
    }
    return px.scatter(
        shares,
        size=marker_sizes,
        hover_name=shares.index,
        title='Pourcentage de musique coréenne en tendance sur le total de musique en tendance pour chaque pays',
        opacity=0.6,
        labels=axis_labels,
    )
[0.66, 1.0]}, + {'x': [0.25, 0.5], 'y': [0.0, 0.33]}, + {'x': [0.25, 0.5], 'y': [0.33, 0.66]}, + {'x': [0.25, 0.5], 'y': [0.66, 1.0]}, + {'x': [0.5, 0.75], 'y': [0.0, 0.33]}, + {'x': [0.5, 0.75], 'y': [0.33, 0.66]}, + {'x': [0.5, 0.75], 'y': [0.66, 1.0]}, + {'x': [0.75, 1.0], 'y': [0.0, 0.33]}, + {'x': [0.75, 1.0], 'y': [0.33, 0.66]}, + {'x': [0.75, 1.0], 'y': [0.66, 1.0]} + ] + #countries = ["France", "Canada"] + # make figure + fig_dict = { + "data": [], + "layout": {}, + "frames": [] + } + + # fill in most of layout + fig_dict["layout"]["title"] = "Évolution des proportions des catégories youtube en tendances dans le monde" + fig_dict["layout"]["height"] = 700 + fig_dict["layout"]["hovermode"] = "closest" + fig_dict["layout"]["updatemenus"] = [ + { + "buttons": [ + { + "args": [None, {"frame": {"duration": 1000, "redraw": True}, + "fromcurrent": True, "transition": {"duration": 300, + "easing": "quadratic-in-out"}}], + "label": "Play", + "method": "animate" + }, + { + "args": [[None], {"frame": {"duration": 0, "redraw": True}, + "mode": "immediate", + "transition": {"duration": 0}}], + "label": "Pause", + "method": "animate" + } + ], + "direction": "left", + "pad": {"r": 10, "t": 87}, + "showactive": False, + "type": "buttons", + "x": 0.1, + "xanchor": "right", + "y": 0, + "yanchor": "top" + } + ] + + sliders_dict = { + "active": 0, + "yanchor": "top", + "xanchor": "left", + "currentvalue": { + "font": {"size": 20}, + "visible": True, + "xanchor": "right" + }, + "transition": {"duration": 300, "easing": "cubic-in-out"}, + "pad": {"b": 10, "t": 50}, + "len": 0.9, + "x": 0.1, + "y": 0, + "steps": [] + } + + # make data + i = 0 + for country in countries: + res_filtered = self.filter_dates(result, dates[0], dates[1]) + pie = go.Pie(labels=res_filtered[res_filtered["country"] == country]["categoryId"], domain = domains[i], title=country, textinfo='none', hole=.6) + fig_dict["data"].append(pie) + i+=1 + + # make frames + for x in range(1,len(dates)-1): + frame = 
{"data": [], "name": str(dates[x])} + i = 0 + for country in countries: + res_filtered = self.filter_dates(result, dates[x], dates[x+1]) + pie = go.Pie(labels=res_filtered[res_filtered["country"] == country]["categoryId"], domain = domains[i], title=country, textinfo='none', hole=.6) + frame["data"].append(pie) + i+=1 + fig_dict["frames"].append(frame) + slider_step = {"args": [ + [dates[x]], + {"frame": {"duration": 300, "redraw": True}, + "mode": "immediate", + "transition": {"duration": 300}} + ], + "label": dates[x], + "method": "animate"} + sliders_dict["steps"].append(slider_step) + + fig_dict["layout"]["sliders"] = [sliders_dict] + + fig = go.Figure(fig_dict) + return fig + + def create_dataframe(self): + list_file = ["category/archive/BR_youtube_trending_data.csv", + "category/archive/CA_youtube_trending_data.csv", + "category/archive/DE_youtube_trending_data.csv", + "category/archive/FR_youtube_trending_data.csv", + "category/archive/GB_youtube_trending_data.csv", + "category/archive/IN_youtube_trending_data.csv", + "category/archive/JP_youtube_trending_data.csv", + "category/archive/KR_youtube_trending_data.csv", + "category/archive/MX_youtube_trending_data.csv", + "category/archive/RU_youtube_trending_data.csv", + "category/archive/US_youtube_trending_data.csv"] + + data = load_data(list_file, list_country) + data.drop(columns=['channelId', 'description','thumbnail_link','video_id'], inplace=True, axis=1) + + result = data.copy() + result['publishedAt'] = pd.to_datetime(result['publishedAt'], format='%Y-%m-%d') + replace_categories = { 2: 'Autos & Vehicles', + 1: 'Film & Animation', + 10: 'Music', + 15: 'Pets & Animals', + 17: 'Sports', + 18: 'Short Movies', + 19: 'Travel & Events', + 20: 'Gaming', + 21: 'Videoblogging', + 22: 'People & Blogs', + 23: 'Comedy', + 24: 'Entertainment', + 25: 'News & Politics', + 26: 'Howto & Style', + 27: 'Education', + 28: 'Science & Technology', + 29: 'Nonprofits & Activism'} + + df = result.replace({"categoryId": 
def filter_dates(self, result, start_date, end_date):
    """Return the rows of *result* trending between two dates, bounds included.

    Args:
        result: dataframe with a ``trending_date`` column.
        start_date: lower bound, compared with ``>=`` against the column.
        end_date: upper bound, compared with ``<=`` against the column.

    Returns:
        The matching rows of *result*, with their original index.
    """
    trending = result["trending_date"]
    in_window = (trending >= start_date) & (trending <= end_date)
    return result[in_window]
def ratio(list_country_kor, datas_year):
    """Return, per country, the percentage of music videos with a Korean title.

    Args:
        list_country_kor: country labels to evaluate, matched against the
            ``country`` column of *datas_year*.
        datas_year: dataframe with ``country``, ``categoryId`` and ``title``
            columns.

    Returns:
        list: one percentage per entry of *list_country_kor*, in the same
        order; 0.0 for a country that has no 'Music' row in *datas_year*.
    """
    percentages = []
    for country in list_country_kor:
        country_data = datas_year[datas_year.country == country]
        country_music = country_data[country_data.categoryId == 'Music']
        if country_music.empty:
            # No music video for this country in the slice: report 0 %
            # instead of dividing by zero.
            percentages.append(0.0)
            continue
        country_korean = is_korean(country_music.title)
        percentages.append(len(country_korean) * 100 / len(country_music))
    return percentages
def load_data(list_file, list_country):
    """Read every CSV of *list_file* and stack them, tagging rows by country.

    Args:
        list_file: paths (or file-like objects) accepted by
            ``pandas.read_csv``.
        list_country: label written in the ``country`` column for each entry
            of *list_file*, in the same order.

    Returns:
        pandas.DataFrame: the rows of all files with an extra ``country``
        column; each file keeps its own row index. An empty DataFrame when
        *list_file* is empty.

    Raises:
        IndexError: if *list_country* has fewer entries than *list_file*.
    """
    # Fail before any file is read instead of part-way through the loop.
    if len(list_country) < len(list_file):
        raise IndexError("list_country must provide one label per file")

    frames = []
    for path, country in zip(list_file, list_country):
        frame = pd.read_csv(path, sep=',')
        frame['country'] = country
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated rows for every file
    # and does not rely on how pandas treats an empty seed frame when it
    # works out the column dtypes.
    return pd.concat(frames)
def create_figure2(self, datas):
    """Build the scatter figure of the Korean-titled music share per country.

    ``trending_date`` is converted to datetimes and a ``year`` column is
    added on *datas* itself, so the caller's dataframe is modified. For each
    of 2020, 2021 and 2022 the percentages returned by ``ratio`` are stored
    in one column of a table indexed by ``list_country_kor``.

    Args:
        datas: dataframe with a ``trending_date`` column plus the columns
            read by ``ratio``.

    Returns:
        The figure returned by ``px.scatter``; marker sizes are five times
        the 2020 percentages.
    """
    datas.trending_date = pd.to_datetime(datas.trending_date)
    datas['year'] = pd.DatetimeIndex(datas['trending_date']).year

    # One column of percentages per year, one row per country.
    shares = pd.DataFrame(index=list_country_kor)
    for year in (2020, 2021, 2022):
        shares[str(year)] = ratio(list_country_kor, datas[datas.year == year])

    marker_sizes = np.array(shares['2020']) * 5
    axis_labels = {
        "value": "Ratio in %",
        "index": "Country",
        "variable": "Year",
    }
    return px.scatter(
        shares,
        size=marker_sizes,
        hover_name=shares.index,
        title='Pourcentage de musique coréenne en tendance sur le total de musique en tendance pour chaque pays',
        opacity=0.6,
        labels=axis_labels,
    )
{'x': [0.75, 1.0], 'y': [0.33, 0.66]}, + {'x': [0.75, 1.0], 'y': [0.66, 1.0]} + ] + #countries = ["France", "Canada"] + # make figure + fig_dict = { + "data": [], + "layout": {}, + "frames": [] + } + + # fill in most of layout + fig_dict["layout"]["title"] = "Évolution des proportions des catégories youtube en tendances dans le monde" + fig_dict["layout"]["height"] = 700 + fig_dict["layout"]["hovermode"] = "closest" + fig_dict["layout"]["updatemenus"] = [ + { + "buttons": [ + { + "args": [None, {"frame": {"duration": 1000, "redraw": True}, + "fromcurrent": True, "transition": {"duration": 300, + "easing": "quadratic-in-out"}}], + "label": "Play", + "method": "animate" + }, + { + "args": [[None], {"frame": {"duration": 0, "redraw": True}, + "mode": "immediate", + "transition": {"duration": 0}}], + "label": "Pause", + "method": "animate" + } + ], + "direction": "left", + "pad": {"r": 10, "t": 87}, + "showactive": False, + "type": "buttons", + "x": 0.1, + "xanchor": "right", + "y": 0, + "yanchor": "top" + } + ] + + sliders_dict = { + "active": 0, + "yanchor": "top", + "xanchor": "left", + "currentvalue": { + "font": {"size": 20}, + "visible": True, + "xanchor": "right" + }, + "transition": {"duration": 300, "easing": "cubic-in-out"}, + "pad": {"b": 10, "t": 50}, + "len": 0.9, + "x": 0.1, + "y": 0, + "steps": [] + } + + # make data + i = 0 + for country in countries: + res_filtered = self.filter_dates(result, dates[0], dates[1]) + pie = go.Pie(labels=res_filtered[res_filtered["country"] == country]["categoryId"], domain = domains[i], title=country, textinfo='none', hole=.6) + fig_dict["data"].append(pie) + i+=1 + + # make frames + for x in range(1,len(dates)-1): + frame = {"data": [], "name": str(dates[x])} + i = 0 + for country in countries: + res_filtered = self.filter_dates(result, dates[x], dates[x+1]) + pie = go.Pie(labels=res_filtered[res_filtered["country"] == country]["categoryId"], domain = domains[i], title=country, textinfo='none', hole=.6) + 
def create_dataframe(self):
    """Load the per-country trending CSVs into one cleaned dataframe.

    The eleven files under ``category/archive/`` are read with
    ``load_data`` (labelled with the module-level ``list_country``), four
    unused columns are dropped, ``publishedAt`` is parsed to datetimes and
    the numeric ``categoryId`` values are replaced by category names.

    Returns:
        pandas.DataFrame: the combined, cleaned data.
    """
    csv_paths = [
        "category/archive/BR_youtube_trending_data.csv",
        "category/archive/CA_youtube_trending_data.csv",
        "category/archive/DE_youtube_trending_data.csv",
        "category/archive/FR_youtube_trending_data.csv",
        "category/archive/GB_youtube_trending_data.csv",
        "category/archive/IN_youtube_trending_data.csv",
        "category/archive/JP_youtube_trending_data.csv",
        "category/archive/KR_youtube_trending_data.csv",
        "category/archive/MX_youtube_trending_data.csv",
        "category/archive/RU_youtube_trending_data.csv",
        "category/archive/US_youtube_trending_data.csv",
    ]
    unused_columns = ['channelId', 'description', 'thumbnail_link', 'video_id']

    # drop() without inplace already hands back a new frame.
    cleaned = load_data(csv_paths, list_country).drop(columns=unused_columns)
    cleaned['publishedAt'] = pd.to_datetime(cleaned['publishedAt'], format='%Y-%m-%d')

    # Numeric category id -> readable category name.
    category_names = {
        1: 'Film & Animation',
        2: 'Autos & Vehicles',
        10: 'Music',
        15: 'Pets & Animals',
        17: 'Sports',
        18: 'Short Movies',
        19: 'Travel & Events',
        20: 'Gaming',
        21: 'Videoblogging',
        22: 'People & Blogs',
        23: 'Comedy',
        24: 'Entertainment',
        25: 'News & Politics',
        26: 'Howto & Style',
        27: 'Education',
        28: 'Science & Technology',
        29: 'Nonprofits & Activism',
    }
    return cleaned.replace({"categoryId": category_names})
def filter_dates(self, result, start_date, end_date):
    """Return the rows of *result* trending between two dates, bounds included.

    Args:
        result: dataframe with a ``trending_date`` column.
        start_date: lower bound, compared with ``>=`` against the column.
        end_date: upper bound, compared with ``<=`` against the column.

    Returns:
        The matching rows of *result*, with their original index.
    """
    trending = result["trending_date"]
    in_window = (trending >= start_date) & (trending <= end_date)
    return result[in_window]
# Display labels for the CSV files, in the same order as ``list_file``
# (``load_data`` pairs the two lists by index).
list_country = ["Brésil", "Canada", "Allemagne", "France",
                "Royaume-Uni", "Inde", "Japon", "Corée", "Méxique", "Russie", "US"]
# Countries passed to ``create_figure``: every label except Korea itself.
# Derived from ``list_country`` so the two lists cannot drift apart.
list_country_kor = [country for country in list_country if country != "Corée"]
def is_korean(word):
    """Return the titles that contain at least one Hangul character.

    Args:
        word: Iterable of titles (``ratio`` passes the ``title`` column of
            the filtered frame). Entries that are not strings, such as NaN
            for a missing title, are skipped.

    Returns:
        A list holding each matching title exactly once, in input order,
        so that ``len()`` of the result is a count of titles rather than
        a count of Hangul characters.
    """
    # Inclusive code-point ranges: the Hangul compatibility jamo letters
    # and the whole, contiguous Hangul Syllables block.
    hangul_ranges = ((0x3131, 0x3163), (0xAC00, 0xD7A3))
    return [
        title
        for title in word
        if isinstance(title, str)
        and any(
            low <= ord(char) <= high
            for char in title
            for low, high in hangul_ranges
        )
    ]
def divide_dates(start, end, N):
    """Return ``N + 1`` evenly spaced dates covering ``start`` to ``end``.

    Args:
        start: First date, formatted ``YYYY-MM-DD``.
        end: Last date, formatted ``YYYY-MM-DD``.
        N: Number of equal intervals between the two dates.

    Returns:
        A list of ``YYYY-MM-DD`` strings whose first entry is ``start``
        and whose last entry is exactly ``end``. A negative ``N`` gives an
        empty list.

    Raises:
        ValueError: If either date string does not match ``YYYY-MM-DD``.
        ZeroDivisionError: If ``N`` is 0.
    """
    origin = datetime.strptime(start, '%Y-%m-%d')
    total = datetime.strptime(end, '%Y-%m-%d') - origin
    boundaries = []
    for idx in range(N + 1):
        # ``total * idx / N`` is exact when idx == N. Multiplying a floored
        # step (``total // N``) instead can fall a few microseconds short
        # and format the boundary as the previous day.
        boundaries.append((origin + total * idx / N).strftime("%Y-%m-%d"))
    return boundaries