From 51b89041fba09f5c8abc282a09ac51c467529bb2 Mon Sep 17 00:00:00 2001 From: Martial Simon Date: Mon, 2 Mar 2026 13:11:18 +0100 Subject: feat: TP2 dataviz --- dataviz/TP2_DataViz_24_02_26.ipynb | 2623 ++++++++++++++++++++++++++++++++++++ 1 file changed, 2623 insertions(+) create mode 100644 dataviz/TP2_DataViz_24_02_26.ipynb (limited to 'dataviz/TP2_DataViz_24_02_26.ipynb') diff --git a/dataviz/TP2_DataViz_24_02_26.ipynb b/dataviz/TP2_DataViz_24_02_26.ipynb new file mode 100644 index 0000000..f9a94b3 --- /dev/null +++ b/dataviz/TP2_DataViz_24_02_26.ipynb @@ -0,0 +1,2623 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "92304ea2", + "metadata": { + "id": "92304ea2", + "outputId": "255b408c-cdd9-47bc-8c71-c86572188d2d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Name Miles_per_Gallon Cylinders Displacement \\\n", + "0 chevrolet chevelle malibu 18.0 8 307.0 \n", + "1 buick skylark 320 15.0 8 350.0 \n", + "2 plymouth satellite 18.0 8 318.0 \n", + "3 amc rebel sst 16.0 8 304.0 \n", + "4 ford torino 17.0 8 302.0 \n", + "\n", + " Horsepower Weight_in_lbs Acceleration Year Origin \n", + "0 130.0 3504 12.0 1970-01-01 USA \n", + "1 165.0 3693 11.5 1970-01-01 USA \n", + "2 150.0 3436 11.0 1970-01-01 USA \n", + "3 150.0 3433 12.0 1970-01-01 USA \n", + "4 140.0 3449 10.5 1970-01-01 USA " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameMiles_per_GallonCylindersDisplacementHorsepowerWeight_in_lbsAccelerationYearOrigin
0chevrolet chevelle malibu18.08307.0130.0350412.01970-01-01USA
1buick skylark 32015.08350.0165.0369311.51970-01-01USA
2plymouth satellite18.08318.0150.0343611.01970-01-01USA
3amc rebel sst16.08304.0150.0343312.01970-01-01USA
4ford torino17.08302.0140.0344910.51970-01-01USA
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "cars", + "summary": "{\n \"name\": \"cars\",\n \"rows\": 406,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 311,\n \"samples\": [\n \"amc concord dl\",\n \"amc ambassador dpl\",\n \"plymouth cricket\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Miles_per_Gallon\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7.815984312565782,\n \"min\": 9.0,\n \"max\": 46.6,\n \"num_unique_values\": 129,\n \"samples\": [\n 17.7,\n 30.5,\n 30.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cylinders\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 3,\n \"max\": 8,\n \"num_unique_values\": 5,\n \"samples\": [\n 4,\n 5,\n 6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Displacement\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 104.92245837948877,\n \"min\": 68.0,\n \"max\": 455.0,\n \"num_unique_values\": 83,\n \"samples\": [\n 258.0,\n 307.0,\n 107.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Horsepower\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 38.76877918310522,\n \"min\": 46.0,\n \"max\": 230.0,\n \"num_unique_values\": 93,\n \"samples\": [\n 155.0,\n 200.0,\n 83.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Weight_in_lbs\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 847,\n \"min\": 1613,\n \"max\": 5140,\n \"num_unique_values\": 356,\n \"samples\": [\n 2051,\n 3288,\n 2405\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Acceleration\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.803358816342545,\n \"min\": 8.0,\n \"max\": 24.8,\n \"num_unique_values\": 96,\n \"samples\": [\n 
17.9,\n 12.9,\n 14.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Year\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"1970-01-01 00:00:00\",\n \"max\": \"1982-01-01 00:00:00\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"1980-01-01 00:00:00\",\n \"1979-01-01 00:00:00\",\n \"1970-01-01 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Origin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"USA\",\n \"Europe\",\n \"Japan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "import altair as alt\n", + "from vega_datasets import data\n", + "import pandas as pd\n", + "cars = data.cars()\n", + "cars.head()" + ] + }, + { + "cell_type": "markdown", + "id": "83fa262b", + "metadata": { + "id": "83fa262b" + }, + "source": [ + "Dès l'import du dataset, on peut commencer à visualiser les données brutes sous forme de tableau." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f988b49f-c78b-4fe1-adee-1cf99a1a1dd8", + "metadata": { + "id": "f988b49f-c78b-4fe1-adee-1cf99a1a1dd8", + "outputId": "ebb3906e-8986-4841-aa84-2afa7018f1cd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "source = pd.DataFrame.from_records(\n", + " [\n", + " {\n", + " \"x\": 0.5,\n", + " \"y\": 0.5,\n", + " \"img\": \"https://i.kym-cdn.com/entries/icons/original/000/050/684/cover2.jpg\",\n", + " },\n", + " ]\n", + ")\n", + "\n", + "alt.Chart(source).mark_image(width=1920, height=1080).encode(x=\"x\", y=\"y\", url=\"img\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06a146c7", + "metadata": { + "id": "06a146c7", + "outputId": "a4400c08-a540-4d89-b235-3d56cc1e95cf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 123 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "cars1 = cars.iloc[:1]\n", + "alt.Chart(cars1).mark_point().to_dict()\n", + "alt.Chart(cars).mark_bar().encode(\n", + " x='mean(Weight_in_lbs)',\n", + " y='Origin'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fec7d694-1416-402b-9796-e541f848d011", + "metadata": { + "id": "fec7d694-1416-402b-9796-e541f848d011" + }, + "source": [ + "Cet histogramme montre le poids moyen des voitures selon leur origine. On constate sans grande surprise que les voitures américaines sont considérablement plus lourdes que les européennes et japonaises, ces dernières étant en moyenne moins lourdes que les européennes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5346513c", + "metadata": { + "id": "5346513c", + "outputId": "f9413903-3e10-4c87-88d6-32c01d2c4824", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "alt.Chart(cars).mark_point().encode(\n", + " x='Cylinders',\n", + " y='Weight_in_lbs'\n", + ")\n", + "\n", + "alt.Chart(cars).mark_boxplot().encode(\n", + " x=alt.X('Cylinders', scale=alt.Scale(domain=[2, 9])),\n", + " y='Weight_in_lbs',\n", + "\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c0553928-ecc1-42f9-a434-168edbe161b8", + "metadata": { + "id": "c0553928-ecc1-42f9-a434-168edbe161b8" + }, + "source": [ + "Ce graphique montre la répartition des poids des voitures en fonction de leur nombre de cylindres: on observe que les voitures avec plus de cylindres sont généralement plus lourdes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "937e0a5b", + "metadata": { + "id": "937e0a5b", + "outputId": "f5be60ae-65bf-4a76-a47f-6db37e18876c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "alt.Chart(cars).mark_point().encode(\n", + " x='Horsepower',\n", + " y='Miles_per_Gallon',\n", + " color='Origin'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "dc497eff-69af-4475-9db3-1f336b91d27c", + "metadata": { + "id": "dc497eff-69af-4475-9db3-1f336b91d27c" + }, + "source": [ + "Il est cohérent de constater dans ce graphique que des voitures plus puissantes consomment plus de carburant et parcourent donc moins de distance par gallon de carburant. On constate aussi que les voitures les plus puissantes, donc les plus consommatrices, sont de facture américaine." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa3fa68d", + "metadata": { + "id": "aa3fa68d", + "outputId": "93986253-a5f8-4399-e698-eb67818d58a6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "source": [ + "alt.Chart(cars).mark_line().encode(\n", + " x='Year',\n", + " y='mean(Miles_per_Gallon)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "31e167e3-1094-4c69-8fa0-294863052216", + "metadata": { + "id": "31e167e3-1094-4c69-8fa0-294863052216" + }, + "source": [ + "Ce graphique montre une augmentation globale de la distance parcourable sur un gallon de carburant au fil des années, c'est-à-dire une baisse de la consommation des voitures, probablement grâce aux progrès technologiques." + ] + }, + { + "cell_type": "markdown", + "id": "8a05f6c2-91f4-429c-9f78-4472ea76ffc2", + "metadata": { + "id": "8a05f6c2-91f4-429c-9f78-4472ea76ffc2" + }, + "source": [ + "# Exercice II : Binning et aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49c28012-ab3e-4a4e-a374-592060d0eec7", + "metadata": { + "id": "49c28012-ab3e-4a4e-a374-592060d0eec7", + "outputId": "3ab58f83-03c4-4f43-9b87-d8668ed575ea", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "binned = alt.Chart(cars).transform_bin(\n", + " as_=['hp_start','hp_end'],\n", + " field='Horsepower',\n", + " bin=alt.Bin(maxbins=20)\n", + ").transform_calculate(\n", + " origin_num='indexof([\"Europe\",\"Japan\",\"USA\"], datum.Origin)'\n", + ").transform_calculate(\n", + " bin_width='datum.hp_end - datum.hp_start',\n", + " x_pos='datum.hp_start + datum.origin_num'\n", + ")\n", + "\n", + "binned.mark_bar(width=4).encode(\n", + " x=alt.X('x_pos:Q', title='Horsepower'),\n", + " y='count()',\n", + " color='Origin:N'\n", + ").properties(width=700, height=300)" + ] + }, + { + "cell_type": "markdown", + "id": "e0d707e4-0c55-47b5-b1a9-25ac676b3c48", + "metadata": { + "id": "e0d707e4-0c55-47b5-b1a9-25ac676b3c48" + }, + "source": [ + "Cet histogramme montre la répartition des puissances de moteurs de voitures pour chaque origine. On remarque premièrement que les USA fabriquent des moteurs avec des puissances excédant largement les européens et japonais, allant jusqu'à 220cv tandis que les autres fabricants ne vont que jusqu'à 130cv. Les européens produisent majoritairement des moteurs de 70cv, les japonais de 60cv, et les USA de 80cv." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "130ea819", + "metadata": { + "id": "130ea819", + "outputId": "43dac60a-31b3-44c1-ab71-4670a1e7dcf4", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "alt.Chart(cars).mark_bar().encode(\n", + " x=alt.X('Acceleration', bin=alt.Bin(maxbins=20)),\n", + " y='count()',\n", + " color='Cylinders'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "011e064b-ae2d-435b-8883-25606eeb6027", + "metadata": { + "id": "011e064b-ae2d-435b-8883-25606eeb6027" + }, + "source": [ + "Cet histogramme montre la distribution des valeurs d'accélération, colorée selon le nombre de cylindres du moteur. On note que les valeurs d'accélération les plus élevées correspondent aux moteurs ayant le moins de cylindres ; cette colonne mesurant a priori un temps (0 à 60 mph, en secondes), une valeur élevée indique une voiture qui accélère plus lentement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "877df56e-0397-4252-96d6-f05f44eabb4a", + "metadata": { + "id": "877df56e-0397-4252-96d6-f05f44eabb4a" + }, + "outputs": [], + "source": [ + "from vega_datasets import data\n", + "stocks = data.stocks()\n", + "stocks.head()\n", + "base = alt.Chart(stocks).encode(\n", + " x='date:T',\n", + " y='price:Q',\n", + " color='symbol:N'\n", + ").transform_filter(\n", + " alt.datum.symbol == 'GOOG'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5647a8a6-c99b-4171-a14d-51015ca52c3e", + "metadata": { + "id": "5647a8a6-c99b-4171-a14d-51015ca52c3e", + "outputId": "d278040d-7b3a-44f1-881a-880e36dade61", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 370 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "alt.hconcat(base.mark_line(),\n", + "base.mark_circle())" + ] + }, + { + "cell_type": "markdown", + "id": "069bd2e4-0074-408b-91fe-e7621539079c", + "metadata": { + "id": "069bd2e4-0074-408b-91fe-e7621539079c" + }, + "source": [ + "Ces graphiques montrent l'évolution du prix des actions d'Alphabet (GOOG) au fil des années. On note une augmentation globale entre 2005 et 2010, avec logiquement une baisse conséquente au cours de l'année 2008." + ] + }, + { + "cell_type": "markdown", + "id": "12e0ed6c-9cb7-4c35-b425-bdc65bf98866", + "metadata": { + "id": "12e0ed6c-9cb7-4c35-b425-bdc65bf98866" + }, + "source": [ + "# Exercice III : Interactivité" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10b22397-31de-41d9-b7da-50c8344c7a76", + "metadata": { + "id": "10b22397-31de-41d9-b7da-50c8344c7a76", + "outputId": "7483317e-db73-44cd-d387-7bc9a8e51b80", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "alt.Chart(cars).mark_point().encode(\n", + " x=\"Horsepower:Q\",\n", + " y=\"Miles_per_Gallon:Q\",\n", + " color=\"Origin\",\n", + " tooltip=[\"Name:N\", \"Horsepower:Q\", \"Miles_per_Gallon:Q\", \"Origin:N\"]\n", + ").interactive()" + ] + }, + { + "cell_type": "markdown", + "id": "31858600-5cd5-4eba-8f33-5c17c7e15edb", + "metadata": { + "id": "31858600-5cd5-4eba-8f33-5c17c7e15edb" + }, + "source": [ + "Ce graphique est identique à celui présenté au début du notebook, avec la fonctionnalité supplémentaire du zoom qui permet de regarder de plus près une certaine zone du graphique." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7b084dc-a5d5-4707-8714-a9d19a6d474e", + "metadata": { + "id": "f7b084dc-a5d5-4707-8714-a9d19a6d474e", + "outputId": "55aca8fc-8b86-40c7-9040-24b03345abcd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 654 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ], + "source": [ + "brush = alt.selection_interval(encodings=['x','y'])\n", + "\n", + "points = alt.Chart(cars).mark_point().encode(\n", + " x=alt.X('Horsepower:Q', title='Horsepower'),\n", + " y=alt.Y('Miles_per_Gallon:Q', title='Miles per Gallon'),\n", + " color=alt.condition(brush, 'Origin:N', alt.value('lightgray')),\n", + " tooltip=['Name:N','Horsepower:Q','Miles_per_Gallon:Q','Origin:N']\n", + ").add_params(brush).properties(width=520, height=400)\n", + "\n", + "hist = alt.Chart(cars).mark_bar().transform_filter(brush).encode(\n", + " y=alt.Y('Origin:N', title='Origin'),\n", + " x=alt.X('count()', title='Count'),\n", + ").properties(width=520, height=130)\n", + "\n", + "(points & hist)\n" + ] + }, + { + "cell_type": "markdown", + "id": "f2be0ab8-7e7b-4917-8891-c214d3a89d51", + "metadata": { + "id": "f2be0ab8-7e7b-4917-8891-c214d3a89d51" + }, + "source": [ + "Ce graphique interactif permet de visualiser dans un histogramme le nombre de voitures de chaque origine sélectionnée dans le graphique." 
+ ] + }, + { + "cell_type": "markdown", + "id": "72dc9be6-40d7-4124-9be7-7622b11e9a93", + "metadata": { + "id": "72dc9be6-40d7-4124-9be7-7622b11e9a93" + }, + "source": [ + "\n", + "# Exercice IV : IMDB\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa47711a-8cee-4fa2-95b1-37f17c60c14c", + "metadata": { + "id": "aa47711a-8cee-4fa2-95b1-37f17c60c14c" + }, + "outputs": [], + "source": [ + "from vega_datasets import data\n", + "movies = data.movies()\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "import numpy as np\n", + "\n", + "movies_clean = movies.copy()\n", + "numeric_cols = ['US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget',\n", + " 'Running_Time_min', 'Rotten_Tomatoes_Rating', 'IMDB_Rating', 'IMDB_Votes']\n", + "for col in numeric_cols:\n", + " if movies_clean[col].isnull().sum() > 0:\n", + " movies_clean[col] = movies_clean[col].fillna(movies_clean[col].median())\n", + "\n", + "categorical_cols = ['MPAA_Rating', 'Source', 'Major_Genre', 'Creative_Type', 'Director']\n", + "for col in categorical_cols:\n", + " if movies_clean[col].isnull().sum() > 0:\n", + " movies_clean[col] = movies_clean[col].fillna('Unknown')\n", + "\n", + "movies_clean['Release_Date'] = pd.to_datetime(movies_clean['Release_Date'])\n", + "movies_clean = movies_clean[movies_clean.Release_Date.dt.year <= 2010]\n", + "\n", + "np.random.seed(42)\n", + "cols_to_dummy = ['US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget',\n", + " 'Running_Time_min', 'IMDB_Rating', 'IMDB_Votes']\n", + "for col in cols_to_dummy:\n", + " nan_mask = movies_clean[col].isnull()\n", + " if nan_mask.sum() > 0:\n", + " mean = movies_clean[col].mean()\n", + " std = movies_clean[col].std()\n", + " dummy_values = np.random.normal(loc=mean, scale=std, size=nan_mask.sum())\n", + " if col in ['US_Gross', 'Worldwide_Gross', 'US_DVD_Sales', 'Production_Budget']:\n", + " dummy_values = np.clip(dummy_values, a_min=0, a_max=None)\n", + " 
movies_clean.loc[nan_mask, col] = dummy_values\n", + "\n", + "categorical_to_encode = ['MPAA_Rating', 'Major_Genre', 'Creative_Type']\n", + "movies_encoded = pd.get_dummies(movies_clean, columns=categorical_to_encode, drop_first=True)\n", + "\n", + "movies_clean['Year'] = movies_clean['Release_Date'].dt.year\n", + "\n", + "\n", + "movies = movies_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfc6359d-3359-4b9f-b47d-7e6f1a5096b9", + "metadata": { + "id": "dfc6359d-3359-4b9f-b47d-7e6f1a5096b9", + "outputId": "6c76bb59-e488-4ffb-8516-e43063491578", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 323 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ], + "source": [ + "cars1 = cars.iloc[:1]\n", + "alt.Chart(movies).mark_bar().encode(\n", + " y=alt.X('Major_Genre', sort=\"-x\"),\n", + " x='count()'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6df99e2a-204f-4117-993c-f8220ee2b862", + "metadata": { + "id": "6df99e2a-204f-4117-993c-f8220ee2b862" + }, + "source": [ + "Cet histogramme montre le nombre de films par genre principal dans le dataset. Le drame et la comédie sont les plus présents, devant les films d'action." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e291fc07-0cfc-4622-adea-07196d1e112b", + "metadata": { + "id": "e291fc07-0cfc-4622-adea-07196d1e112b", + "outputId": "3f7271e8-817c-4959-ae98-94886ff3f21d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 453 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ], + "source": [ + "cars1 = cars.iloc[:1]\n", + "alt.Chart(movies).mark_boxplot().encode(\n", + " x=alt.X('Major_Genre', sort=\"-x\"),\n", + " y=alt.Y('Worldwide_Gross', scale=alt.Scale(type='sqrt'))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "387c7a35-e0ea-40cc-929a-61db36f91179", + "metadata": { + "id": "387c7a35-e0ea-40cc-929a-61db36f91179" + }, + "source": [ + "Ce graphique présente les revenus générés par les films selon leur genre principal. Les films d'action et d'aventure engendrent généralement plus de revenus que les films d'autres genres." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ed27f34-c25a-4483-b4d9-6459da0e02bd", + "metadata": { + "id": "5ed27f34-c25a-4483-b4d9-6459da0e02bd", + "outputId": "994b5585-f50a-4332-ec3b-6758060c194b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 718 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "#print(movies['Production_Budget'])\n", + "#print(movies.groupby(['Year'])['Production_Budget'].mean())\n", + "dico = {\n", + " 'years': np.sort(movies[\"Year\"].unique()),\n", + " 'budget': movies.groupby(['Year'])['Production_Budget'].mean(),\n", + " 'max_budg': movies.groupby(['Year'])['Production_Budget'].max(),\n", + "}\n", + "\n", + "a = alt.Chart(pd.DataFrame(dico)).transform_fold(\n", + " ['budget','max_budg']\n", + ").mark_point().encode(\n", + " x=alt.X('years',scale=alt.Scale(domain=[min(dico[\"years\"]), max(dico[\"years\"])])),\n", + " y='value:Q',\n", + " color='key:N',\n", + ")\n", + "a.show()\n", + "\n", + "alt.Chart(movies).mark_point().encode(\n", + " x=alt.X('Year',scale=alt.Scale(domain=[min(dico[\"years\"]), max(dico[\"years\"])])),\n", + " y='Production_Budget'\n", + ").configure_mark(\n", + " opacity=0.1,\n", + " color='red'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Ces 2 graphiques montrent l'évolution des budgets des films au cours des années. On constate au passage une forte augmentation du nombre de films produits. Le budget moyen a plus que doublé depuis 1980, et le record du plus haut budget n'a cessé d'être brisé." + ], + "metadata": { + "id": "9V2xx5Fa9RDD" + }, + "id": "9V2xx5Fa9RDD" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac750d09-7f36-4cb3-a3f8-5827dcab278d", + "metadata": { + "id": "ac750d09-7f36-4cb3-a3f8-5827dcab278d", + "outputId": "ee1bfc4e-7ed6-427d-97ba-08187ab7b066", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 718 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ], + "source": [ + "\"\"\"alt.Chart(movies).mark_point().encode(\n", + " x=alt.X('Year',scale=alt.Scale(domain=[min(dico[\"years\"]), max(dico[\"years\"])])),\n", + " y='MPAA_Rating',\n", + " color='Major_Genre'\n", + ").configure_mark(\n", + " opacity=0.6,\n", + ")\"\"\"\n", + "\n", + "alt.Chart(movies).mark_point().encode(\n", + " x=alt.X('Year',scale=alt.Scale(domain=[min(dico[\"years\"]), max(dico[\"years\"])])),\n", + " y='Rotten_Tomatoes_Rating',\n", + " color='Major_Genre'\n", + ").configure_mark(\n", + " opacity=0.6,\n", + ").interactive().show()\n", + "\n", + "alt.Chart(movies).mark_point().encode(\n", + " x=alt.X('Year',scale=alt.Scale(domain=[min(dico[\"years\"]), max(dico[\"years\"])])),\n", + " y='IMDB_Rating',\n", + " color='Major_Genre'\n", + ").configure_mark(\n", + " opacity=0.6,\n", + ").interactive()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Ces graphiques affichent les notes des films sur les sites IMDB et Rotten Tomatoes, en fonction de leur année de sortie, ainsi que leur genre principal. La quantité massive de données ne permet pas d'identifier de genre au score plus élevé que les autres." + ], + "metadata": { + "id": "gNpFr13wekkn" + }, + "id": "gNpFr13wekkn" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9170640b-6e8d-4454-85a8-fefd48a64b57", + "metadata": { + "id": "9170640b-6e8d-4454-85a8-fefd48a64b57", + "outputId": "3409c1d1-c0a4-4935-9351-fb2f47d5dc77", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 482 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ], + "source": [ + "producer_counts = movies_clean['Director'].value_counts().reset_index()\n", + "producer_counts.columns = ['Director', 'Movie_Count']\n", + "producer_counts = producer_counts[producer_counts['Director'] != 'Unknown'].head(10)\n", + "activity = alt.Chart(producer_counts).mark_bar().encode(\n", + " x='Movie_Count:Q',\n", + " y=alt.Y('Director:N', sort='-x')\n", + ").properties(title='Top 10 Most Active Producers', width=600, height=400)\n", + "movies_clean['Profit'] = movies_clean['Worldwide_Gross'] - movies_clean['Production_Budget']\n", + "producer_profit = movies_clean.groupby('Director')['Profit'].mean().reset_index()\n", + "producer_profit = producer_profit[producer_profit['Director'] != 'Unknown'].sort_values(by='Profit', ascending=False).head(10)\n", + "profit = alt.Chart(producer_profit).mark_bar().encode(\n", + " x='Profit:Q',\n", + " y=alt.Y('Director:N', sort='-x')\n", + ").properties(title='Top 10 Producers by Average Profit', width=600, height=400)\n", + "\n", + "( profit | activity )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "David Yates est le réalisateur des films Harry Potter depuis le cinquième volet, ce qui explique sa place de numéro 1 en profit moyen.\n", + "\n", + "On remarque que les réalisateurs les plus actifs ne font très logiquement pas partie des plus lucratifs par film, malgré leur renommée." + ], + "metadata": { + "id": "UOtKCPk2KU7q" + }, + "id": "UOtKCPk2KU7q" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb116ef8-194a-4939-b143-c19a7b9eec8c", + "metadata": { + "scrolled": true, + "id": "eb116ef8-194a-4939-b143-c19a7b9eec8c", + "outputId": "e7a17b6c-18e6-4609-9945-78c7f68bdbe8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 736 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([2169836.8])" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], + "source": [ + "a = alt.Chart(movies).mark_point().encode(\n", + " x=alt.X('Running_Time_min'),\n", + " y='IMDB_Rating',\n", + " color='Major_Genre'\n", + ").configure_mark(\n", + " opacity=0.6,\n", + ")\n", + "a.show()\n", + "\n", + "a = alt.Chart(movies).mark_point().encode(\n", + " x=alt.X('Running_Time_min'),\n", + " y='Rotten_Tomatoes_Rating',\n", + " color='Major_Genre'\n", + ").configure_mark(\n", + " opacity=0.6,\n", + ")\n", + "a.show()\n", + "\n", + "import numpy as np\n", + "\n", + "np.correlate(movies['IMDB_Rating'], movies['Running_Time_min'])" + ] + }, + { + "cell_type": "markdown", + "id": "20485d20-5c5f-4b42-8935-7e812af9b101", + "metadata": { + "id": "20485d20-5c5f-4b42-8935-7e812af9b101" + }, + "source": [ + "En plissant les yeux, on observe qu'il y a assez peu de films très longs (> 2h) qui obtiennent de vraiment mauvaises notes, notamment sur IMDB." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f72872a-a405-44a9-8074-fddc2610935b", + "metadata": { + "id": "9f72872a-a405-44a9-8074-fddc2610935b", + "outputId": "8f2a7b52-f0cf-4299-d0ef-b93adf54b960", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 368 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ], + "source": [ + "fat_corr = pd.concat([movies_clean[movies_clean.IMDB_Rating < 4], movies_clean[movies_clean.IMDB_Rating > 8]], ignore_index=True)\n", + "\n", + "alt.Chart(fat_corr).mark_point(filled=True).encode(\n", + " x=alt.Y('Worldwide_Gross', scale=alt.Scale(type='sqrt')),\n", + " y=alt.Y('Production_Budget', scale=alt.Scale(type='sqrt')),\n", + " color=alt.Color(field=\"IMDB_Rating\", bin=alt.Bin(maxbins=10), scale={\"range\":\"diverging\"}),\n", + ").configure_mark(\n", + " opacity=0.8,\n", + ").interactive()" + ] + }, + { + "cell_type": "markdown", + "id": "d29b8a7f-c8ca-4e82-a7ee-bcfacf7c2b42", + "metadata": { + "id": "d29b8a7f-c8ca-4e82-a7ee-bcfacf7c2b42" + }, + "source": [ + "On observe que les films les moins bien notés sont des films à haut budget qui engrangent le moins de profit, ce qui semble logique (ce sont les plus médiatisés, et donc les plus décevants). On observe également que des films de tous budgets sont bien notés, mais que parmi les films à grosse production, seuls ceux ayant engrangé le plus de profits le sont.\n", + "\n", + "On peut en déduire que plus le budget est élevé, plus les attentes de la critique et du public sont élevées et difficiles à satisfaire." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d513fa8-51b5-4eae-9ea6-000e502caed3", + "metadata": { + "id": "7d513fa8-51b5-4eae-9ea6-000e502caed3" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.3" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file -- cgit v1.2.3