import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
1 Analysis of French PhD theses
1.1 Load data and libraries
# Apply the project-specific matplotlib style sheet to every figure below.
plt.style.use("../bioinfo-fr.mplstyle")

# Load the filtered theses table. The first row of the CSV holds the column
# names (header=0); fields are comma-separated and quoted with double quotes.
# NOTE(review): pandas emits a DtypeWarning (mixed types in some columns) on
# this read; passing explicit dtypes or low_memory=False would silence it.
df = pd.read_csv(
    "../tmp/theses-soutenues-filtered.csv", quotechar='"', sep=",", header=0
)

# Parse the defense date column into pandas datetime values.
df["date_soutenance"] = pd.to_datetime(df["date_soutenance"])

# Preview the first rows (displayed when run as the last expression of a
# notebook cell; has no visible effect in a plain script).
df.head()
/tmp/ipykernel_66465/3583511864.py:2: DtypeWarning: Columns (5,6,7) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv(
auteurs.0.idref | auteurs.0.nom | auteurs.0.prenom | date_soutenance | directeurs_these.6.idref | directeurs_these.6.nom | directeurs_these.6.prenom | discipline.en | discipline.fr | langue | resumes.en | resumes.fr | titres.en | titres.fr | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 076645665 | Wu | Tao | 2003-01-01 | NaN | NaN | NaN | NaN | Physique | en | In the most central collisions of Pb-Pb nuclei... | Dans les collisions les plus centrales d'ions ... | Phi and omega production in Pb-Pb collisions a... | NaN |
1 | 102611777 | Simonin | Clémence | 2011-11-09 | NaN | NaN | NaN | NaN | Neurosciences | fr | Objectives: First, to study the mid- and long ... | Objectifs: D’une part étudier les effets de la... | From deep brain stimulation to physiopathologi... | De la stimulation cérébrale profonde à l’étude... |
2 | 198371845 | Poupon | Lenaïc | 2017-02-15 | NaN | NaN | NaN | NaN | Psychologie | fr | This thesis aims to study the dimensions that ... | Cette thèse vise à étudier les conditions psyc... | Acceptation of the electric car : study of a p... | L'acceptation de la voiture électrique : étude... |
3 | 251153770 | Snider-Giovannone | Marie-Noëlle | 2015-12-15 | NaN | NaN | NaN | NaN | Histoire moderne et contemporaine | fr | The title of my thesis: The Allied and Associa... | Cette thèse intitulée : Les Forces alliées et ... | The Allied and Associated Forces in the Far Ea... | Les Forces alliées et associées en Extrême-Ori... |
4 | 158874897 | Teixeira | Cédric | 2011-11-21 | NaN | NaN | NaN | NaN | Histoire du droit | fr | With the appearance of several sources of obli... | Avec l’apparition de plusieurs sources d’oblig... | Classification of sources of obligations from ... | La classification des sources des obligations ... |
1.2 Distribution of Theses Defense Dates
# Year range covered by the plot: theses defended from 2000 onward.
start_year = 2000
current_year = 2023  # For the dataset we have.

# Load the precomputed per-year distribution; the plot below reads its
# "year" and "count" columns.
df_after = pd.read_csv("../tmp/year_distribution.csv")

# Bar chart of the number of defended theses per year.
plt.figure()
plt.bar(df_after["year"], df_after["count"], color="C0", zorder=3, align="edge")
plt.xlabel("Année")
plt.ylabel("Nombre de thèses soutenues")
plt.title("Thèses soutenues depuis 2000 par année")
plt.show()
1.3 Distribution of Theses by Discipline
Code inspiré de https://github.com/richarddelome/theses_fr/
# Count theses per French discipline label:
#   - explode() expands list-like cells into one row per element (scalar
#     cells pass through unchanged),
#   - value_counts() tallies each label, most frequent first,
#   - [:20] keeps the 20 most frequent labels,
#   - sort_values(ascending=True) reorders them from least to most frequent.
df_discipline = (
    df["discipline.fr"].explode().value_counts()[:20].sort_values(ascending=True)
)

# Preview the first entries (the smallest counts among the retained labels).
df_discipline.head()
Terre, océan, espace 3481
Sciences de l'éducation 3594
Géographie 4391
Philosophie 4616
Droit 4732
Name: discipline.fr, dtype: int64
# Horizontal bar chart of thesis counts per discipline: one bar per index
# label of df_discipline, with bar length given by the matching count.
plt.figure()
plt.barh(
    df_discipline.index,
    df_discipline.values,
    color="C0",
    zorder=3,  # same z-order as the other bar chart in this file
    align="edge",
)
plt.show()