import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

1 Analysis of French PhD theses
1.1 Load data and libraries
# Set the plotting style
plt.style.use("../bioinfo-fr.mplstyle")

# Load the dataframe.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the default chunked inference is what triggers
# the "DtypeWarning: Columns (5,6,7) have mixed types" message on this CSV.
df = pd.read_csv(
    "../tmp/theses-soutenues-filtered.csv",
    quotechar='"',
    sep=",",
    header=0,
    low_memory=False,
)
# Convert the defense dates read from the CSV into pandas datetimes.
df["date_soutenance"] = pd.to_datetime(df["date_soutenance"])
df.head()
/tmp/ipykernel_66465/3583511864.py:2: DtypeWarning: Columns (5,6,7) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv(
|  | auteurs.0.idref | auteurs.0.nom | auteurs.0.prenom | date_soutenance | directeurs_these.6.idref | directeurs_these.6.nom | directeurs_these.6.prenom | discipline.en | discipline.fr | langue | resumes.en | resumes.fr | titres.en | titres.fr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 076645665 | Wu | Tao | 2003-01-01 | NaN | NaN | NaN | NaN | Physique | en | In the most central collisions of Pb-Pb nuclei... | Dans les collisions les plus centrales d'ions ... | Phi and omega production in Pb-Pb collisions a... | NaN |
| 1 | 102611777 | Simonin | Clémence | 2011-11-09 | NaN | NaN | NaN | NaN | Neurosciences | fr | Objectives: First, to study the mid- and long ... | Objectifs: D’une part étudier les effets de la... | From deep brain stimulation to physiopathologi... | De la stimulation cérébrale profonde à l’étude... |
| 2 | 198371845 | Poupon | Lenaïc | 2017-02-15 | NaN | NaN | NaN | NaN | Psychologie | fr | This thesis aims to study the dimensions that ... | Cette thèse vise à étudier les conditions psyc... | Acceptation of the electric car : study of a p... | L'acceptation de la voiture électrique : étude... |
| 3 | 251153770 | Snider-Giovannone | Marie-Noëlle | 2015-12-15 | NaN | NaN | NaN | NaN | Histoire moderne et contemporaine | fr | The title of my thesis: The Allied and Associa... | Cette thèse intitulée : Les Forces alliées et ... | The Allied and Associated Forces in the Far Ea... | Les Forces alliées et associées en Extrême-Ori... |
| 4 | 158874897 | Teixeira | Cédric | 2011-11-21 | NaN | NaN | NaN | NaN | Histoire du droit | fr | With the appearance of several sources of obli... | Avec l’apparition de plusieurs sources d’oblig... | Classification of sources of obligations from ... | La classification des sources des obligations ... |
1.2 Distribution of Thesis Defense Dates
# Select only theses defended from 2000 onwards.
# NOTE(review): neither constant is referenced in this cell; the year range is
# presumably already applied in the precomputed CSV loaded below — confirm.
start_year = 2000
current_year = 2023  # For the dataset we have.

# Load precomputed dataframe (read below through its "year" and "count" columns).
df_after = pd.read_csv("../tmp/year_distribution.csv")

# Bar chart of the number of defended theses per year.
plt.figure()
# align="edge" anchors the left edge of each bar on its year value instead of
# centering the bar on it.
plt.bar(df_after["year"], df_after["count"], color="C0", zorder=3, align="edge")
plt.xlabel("Année")
plt.ylabel("Nombre de thèses soutenues")
plt.title("Thèses soutenues depuis 2000 par année")
plt.show()
1.3 Distribution of Theses by Discipline
Code inspiré de https://github.com/richarddelome/theses_fr/
# Count how many theses carry each French discipline label, keep the 20 most
# frequent labels (value_counts() returns them in descending order), then
# re-sort ascending for the horizontal bar chart drawn afterwards.
# explode() only changes rows whose cell is list-like; scalar cells pass
# through unchanged.
df_discipline = df["discipline.fr"].explode().value_counts()
df_discipline = df_discipline.head(20).sort_values()
df_discipline.head()
Terre, océan, espace 3481
Sciences de l'éducation 3594
Géographie 4391
Philosophie 4616
Droit 4732
Name: discipline.fr, dtype: int64
# Horizontal bar chart of the per-discipline thesis counts: one bar per
# discipline label, bar length equal to its count.
plt.figure()
plt.barh(
    y=df_discipline.index,
    width=df_discipline.values,
    align="edge",
    color="C0",
    zorder=3,
)
plt.show()