# Analysis of motivation letters in ParcoursSup applications

Extracting the motivation letters

We assume that the textual content of the applications PDF (copy-paste of the whole document) has been dumped into dossiers.txt

Adapted almost directly from http://brandonrose.org/clustering

In [126]:
f = 'dossiers.txt'
lines = open(f, encoding='utf-8').read().split('\n')

lettres = []
eleves = []
lycees = []
for i, l in enumerate(lines):
    l = l.strip()
    # the student's name is on the line after the "Projet de formation motivé (Sais..." header
    if l.startswith('Projet de formation motivé (Sais'):
        eleves.append(lines[i+1].strip())
    # the high school appears four lines after the "Scolarité antérieure" header
    if l == 'Scolarité antérieure':
        l = lines[i+4].strip()
        try:
            k = l.index(' / ')
            lycees.append(l[:k])
        except ValueError:
            lycees.append(l)
    # the letter itself runs from the "Objet" line down to the next "Dossier n" marker
    if l == 'Objet : Projet de formation motivé':
        j = i + 1
        lettre = []
        while j < len(lines) and not lines[j].startswith('Dossier n'):
            lettre.append(lines[j].strip())
            j += 1
        lettres.append(' '.join(lettre))
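
A quick sanity check, assuming each application contributes exactly one student name, one lycée and one letter; if the counts differ, the text markers used above need adjusting.

In [ ]:
print(len(eleves), len(lycees), len(lettres))
assert len(eleves) == len(lycees) == len(lettres)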
In [ ]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
In [ ]:
nltk.download('stopwords')
nltk.download('punkt')
In [ ]:
stopwords = nltk.corpus.stopwords.words('french')
In [ ]:
stopwords[:10]
In [ ]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('french')
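
A quick look at what the French Snowball stemmer does to a few sample words:

In [ ]:
[stemmer.stem(w) for w in ['motivation', 'motivé', 'études', 'étudier']]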
In [ ]:
# here I define a tokenizer+stemmer, which returns the list of stems in the text it is passed, and a plain tokenizer

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
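
To check the two helpers, here is what they return on a short sample sentence (tokenize_and_stem keeps the stems, tokenize_only the lowercased word forms):

In [ ]:
sample = "Je suis très motivé par cette formation."
print(tokenize_and_stem(sample))
print(tokenize_only(sample))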
In [ ]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in lettres:
    allwords_stemmed = tokenize_and_stem(i) #for each letter, tokenize and stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
In [ ]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
In [ ]:
vocab_frame.head()
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
#note: scikit-learn may warn that the (unstemmed) stop words are inconsistent with the
#stemming tokenizer; a stemmed token is only filtered when it still matches an entry of the list
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words=stopwords,
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))

tfidf_matrix = tfidf_vectorizer.fit_transform(lettres)
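
The resulting matrix has one row per letter and one column per retained term; with min_df=0.2 and max_df=0.8 the vocabulary can end up fairly small, so the shape is worth checking:

In [ ]:
print(tfidf_matrix.shape)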
In [ ]:
terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() in older scikit-learn versions
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
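
A minimal check, since both MDS with dissimilarity="precomputed" and the ward dendrogram further below expect a square distance matrix with a (near-)zero diagonal:

In [ ]:
print(dist.shape)
print(np.allclose(np.diag(dist), 0, atol=1e-6))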
In [ ]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

clusters = km.labels_.tolist()
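
The choice of 5 clusters is somewhat arbitrary; a possible sketch for comparing the KMeans inertia over a few values of k before settling on num_clusters (the range 2..8 is an arbitrary choice):

In [ ]:
for k in range(2, 9):
    km_k = KMeans(n_clusters=k, random_state=1).fit(tfidf_matrix)
    print(k, km_k.inertia_)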
In [ ]:
import joblib  # sklearn.externals.joblib was removed from recent scikit-learn versions

#save the fitted model so the clustering can be reloaded later without re-running KMeans
joblib.dump(km, 'doc_cluster.pkl')

#uncomment the line below to reload a previously saved model instead of the one fitted above
#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

dossiers = { 'lycee' : lycees, 'eleve' : eleves, 'lettre' : lettres, 'cluster' : clusters }
frame = pd.DataFrame(dossiers, index=clusters, columns=['lycee', 'eleve', 'lettre', 'cluster'])
In [ ]:
frame['cluster'].value_counts()
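
It can also be interesting to see how the clusters split across high schools, assuming the lycee names were parsed cleanly above:

In [ ]:
pd.crosstab(frame['lycee'], frame['cluster'])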
In [ ]:
print("Top terms per cluster:")
print()
#for each cluster, sort the term indices by their weight in the centroid (descending)
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :100]:
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace

    # the first ten students of the cluster and their letters
    print("Cluster %d eleves:" % i, end='')
    l = []
    for eleve in frame.loc[i]['eleve'].values.tolist()[:10]:
        l.append(eleve)
        print(' %s,' % eleve, end='')

    print()
    print('sample letters:')
    for e in l:
        print(lettres[eleves.index(e)])
        print()
    print() #add whitespace
    print() #add whitespace

print()
print()
In [ ]:
# using the study above, we pick descriptive names for the clusters

cluster_names = { 0 : 'Generic standard letter', 1 : 'Targeted at our lycée', 2 : 'CPGE letter',
                  3 : 'MPSI letter', 4 : 'Overwrought style' }
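
A convenient follow-up, so that later listings show the label instead of the raw number, is to map these names back onto the frame as a new cluster_name column:

In [ ]:
frame['cluster_name'] = frame['cluster'].map(cluster_names)
frame['cluster_name'].value_counts()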
In [ ]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# two components because we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we also specify `random_state` so the plot is reproducible
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]
In [ ]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) 

#group by cluster
groups = df.groupby('label')

# set up plot
fig, ax = plt.subplots(figsize=(25, 12)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that the cluster_names dict is used with the 'name' lookup to return the appropriate label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=30,
            label=cluster_names[name],
            mec='none')

ax.set_aspect('auto')
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
ax.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    right=False,       # ticks along the right edge are off
    labelleft=False)

ax.legend(numpoints=1)  #show legend with only 1 point
In [ ]:
from scipy.cluster.hierarchy import ward, dendrogram

linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

fig, ax = plt.subplots(figsize=(15, 50)) # set size
dendrogram(linkage_matrix, orientation="right", labels=eleves)

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout() #show plot with tight layout

plt.savefig('ward_clusters.png', dpi=800) #save figure as ward_clusters.png