Currently this re-reads the full set of files. Fine at this volume, but plan a simple incremental append eventually (a sketch follows the save cell below).
import pandas as pd
from os import listdir
from os.path import isfile, join
synthetique = "/content/drive/MyDrive/Centrale Supelec - mémoire/Files/Extractions"
detail = "/content/drive/MyDrive/Centrale Supelec - mémoire/Files/Extractions/Détail"
path = "/content/drive/MyDrive/Centrale Supelec - mémoire/Files"
# List all the files in the two directories
liste_synthetique = [f for f in listdir(synthetique) if isfile(join(synthetique, f))]
liste_detail = [f for f in listdir(detail) if isfile(join(detail, f))]
# Read each file in the directories
df_final = pd.DataFrame()
i = 0
while i < len(liste_synthetique):
    a = pd.read_excel(synthetique + "/" + liste_synthetique[i])
    a = a.drop(columns=["id_track"])
    b = pd.read_excel(detail + "/" + liste_detail[i])
    frames = [a, b]
    df_final0 = pd.concat(frames, axis=1, join="inner")
    # DataFrame.append is deprecated/removed in recent pandas; use pd.concat instead
    df_final = pd.concat([df_final, df_final0])
    i = i + 1
# Concatenate the files and save the result
df_final = df_final[['id_track', 'Intitulé', 'Entreprise', 'Localisation',
'Link', 'date de recherche', 'Description', 'Durée', 'Niveau', 'Type',
'Fonction', 'Secteur']]
df = df_final.reset_index()
df.to_excel(path+"/"+"LinkedIn - full concat.xlsx")
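A minimal sketch of the incremental append mentioned at the top: only extraction files not seen before would be read and appended to the consolidated workbook. The processed_files.txt ledger and the assumption that summary and detail files share the same file name are illustrative choices, not part of the current pipeline.
import os
import pandas as pd

ledger = path + "/processed_files.txt"  # hypothetical ledger of already-consolidated extractions
already_done = set()
if os.path.exists(ledger):
    with open(ledger) as f:
        already_done = set(f.read().splitlines())

new_files = [f for f in liste_synthetique if f not in already_done]
if new_files:
    existing = pd.read_excel(path + "/" + "LinkedIn - full concat.xlsx")
    frames = [existing]
    for name in new_files:
        a = pd.read_excel(synthetique + "/" + name).drop(columns=["id_track"])
        b = pd.read_excel(detail + "/" + name)  # assumes matching file names in both folders
        frames.append(pd.concat([a, b], axis=1, join="inner"))
    pd.concat(frames, ignore_index=True).to_excel(path + "/" + "LinkedIn - full concat.xlsx", index=False)
    with open(ledger, "a") as f:
        f.write("\n".join(new_files) + "\n")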
# If the file needs to be re-imported
try:
    df.head()
except NameError:
    import pandas as pd
    path = "/content/drive/MyDrive/Centrale Supelec - mémoire/Files"
    df = pd.read_excel(path + "/" + "LinkedIn - full concat.xlsx")
    df = df.drop(columns=["index", "Unnamed: 0"])
path = "/content/drive/MyDrive/Centrale Supelec - mémoire/Files"
df = pd.read_excel(path+"/"+"LinkedIn - full concat.xlsx")
df = df.drop(columns = ["index","Unnamed: 0"])
df.shape
(3973, 12)
import pandas as pd
import json
!pip install clean-text
from cleantext import clean
import string
import re
!pip install unidecode
import unidecode
from string import digits
!pip install langdetect
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
# Keep only the rows that have a description
nb_lines_ante = df.shape[0]
df = df[df['Description'].notna()]
nb_lines_poste = df.shape[0]
print(str(nb_lines_ante - nb_lines_poste) + " row(s) dropped for missing description")
# Drop any duplicates and check whether rows were removed
print("There are " + str(df.shape[0]) + " rows")
nb_lines_ante = df.shape[0]
df.drop_duplicates(keep = 'first', inplace=True,subset=['Description'])
nb_lines_poste = df.shape[0]
print(str(nb_lines_ante - nb_lines_poste) + " row(s) dropped as duplicates")
# Filter out rows written in a language other than French
# Open question: how should descriptions mixing several languages be handled? (a sketch follows this block)
# Benchmark of language-detection tools:
# https://towardsdatascience.com/benchmarking-language-detection-for-nlp-8250ea8b67c
def langage_detection(text):
    language_code = detect(text)
    return language_code

df["Language"] = df["Description"].apply(lambda x: langage_detection(x))
df1 = df[(df.Language != "fr")]
print("Dropping " + str(df1.shape[0]) + " rows that are not in French")
df = df[(df.Language == "fr")]
print("There are " + str(df.shape[0]) + " rows left to process")
df1.to_excel("english_job_offers.xlsx")
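One hedged way to handle descriptions that mix several languages, as raised above, would be to look at the probability distribution returned by langdetect rather than a single label; offers where French is present but not dominant could then be reviewed instead of being silently dropped. The 0.30 threshold, the "Mixed" label and the Language_profile column are illustrative assumptions.
from langdetect import detect_langs

def language_profile(text, fr_threshold=0.30):
    """Return 'fr', 'Mixed' or 'other' from langdetect's probability estimates."""
    probabilities = {item.lang: item.prob for item in detect_langs(text)}
    fr_prob = probabilities.get("fr", 0.0)
    if fr_prob >= max(probabilities.values()):
        return "fr"
    if fr_prob >= fr_threshold:
        return "Mixed"   # French present but not dominant: worth a manual check
    return "other"

# Example usage on the existing DataFrame (hypothetical extra column)
# df["Language_profile"] = df["Description"].apply(language_profile)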
# Create a new column for the cleaned text
df["clean_description"] = df["Description"]
# Remove URLs
def remove_URL(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    return text
df["clean_description"] = df["clean_description"].apply(lambda x : remove_URL(x))
# First pass: remove emojis with the clean-text package
def deEmojify(text):
    text = clean(text, no_emoji=True)
    return text
df["clean_description"]= df["clean_description"].apply(lambda x : deEmojify(x))
# Second pass: remove emoticon residue not caught by the clean-text package
def deEmojify2(text):
    regex_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return regex_pattern.sub(r'', text)
df["clean_description"]= df["clean_description"].apply(lambda x:deEmojify2(x))
# Remove digits
def digit_remove(text):
    result = "".join([i for i in text if not i.isdigit()])
    return result
df["clean_description"]= df["clean_description"].apply(lambda x: digit_remove(x))
# Lower-case the text
df["clean_description"] = df["clean_description"].apply(lambda x: x.lower())
# Remove currency symbols and e-mail addresses
def divers_clean(text):
    text = clean(text, no_currency_symbols=True, no_emails=True)
    return text
df["clean_description"]= df["clean_description"].apply(lambda x:divers_clean(x))
# Remove a few characters ad hoc by replacing them with a space.
# These characters are not handled by the cleaning library used later.
# Dictionary of characters to replace
ad_hoc_dic = {"'":" ",
"’":" ",
"«":" ",
"»":" ",
"–":" ",
"-": " ",
"-" :" ",
"•" :" ",
"‘":" ",
"…": " ",
"/": " ",
"▪": " ",
"·":" ",
".":" "
}
def replace_strings(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text
df["clean_description"] = df["clean_description"].apply(lambda x:replace_strings(x,ad_hoc_dic))
# Remove punctuation
# The punctuation characters handled by the standard library are listed in
# string.punctuation
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation])
    return punctuationfree
df["clean_description"]= df["clean_description"].apply(lambda x:remove_punctuation(x))
# Import the synonym dictionary
with open("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/synonymes.txt") as f:
    synonyme_dic = f.read()
synonyme_dic = json.loads(synonyme_dic)
# Define the synonym-replacement function
def replace_synonymes(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

df["clean_description"] = df["clean_description"].apply(lambda x: replace_synonymes(x, synonyme_dic))
# Tokenize the text. We tokenize on words since there is no need, a priori, to keep the sentence level.
# Define the tokenizer
def tokenization(text):
    tokens = text.split()
    return tokens

# Apply the function
df["tokenied_description"] = df["clean_description"].apply(lambda x: tokenization(x))
# Remove the stop words
# Load the stop-word list
with open("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/stop_words_cleaned.txt", encoding='utf-8') as f:
    stopwords = f.read().splitlines()

# Define the function
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

# Apply the function
df["noSw_description"] = df["tokenied_description"].apply(lambda x: remove_stopwords(x))
#df.to_excel("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/cleaned_linkedIn_jobs.xlsx")
!pip install spacy_lefff
import spacy
from spacy_lefff import LefffLemmatizer
from spacy.language import Language
# Instantiate the lemmatizer
@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    return LefffLemmatizer()

nlp = spacy.load('fr_dep_news_trf')
# Two models are available, with different goals:
#   accuracy  : fr_dep_news_trf
#   efficiency: fr_core_news_sm
#french_lemmatizer = LefffLemmatizer()
nlp.add_pipe("french_lemmatizer", name='lefff')
# Define the lemmatization function
def lemmatize(text):
    doc = nlp(text)
    for d in doc:
        return d.lemma_  # simple lemmatizer
        #return d._.lefff_lemma  # full Lefff lemmatizer
# It would be possible to use ._.lefff_lemma instead,
# but it is quite incomplete and returns None for English or unknown words.
# This would require revisiting how synonyms are handled,
# and the handling of accents would also need to be reworked afterwards.
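A hedged workaround for the None values mentioned above would be to prefer the Lefff lemma when it exists and fall back to spaCy's own lemma otherwise; lemmatize_with_fallback is an illustrative helper, not part of the original notebook.
def lemmatize_with_fallback(word):
    """Return the Lefff lemma when available, otherwise spaCy's lemma, otherwise the word itself."""
    for token in nlp(word):
        if token._.lefff_lemma is not None:
            return token._.lefff_lemma
        return token.lemma_ or word
    return word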
list1 = []
# Go through each cell, lemmatize each word, then regroup the words per offer
for cell in df["noSw_description"]:
    # one list per job offer
    list0 = []
    for word in cell:
        lemmatized = lemmatize(word)
        list0.append(lemmatized)
    list1.append(list0)
df["lemma"] = list1
# Import the NLTK French stemmer
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
# Define the stemming function
def stemming(listed):
    output = [stemmer.stem(i) for i in listed]
    return output
import ast
# Helper to turn a stringified list back into a real Python list
def lister1(text):
    result = ast.literal_eval(text)
    return result
df["Stemmed"] = df["noSw_description"].apply(lambda x: stemming(x))
To do: add a list of common terms.
import gensim
from gensim.models import Phrases
# Minimum number of occurrences of a term in the corpus
min_count = 2
# Acceptance threshold for bigrams
threshold = 10
# Build the bigram model
bigram_mod = gensim.models.Phrases(df["Stemmed"], min_count=min_count, threshold=threshold)
# Define the bigram function
def bigram(texts):
    return [bigram_mod[doc] for doc in texts]

df["Bigram"] = bigram(df["Stemmed"])
def token_concatenator(tokens):
    # avoid shadowing the built-ins `list` and `string`
    return ' '.join([str(token) for token in tokens])
df["Concatenated"] = df["Bigram"].apply(lambda x: token_concatenator(x))
# Save as JSON
df.to_json("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/clean_linkedIn_jobs")
# Re-import the file
import pandas as pd
df_clean = pd.read_json("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/clean_linkedIn_jobs")
df_clean.to_excel("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/clean_linkedIn_jobs.xlsx")
df_clean.head()
id_track | Intitulé | Entreprise | Localisation | Link | date de recherche | Description | Durée | Niveau | Type | Fonction | Secteur | Language | clean_description | tokenied_description | noSw_description | Stemmed | Bigram | Concatenated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 207dQJ9ajwGGHfH9amp54Q== | BUSINESS DEVELOPER EN PERFORMANCE DES RESSOURC... | Ayming | Levallois-Perret | https://fr.linkedin.com/jobs/view/business-dev... | 1658448000000 | Votre Job\n\n\n\n\nQuelles sont vos missions c... | il y a 3 jours | Premier emploi | Temps plein | Ventes et Développement commercial | Services et conseil aux entreprises | fr | votre job\nquelles sont vos missions chez aymi... | [votre, job, quelles, sont, vos, missions, che... | [job, missions, ayming, activite, conseil, rol... | [job, mission, ayming, activit, conseil, rol, ... | [job, mission, ayming, activit, conseil, rol, ... | job mission ayming activit conseil rol busines... |
1 | BlIS3I7uJX/cDxWCgmf6qA== | Directeur ressources humaines H/F | Hays | Montpellier | https://fr.linkedin.com/jobs/view/directeur-re... | 1658448000000 | Notre client, acteur incontournable du secteur... | il y a 3 semaines | Directeur | Temps plein | Ressources humaines | Fabrication de semi-conducteurs pour énergies ... | fr | notre client acteur incontournable du secteur ... | [notre, client, acteur, incontournable, du, se... | [client, acteur, incontournable, secteur, ener... | [client, acteur, incontourn, secteur, energ, r... | [client, acteur_incontourn, secteur, energ, re... | client acteur_incontourn secteur energ recherc... |
2 | niFoPTymYw468rAmpAvIbw== | Head of Sales- Directeur de la Relation Client... | Nextories | Paris et périphérie | https://fr.linkedin.com/jobs/view/head-of-sale... | 1658448000000 | Première plateforme de réservation de déménage... | il y a 1 semaine | Temps plein | None | None | None | fr | premiere plateforme de reservation de demenage... | [premiere, plateforme, de, reservation, de, de... | [premiere, plateforme, reservation, demenageur... | [premier, plateform, reserv, demenageur, nexto... | [premier, plateform, reserv, demenageur, nexto... | premier plateform reserv demenageur nextor voc... |
3 | 0Y5o2uHdhUyxpjfPZ4eryQ== | Chargé(e) de développement RH/Formation et dév... | CIMPA PLM Services | Toulouse | https://fr.linkedin.com/jobs/view/charg%C3%A9-... | 1658448000000 | CIMPA, filiale du groupe Sopra Steria, est un ... | il y a 1 semaine | Confirmé | Temps plein | Ressources humaines et Formation | Fabrication de composants pour l’industrie aér... | fr | cimpa filiale du groupe sopra steria est un de... | [cimpa, filiale, du, groupe, sopra, steria, es... | [cimpa, filiale, groupe, sopra, steria, leader... | [cimp, filial, group, sopr, steri, leader, eur... | [cimp, filial_group, sopr_steri, leader_europe... | cimp filial_group sopr_steri leader_europeen d... |
4 | r/NDDWVnsdzOmtnKrmipsg== | Chargé(e) de rémunération RH H/F | Stelliant | Colombes | https://fr.linkedin.com/jobs/view/charg%C3%A9-... | 1658448000000 | Le groupe Stelliant, avec un CA de 260 M€, lea... | il y a 3 jours | Confirmé | Temps plein | Ressources humaines et Gestion | Assurances | fr | le groupe stelliant avec un ca de m leader dan... | [le, groupe, stelliant, avec, un, ca, de, m, l... | [groupe, stelliant, leader, expertise, service... | [group, stelli, lead, expertis, servic, assur,... | [group_stelli, lead, expertis, servic, assur, ... | group_stelli lead expertis servic assur compt ... |
# Count the number of tokens after stemming and bigram merging
word_count = pd.Series(sum([item for item in df_clean.Bigram], [])).value_counts()
dict_word_count = {"Mot":word_count.index.values, "Nombre":word_count.values}
df_word_count = pd.DataFrame(dict_word_count)
#df_word_count_stemmed.to_csv("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/word_count_stemmed.csv")
df_word_count
Mot | Nombre | |
---|---|---|
0 | ressourc_humain | 15304 |
1 | gestion | 6605 |
2 | equip | 5304 |
3 | recrut | 5254 |
4 | format | 4546 |
... | ... | ... |
24359 | mutualisent | 1 |
24360 | interdisciplinair | 1 |
24361 | brejt | 1 |
24362 | beat | 1 |
24363 | montbonnot | 1 |
24364 rows × 2 columns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
from PIL import Image

d = {}
for a, x in df_word_count.values:
    d[a] = x

wc = WordCloud(background_color="white", max_words=20)
wc.generate_from_frequencies(d)
# Create the figure before drawing, otherwise savefig() saves an empty figure
plt.figure(figsize=(3, 3), dpi=100)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("Wordcloud1.png")
plt.show()
from gensim import corpora, models
from gensim.models import CoherenceModel
!pip install pyLDAvis
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import time
import matplotlib.pyplot as plt
# DataFrame used to collect the results
df_num_topics = pd.DataFrame(columns=["Number of topics", "Coherence Value", "Perplexity Value", "Duration"])
data_source = "Bigram"
# Iteration bounds
start = 1
end = 20
step = 1
# Run the iterations
for num_topics in range(start, end, step):
    # Limit value chosen empirically, to be tuned later
    below_limit = len(df_clean[data_source]) / (num_topics * 3)
    # Build the dictionary from the corpus
    dictionary_LDA = corpora.Dictionary(df_clean[data_source])
    # Filter the extreme frequencies
    dictionary_LDA.filter_extremes(no_below=below_limit)
    corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in df_clean[data_source]]
    t0 = time.time()  # separate name so the range bounds above are not overwritten
    %time lda_model = models.LdaModel(corpus, \
                                      random_state=0, \
                                      num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      chunksize=1000, \
                                      passes=10, \
                                      alpha="auto", \
                                      eta=0.01*len(dictionary_LDA.keys()))
    duration = time.time() - t0
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df_clean[data_source], dictionary=dictionary_LDA, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    perplexity = lda_model.log_perplexity(corpus)
    # Store the results in the DataFrame (DataFrame.append is deprecated; use pd.concat)
    new_row = {"Number of topics": num_topics, "Coherence Value": coherence_lda, "Perplexity Value": perplexity, "Duration": duration}
    df_num_topics = pd.concat([df_num_topics, pd.DataFrame([new_row])], ignore_index=True)
print(df_num_topics)
# Save the results
df_num_topics.to_csv("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/number_of_topics.csv")
print(df_num_topics)
# Plot the comparison charts
print("Coherence")
print("measures the degree of semantic similarity between high scoring words in the topic.")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Coherence Value"])
plt.show()
print("")
print("Perplexity")
print("captures how surprised a model is of new data it has not seen before, and is measured as the normalized log-likelihood of a held-out test set.")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Perplexity Value"])
plt.show()
print("")
print("Duration")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Duration"])
plt.show()
    Number of topics  Coherence Value  Perplexity Value   Duration
0                1.0         0.286835         -2.934444   2.032070
1                2.0         0.350899         -4.473031  15.422312
2                3.0         0.373375         -4.989392  17.480104
3                4.0         0.343543         -5.337832  19.439226
4                5.0         0.337770         -5.539370  26.644900
5                6.0         0.375206         -5.695915  22.970381
6                7.0         0.341574         -5.866452  24.399402
7                8.0         0.429432         -5.991241  23.162206
8                9.0         0.395694         -6.092819  23.086591
9               10.0         0.411773         -6.166275  23.616960
10              11.0         0.405491         -6.240823  24.516908
11              12.0         0.395302         -6.305894  24.833649
12              13.0         0.354007         -6.364524  25.459748
13              14.0         0.369627         -6.419422  27.618713
14              15.0         0.302849         -6.468026  30.072980
15              16.0         0.378977         -6.506609  27.700157
16              17.0         0.308133         -6.525176  27.646029
17              18.0         0.386590         -6.572245  30.635551
18              19.0         0.424391         -6.617898  33.676461
Coherence
measures the degree of semantic similarity between high scoring words in the topic.
Perplexity captures how surprised a model is of new data it has not seen before, and is measured as the normalized log-likelihood of a held-out test set.
Duration
topics = []
for idx, topic in lda_model.print_topics(-1):
    #print("Topic: {} -> Words: {}".format(idx, topic))
    topics.append(topic)
topics
# Export the share of each topic for each document
mixture = [dict(lda_model[x]) for x in corpus]
mixture
# Split the corpus in half to check the stability of the topics (test vs. counter-test)
df_test = df_clean.sample(frac=0.5)
df_contre_test = df_clean.drop(df_test.index)
# DataFrame used to collect the results
df_num_topics = pd.DataFrame(columns=["Number of topics", "Coherence Value", "Perplexity Value", "Duration"])
data_source = "Bigram"
# Iteration bounds
start = 1
end = 20
step = 1
# Run the iterations
for num_topics in range(start, end, step):
    # Limit value chosen empirically, to be tuned later
    below_limit = len(df_clean[data_source]) / (num_topics * 3)
    # Build the dictionary from the full corpus
    dictionary_LDA = corpora.Dictionary(df_clean[data_source])
    # Filter the extreme frequencies
    dictionary_LDA.filter_extremes(no_below=below_limit)
    corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in df_test[data_source]]
    t0 = time.time()  # separate name so the range bounds above are not overwritten
    %time lda_model = models.LdaModel(corpus, \
                                      random_state=0, \
                                      num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      chunksize=1000, \
                                      passes=10, \
                                      alpha="auto", \
                                      eta=0.01*len(dictionary_LDA.keys()))
    duration = time.time() - t0
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df_test[data_source], dictionary=dictionary_LDA, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    perplexity = lda_model.log_perplexity(corpus)
    # Store the results in the DataFrame (DataFrame.append is deprecated; use pd.concat)
    new_row = {"Number of topics": num_topics, "Coherence Value": coherence_lda, "Perplexity Value": perplexity, "Duration": duration}
    df_num_topics = pd.concat([df_num_topics, pd.DataFrame([new_row])], ignore_index=True)
print(df_num_topics)
# Save the results
df_num_topics.to_csv("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/number_of_topics_test.csv")
print(df_num_topics)
# Plot the comparison charts
print("Coherence")
print("measures the degree of semantic similarity between high scoring words in the topic.")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Coherence Value"])
plt.show()
print("")
print("Perplexity")
print("captures how surprised a model is of new data it has not seen before, and is measured as the normalized log-likelihood of a held-out test set.")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Perplexity Value"])
plt.show()
print("")
print("Duration")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Duration"])
plt.show()
# DataFrame used to collect the results
df_num_topics = pd.DataFrame(columns=["Number of topics", "Coherence Value", "Perplexity Value", "Duration"])
data_source = "Bigram"
# Iteration bounds
start = 1
end = 20
step = 1
# Run the iterations
for num_topics in range(start, end, step):
    # Limit value chosen empirically, to be tuned later
    below_limit = len(df_clean[data_source]) / (num_topics * 3)
    # Build the dictionary from the full corpus
    dictionary_LDA = corpora.Dictionary(df_clean[data_source])
    # Filter the extreme frequencies
    dictionary_LDA.filter_extremes(no_below=below_limit)
    corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in df_contre_test[data_source]]
    t0 = time.time()  # separate name so the range bounds above are not overwritten
    %time lda_model = models.LdaModel(corpus, \
                                      random_state=0, \
                                      num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      chunksize=1000, \
                                      passes=10, \
                                      alpha="auto", \
                                      eta=0.01*len(dictionary_LDA.keys()))
    duration = time.time() - t0
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df_contre_test[data_source], dictionary=dictionary_LDA, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    perplexity = lda_model.log_perplexity(corpus)
    # Store the results in the DataFrame (DataFrame.append is deprecated; use pd.concat)
    new_row = {"Number of topics": num_topics, "Coherence Value": coherence_lda, "Perplexity Value": perplexity, "Duration": duration}
    df_num_topics = pd.concat([df_num_topics, pd.DataFrame([new_row])], ignore_index=True)
print(df_num_topics)
# Save the results
df_num_topics.to_csv("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/number_of_topics_contre_test.csv")
print(df_num_topics)
# Plot the comparison charts
print("Coherence")
print("measures the degree of semantic similarity between high scoring words in the topic.")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Coherence Value"])
plt.show()
print("")
print("Perplexity")
print("captures how surprised a model is of new data it has not seen before, and is measured as the normalized log-likelihood of a held-out test set.")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Perplexity Value"])
plt.show()
print("")
print("Duration")
plt.plot(df_num_topics["Number of topics"], df_num_topics["Duration"])
plt.show()
from gensim import corpora, models
!pip install pyLDAvis
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
data_source ="Bigram"
dictionary_LDA = corpora.Dictionary(df_clean[data_source])
len(dictionary_LDA.keys())
24364
data_source = "Bigram"
num_topics = 8
# Limit value chosen empirically
below_limit = len(df_clean[data_source]) / (num_topics * 3)
#below_limit = 300
# Build the dictionary from the corpus
dictionary_LDA = corpora.Dictionary(df_clean[data_source])
# Filter the extreme frequencies
dictionary_LDA.filter_extremes(no_below=below_limit)
corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in df_clean[data_source]]
# Configure the LDA model
lda_model = models.LdaModel(corpus,
                            random_state=0,
                            num_topics=num_topics,
                            id2word=dictionary_LDA,
                            chunksize=1000,
                            passes=10,
                            alpha="auto",
                            eta=0.01*len(dictionary_LDA.keys()))
"""
chunksize : correspond au nombre de documents à charger en même temps.
Par défaut 2000. Une valeur plus grande induit une meilleure rapidité d'exécution tant que la mémoire le permet
passes : correspond au nombre d'itérations à réaliser
Par défaut 1.
Alpha : représente la distribution a priori des topics dans chaque document.
Par défaut 0,1 Un alpha plus important induit une plus grande distribution des topics dans chaque document
Eta : représente la distribution à priori des mots pour chaque topic
Par défaut 0,1, un Eta plus important induit que les topics sont constitués de moins de mots
"""
"\nchunksize : correspond au nombre de documents à charger en même temps. \nPar défaut 2000. Une valeur plus grande induit une meilleure rapidité d'exécution tant que la mémoire le permet\n\npasses : correspond au nombre d'itérations à réaliser\nPar défaut 1.\n\nAlpha : représente la distribution a priori des topics dans chaque document.\nPar défaut 0,1 Un alpha plus important induit une plus grande distribution des topics dans chaque document\n\nEta : représente la distribution à priori des mots pour chaque topic\nPar défaut 0,1, un Eta plus important induit que les topics sont constitués de moins de mots\n\n"
# Display the results
lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_viz)
pyLDAvis.save_html(lda_viz, 'lda.html')
# Display the topics found by the algorithm
for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(str(i) + ": " + topic)
    print()
0: 0.024*"agenc" + 0.017*"agenc_emploi" + 0.017*"cdi_cdd" + 0.017*"competent" + 0.015*"interimair" + 0.014*"savoir_fair" + 0.014*"proximit_met" + 0.014*"present_territoir" + 0.014*"franc_adecco" + 0.014*"contrat_cherch"
1: 0.019*"candidat" + 0.016*"altern" + 0.015*"commun" + 0.014*"recherch" + 0.013*"integr" + 0.011*"entretien" + 0.011*"reseau_social" + 0.010*"intern" + 0.009*"marqu_employeur" + 0.009*"processus_recrut"
2: 0.019*"group" + 0.013*"innov" + 0.013*"franc" + 0.012*"manag" + 0.011*"commun" + 0.010*"solut" + 0.010*"competent" + 0.009*"evolu" + 0.009*"talent" + 0.009*"ete"
3: 0.020*"operationnel" + 0.018*"direct" + 0.017*"group" + 0.017*"social" + 0.016*"polit" + 0.015*"respons" + 0.015*"conseil" + 0.012*"sit" + 0.012*"organis" + 0.011*"anim"
4: 0.038*"pai" + 0.027*"administr" + 0.021*"salar" + 0.018*"personnel" + 0.015*"administr_personnel" + 0.015*"assist" + 0.013*"social" + 0.013*"travail" + 0.013*"etabl" + 0.012*"ger"
5: 0.022*"agenc" + 0.019*"reseau" + 0.018*"franc" + 0.017*"commercial" + 0.013*"direct" + 0.013*"humain" + 0.012*"group" + 0.011*"cadr" + 0.010*"meti" + 0.009*"techniqu"
6: 0.019*"integr" + 0.019*"altern" + 0.017*"charg" + 0.014*"ete" + 0.012*"organis" + 0.012*"administr" + 0.011*"dynam" + 0.010*"domain" + 0.010*"meti" + 0.009*"fonction"
7: 0.019*"outil" + 0.017*"activit" + 0.014*"direct" + 0.014*"organis" + 0.012*"techniqu" + 0.012*"social" + 0.011*"qualit" + 0.011*"securit" + 0.010*"connaiss" + 0.009*"realis"
# Export the share of each topic for each document
mixture = [dict(lda_model[x]) for x in corpus]
mixture_df = pd.DataFrame(mixture)
index = pd.DataFrame(df_clean.index)
text = df_clean["Bigram"]
mixture_df1 = pd.concat([index, mixture_df, text], axis=1, ignore_index=True)
mixture_df1.columns = ["Document_No", "Topic 3", "Topic 4", "Topic 6", "Topic 7", "Topic 8", "Topic 5", "Topic 2", "Topic 1", "Text"]
mixture_df1.to_excel("mixture_results_vf.xlsx")
mixture_df1
Document_No | Topic 3 | Topic 4 | Topic 6 | Topic 7 | Topic 8 | Topic 5 | Topic 2 | Topic 1 | Text | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.381200 | 0.105543 | 0.508441 | NaN | NaN | NaN | NaN | NaN | [job, mission, ayming, activit, conseil, rol, ... |
1 | 1.0 | 0.550027 | 0.444944 | NaN | NaN | NaN | NaN | NaN | NaN | [client, acteur_incontourn, secteur, energ, re... |
2 | 2.0 | 0.396284 | 0.176494 | 0.424460 | NaN | NaN | NaN | NaN | NaN | [premier, plateform, reserv, demenageur, nexto... |
3 | 3.0 | 0.505978 | NaN | NaN | 0.269114 | 0.220479 | NaN | NaN | NaN | [cimp, filial_group, sopr_steri, leader_europe... |
4 | 4.0 | NaN | 0.414394 | NaN | 0.141011 | 0.441615 | NaN | NaN | NaN | [group_stelli, lead, expertis, servic, assur, ... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3968 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [mission, adecco, tertiair, toulous, recherch,... |
3969 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [tanner, vivoin, tanner, cuir, precieux, maiso... |
3970 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [entrepris, compos, collabor, traver_mond, soc... |
3971 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [entrepris, cabinet_manpow, conseil, recrut, r... |
3972 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [entrepris_manpow, grenobl, tertiair, recherch... |
3702 rows × 10 columns
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df_clean[data_source]):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the dominant topic, its contribution and its keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                # DataFrame.append is deprecated; build each row as a one-line DataFrame and concat
                new_row = pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]])
                sent_topics_df = pd.concat([sent_topics_df, new_row], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # Add original text to the end of the output
    #contents = pd.Series(texts)
    #sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    index = pd.DataFrame(df_clean.index)
    # `text` is the global defined above (df_clean["Bigram"])
    sent_topics_df = pd.concat([index, sent_topics_df, text], axis=1)
    return sent_topics_df
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df_clean[data_source])
df_topic_sents_keywords.dropna(inplace=True)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ["id_df",'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',"Text"]
df_dominant_topic.dropna(inplace=True)
# Show
df_dominant_topic.head(10)
id_df | Document_No | Dominant_Topic | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|---|---|
0 | 0 | 0.0 | 5.0 | 0.5082 | agenc, reseau, franc, commercial, direct, huma... | [job, mission, ayming, activit, conseil, rol, ... |
1 | 1 | 1.0 | 2.0 | 0.5500 | group, innov, franc, manag, commun, solut, com... | [client, acteur_incontourn, secteur, energ, re... |
2 | 2 | 2.0 | 5.0 | 0.4244 | agenc, reseau, franc, commercial, direct, huma... | [premier, plateform, reserv, demenageur, nexto... |
3 | 3 | 3.0 | 2.0 | 0.5060 | group, innov, franc, manag, commun, solut, com... | [cimp, filial_group, sopr_steri, leader_europe... |
4 | 4 | 4.0 | 7.0 | 0.4416 | outil, activit, direct, organis, techniqu, soc... | [group_stelli, lead, expertis, servic, assur, ... |
5 | 5 | 5.0 | 2.0 | 0.6129 | group, innov, franc, manag, commun, solut, com... | [ayming, group_international, conseil, present... |
6 | 6 | 6.0 | 7.0 | 0.4529 | outil, activit, direct, organis, techniqu, soc... | [mission, attendent, candidat, mot, recherchon... |
7 | 7 | 7.0 | 7.0 | 0.4831 | outil, activit, direct, organis, techniqu, soc... | [cdi_temp, etabl, resident, medicalise, leis, ... |
8 | 8 | 8.0 | 3.0 | 0.3327 | operationnel, direct, group, social, polit, re... | [perfhomm, cabinet_recrut, approch, recrut, di... |
9 | 9 | 9.0 | 3.0 | 0.5825 | operationnel, direct, group, social, polit, re... | [recherchon, pol, atalian, propret, respons, r... |
df_dominant_topic.to_excel("Dominant topic.xlsx")
# Keep the most representative document for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                                            axis=0)
# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
# Show
sent_topics_sorteddf_mallet
Topic_Num | Topic_Perc_Contrib | Keywords | Text | |
---|---|---|---|---|
0 | 3920 | 0.0 | 0.9924 | agenc, agenc_emploi, cdi_cdd, competent, inter... |
1 | 263 | 1.0 | 0.9947 | candidat, altern, commun, recherch, integr, en... |
2 | 3953 | 2.0 | 0.9979 | group, innov, franc, manag, commun, solut, com... |
3 | 3969 | 3.0 | 0.9977 | operationnel, direct, group, social, polit, re... |
4 | 3757 | 4.0 | 0.9965 | pai, administr, salar, personnel, administr_pe... |
5 | 643 | 5.0 | 0.9959 | agenc, reseau, franc, commercial, direct, huma... |
6 | 3866 | 6.0 | 0.9941 | integr, altern, charg, ete, organis, administr... |
7 | 3890 | 7.0 | 0.9983 | outil, activit, direct, organis, techniqu, soc... |
https://camembert-model.fr/#download
https://huggingface.co/docs/transformers/model_doc/camembert
It would be good to have a performance score for the summaries (a hedged scoring sketch follows the example summary below).
!pip install hydra-core
!pip install omegaconf
!pip install bitarray
!pip install sacrebleu
!pip install transformers
!pip install sentencepiece
#!pip install regex
import torch
from transformers import RobertaTokenizerFast, EncoderDecoderModel, AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
ckpt = 'mrm8488/camembert2camembert_shared-finetuned-french-summarization'
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = EncoderDecoderModel.from_pretrained(ckpt).to(device)
def generate_summary(text):
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=514, return_tensors="pt")
    # default max_length is 512
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = model.generate(input_ids, attention_mask=attention_mask)
    return tokenizer.decode(output[0], skip_special_tokens=True)
The following encoder weights were not tied to the decoder ['roberta/pooler'] The following encoder weights were not tied to the decoder ['roberta/pooler']
summarized = generate_summary(df_clean["Description"][0])
Setting `pad_token_id` to `eos_token_id`:6 for open-end generation.
summarized
'« Le Monde » a invité des entrepreneurs et des dirigeants d’entreprise à se confier sur leur parcours et leur vision de l’avenir du travail. Cette semaine, Ayming Digital Academy et son cabinet de conseil Aymer Digital Academy ont fait leur entrée au capital d’une entreprise spécialisée dans la gestion de carrière'
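As noted earlier, the summaries are not scored anywhere in the notebook. If reference summaries were available, one hedged option would be ROUGE via the rouge-score package; the snippet below only uses the original description as a stand-in reference, so the numbers are illustrative rather than a real evaluation.
!pip install rouge-score
from rouge_score import rouge_scorer

# use_stemmer is left off because the built-in stemmer targets English, not French
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=False)
scores = scorer.score(df_clean["Description"][0], summarized)  # (reference, candidate)
print(scores["rouge1"].fmeasure, scores["rougeL"].fmeasure)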
df_clean["Description"][1]
"Notre client, acteur incontournable du secteur de l'énergie, recherche son Directeur Ressources Humaines Groupe. Poste basé dans le Languedoc Roussillon.\n\nRattaché directement au Secrétaire Général du groupe, vous avez la responsabilité du département Ressources Humaines et vous définissez l'ensemble de la politique RH du groupe. Vous pilotez les équipes en charge du recrutement, de la formation et de la gestion administrative. Vous incarnez l'essence même et l'ADN du groupe sur ses valeurs humaines. Vous intervenez sur un périmètre international (France, Europe, Afrique...) sur une population de 400 collaborateurs. Vous assurez le déploiement et la digitalisation de la stratégie RH France, vous accompagnez la croissance externe, ainsi que les évolutions organisationnelles et managériales. vous proposez des axes d'amélioration en termes d'organisation fonctionnelle (amélioration continue des processus) et vous harmonisez la politique RH en place. Enfin, vous veillez au renforcement de l'adhésion et à l'engagement des collaborateurs et vous assumez une part de représentation externe lors d'événements liés à la vie de l'entreprise, à la stratégie d'entreprise et de fonction RH.\n\nIssu d'une formation de type Bac +5 en Management des Ressources Humaines, vous bénéficiez d'une expérience minimum de 5 ans sur le même type de poste. Vous êtes doté de grandes qualités de communication pour traiter avec des interlocuteurs de différents départements. Vous êtes dynamique, avec l'esprit d'entrepreneur et acteur de la conduite du changement. Vous justifiez d'une expérience dans l'internationalisation d'une entreprise en forte croissance, avec une maîtrise de l'anglais. Vous êtes reconnu pour votre leadership, votre écoute et votre capacité à manager des équipes. Vous avez un rôle de Team Player et de fortes capacités relationnelles."
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("mrm8488/camembert2camembert_shared-finetuned-french-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/camembert2camembert_shared-finetuned-french-summarization")
from transformers import CamembertConfig, CamembertModel
configuration = CamembertConfig()
# NB: this builds a randomly initialised model from the default config and shadows the class name
CamembertModel = CamembertModel(configuration)
from transformers import RobertaConfig, RobertaModel
# RoBERTa model built from the default configuration (random weights, not pretrained)
configuration = RobertaConfig()
roberta_model = RobertaModel(configuration)
def depth_score(timeseries):
    """
    The depth score corresponds to how strongly the cues for a subtopic changed on both sides of a
    given token-sequence gap and is based on the distance from the peaks on both sides of the valley to that valley.
    Returns depth_scores.
    """
    depth_scores = []
    for i in range(1, len(timeseries) - 1):
        left, right = i - 1, i + 1
        while left > 0 and timeseries[left - 1] > timeseries[left]:
            left -= 1
        while (
            right < (len(timeseries) - 1) and timeseries[right + 1] > timeseries[right]
        ):
            right += 1
        depth_scores.append(
            (timeseries[right] - timeseries[i]) + (timeseries[left] - timeseries[i])
        )
    return depth_scores
def smooth(timeseries, n, s):
    smoothed_timeseries = timeseries[:]
    for _ in range(n):
        for index in range(len(smoothed_timeseries)):
            neighbours = smoothed_timeseries[
                max(0, index - s) : min(len(timeseries) - 1, index + s)
            ]
            smoothed_timeseries[index] = sum(neighbours) / len(neighbours)
    return smoothed_timeseries
def sentences_similarity(first_sentence_features, second_sentence_features) -> float:
    """
    Given two sentence embedding features, compute their cosine similarity.
    """
    similarity_metric = torch.nn.CosineSimilarity()
    return float(similarity_metric(first_sentence_features, second_sentence_features))
def compute_window(timeseries, start_index, end_index):
    """Given start and end indices of embeddings, compute the pooled window value:
    [window_size, 768] -> [1, 768]
    """
    stack = torch.stack([features[0] for features in timeseries[start_index:end_index]])
    stack = stack.unsqueeze(0)  # https://jbencook.com/adding-a-dimension-to-a-tensor-in-pytorch/
    stack_size = end_index - start_index
    pooling = torch.nn.MaxPool2d((stack_size - 1, 1))
    return pooling(stack)
def block_comparison_score(timeseries, k):
    """
    For each gap i, compare the pooled window of k sentences before the gap with
    the pooled window of k sentences after it (cosine similarity).
    """
    res = []
    for i in range(k, len(timeseries) - k):
        first_window_features = compute_window(timeseries, i - k, i + 1)
        second_window_features = compute_window(timeseries, i + 1, i + k + 2)
        res.append(
            sentences_similarity(first_window_features[0], second_window_features[0])
        )
    return res
def get_features_from_sentence(batch_sentences, layer=-2):
    """
    Extracts the BERT semantic representation of a sentence,
    using an averaged value of the `layer`-th layer.
    Returns a 1-dimensional tensor of size 768.
    """
    batch_features = []
    for sentence in batch_sentences:
        # encode/extract_features are the fairseq-style CamemBERT API: this relies on the
        # torch.hub model `camembert` loaded further down, not the randomly initialised
        # transformers CamembertModel above.
        tokens = camembert.encode(sentence)
        all_layers = camembert.extract_features(tokens, return_all_hiddens=True)
        pooling = torch.nn.AvgPool2d((len(tokens), 1))
        sentence_features = pooling(all_layers[layer])
        batch_features.append(sentence_features[0])
    return batch_features
get_features_from_sentence(df_clean["Description"])
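The segmentation helpers above are never chained together in the notebook. A minimal sketch of how they could be combined to look for sub-topic boundaries in a single job description, assuming the fairseq CamemBERT hub model loaded below, a naive newline-based sentence split, and illustrative values for k, n and s:
import numpy as np

# Naive sentence split of one offer (illustrative; assumes the offer has enough sentences)
sentences = [s for s in df_clean["Description"].iloc[0].split("\n") if s.strip()]

features = get_features_from_sentence(sentences)        # one pooled CamemBERT vector per sentence
gap_similarities = block_comparison_score(features, 2)  # cosine similarity across each sentence gap (k=2)
smoothed = smooth(gap_similarities, 2, 2)                # light smoothing (n=2 passes, window s=2)
depths = depth_score(smoothed)                           # how deep each similarity valley is

# Gaps whose depth exceeds mean + one standard deviation are candidate topic boundaries
cutoff = np.mean(depths) + np.std(depths)
boundaries = [i for i, d in enumerate(depths) if d > cutoff]
print(boundaries)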
# NB: camembert_embed is defined in a later cell and expects a string;
# the 'Concatenated' column is used with it further down.
df_clean['Bert_tokens'] = df_clean['Stemmed'].apply(lambda x: camembert_embed(x))
df_clean.head()
!pip install transformers
!pip install hydra-core
!pip install omegaconf
!pip install bitarray
!pip install sacrebleu
!pip install sentencepiece
from transformers import CamembertConfig, CamembertModel
configuration = CamembertConfig()
CamembertModel = CamembertModel(configuration)
import matplotlib.pyplot as plt
import torch
import numpy as np
np.random.seed(0)
# Loading CamemBERT model
camembert = torch.hub.load('pytorch/fairseq', 'camembert-base')
def camembert_embed(text):
    tokens = camembert.encode(text)
    with torch.no_grad():
        encoded_layers = camembert.extract_features(tokens, return_all_hiddens=True)
    token_embeddings = torch.stack(encoded_layers, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # Sum the last four hidden layers, then average over tokens to get one sentence vector
    sum_vec = torch.sum(token_embeddings[-4:], dim=0)
    sentence_embedding = torch.mean(sum_vec, dim=0)
    array = sentence_embedding.detach().numpy()
    return array
df_clean['CamemBert_tokens'] = df_clean['Concatenated'].apply(lambda x: camembert_embed(x))
df_clean.head()
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_clean["Concatenated"])
from sklearn.cluster import KMeans
np.random.seed(10)
Sum_of_squared_distances = pd.DataFrame(columns=["Number of clusters", "Sum of Squared Distances"])
start = 1
end = 20
step = 1
for num_clusters_km in range(start, end, step):
    km = KMeans(n_clusters=num_clusters_km, random_state=0)
    km = km.fit(X)
    new_row = {"Number of clusters": num_clusters_km, "Sum of Squared Distances": km.inertia_}
    Sum_of_squared_distances = pd.concat([Sum_of_squared_distances, pd.DataFrame([new_row])], ignore_index=True)

plt.plot(Sum_of_squared_distances["Number of clusters"], Sum_of_squared_distances["Sum of Squared Distances"])
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# We keep 3 clusters
km = KMeans(n_clusters=3, random_state = 0)
km = km.fit(X)
df_clean["Clusters KM"] = km.labels_
df_clean.to_excel(("/content/drive/MyDrive/Centrale Supelec - mémoire/Files/clustered_linkedIn_jobs.xlsx"))
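A quick, hedged way to characterise the three K-Means clusters is to look at the highest-weighted TF-IDF terms of each centroid; get_feature_names_out assumes a recent scikit-learn (older versions expose get_feature_names instead).
import numpy as np

terms = vectorizer.get_feature_names_out()
# Indices of the TF-IDF dimensions sorted by decreasing centroid weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_id in range(km.n_clusters):
    top_terms = [terms[i] for i in order_centroids[cluster_id, :10]]
    size = int(np.sum(km.labels_ == cluster_id))
    print("Cluster " + str(cluster_id) + " (" + str(size) + " offers): " + ", ".join(top_terms))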
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt
from numpy import random
sc = SpectralClustering(n_clusters=3, random_state = 0).fit(X)
print(sc.affinity_matrix_)
[[1.         0.15091984 0.16372013 ... 0.16001521 0.15890312 0.14124878]
 [0.15091984 1.         0.17294612 ... 0.16998603 0.17078007 0.14342057]
 [0.16372013 0.17294612 1.         ... 0.15306546 0.15537233 0.14123704]
 ...
 [0.16001521 0.16998603 0.15306546 ... 1.         0.18641539 0.15567204]
 [0.15890312 0.17078007 0.15537233 ... 0.18641539 1.         0.14427919]
 [0.14124878 0.14342057 0.14123704 ... 0.15567204 0.14427919 1.        ]]
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt
from numpy import random
random.seed(1)
start = 1
end = 20
step = 1
for num_clusters_SC in range(start, end, step):
    sc = SpectralClustering(n_clusters=num_clusters_SC, random_state=0).fit(X)