Commit eb66c02a authored by Zied SELLAMI
Browse files

Delete unused files

parent 46972ed5
...@@ -26,7 +26,7 @@ docker run -it -p 5551:9991 demo:priorityInbox ...@@ -26,7 +26,7 @@ docker run -it -p 5551:9991 demo:priorityInbox
### Keywords extractor service ### Keywords extractor service
``` ```
curl -X POST 'http://0.0.0.0:5551/rest/priorityinbox/parsejson -F "file=@/path/to/email.json" curl -X POST 'http://0.0.0.0:5551/rest/priorityinbox/parsejson' -F "file=@/path/to/email.json"
``` ```
Sample of json file : Sample of json file :
``` ```
......
{
"messageId" : "<9d0ed80f9e38222618c97793080275f7@linagora.com>",
"from" : {
"address" : "szribi@linagora.com",
"personal" : "Sarah ZRIBI"
},
"to" : [ {
"address" : "openpaas-ng@linagora.com",
"personal" : "OpenPaas NG"
} ],
"cc" : [ {
"address" : "azapolsky@linagora.com",
"personal" : null
}, {
"address" : "rbourgueil@linagora.com",
"personal" : "Regis BOURGUEIL"
}, {
"address" : "odaniel@linagora.com",
"personal" : "Olivier DANIEL"
} ],
"bcc" : [ ],
"Received" : "Mon, 28 Jan 2019 07:56:43 +0000",
"Date" : "Mon, 28 Jan 2019 07:56:43 +0000",
"in-Reply-To" : "<7f122e70-ed8f-b98b-4a14-fc37f0c79f26@linagora.com>",
"subject" : "OPNG - IMPORTANT - Changement de la date de revue - Nv Doodle",
"body" : "\nChers partenaires, \r\n\r\nJe viens tout juste de recevoir un mail de Mme. Géraldine EYANG\r\nNDONG,(ci-après) me demandant de reporter la date de la revue. \r\n\r\nVoici le nouveau Doodle : https://doodle.com/poll/kh8ywd7hb4c83esx . Je\r\nvous prie de le remplir as soon as possible. \r\n\r\nCeci ne reporte pas la date de livraison initiale (aujourd\"hui) afin de\r\npermettre aux personnes du CGI et de la DGE de prendre connaissance des\r\ntravaux du projet. \r\n\r\nÀ ce jour voici la liste des reviewers que j'ai : \r\n\r\n- Mme. Geraldine EYANG NDONG \r\n\r\n- Le nouveau directeur adjoint du CGI \r\n\r\n- Une personne de la DGE (je n'ai pas encore son nom) \r\n\r\n- M. Jean Christophe GOUGEON \r\n\r\n- M. Philippe ROY de Cap Digital \r\n\r\n\"Je suis contrainte de modifier la date de l'Etape clé 3 pour permettre\r\nau CGI et à la DGE de se joindre à nous. Aussi je vous propose nos\r\nnouvelles disponibilités : le 15 mai (toute la journée), les 16 et 17\r\nmai (matin uniquement). J'insiste encore sur la nécessité de nous\r\ntransmettre les documents le plus tôt possible afin de nous permettre de\r\nles analyser en amont de la réunion. \"\r\n\r\nJe vous remercie par avance de votre retour rapide. \r\n\r\nCordialement, \r\n\r\nSarah\r\n\r\n-- \r\nDr. Sarah ZRIBI\r\nResearch Project Manager\r\nLinagora - France\r\nPhone: +33 (0) 9 72 36 97 95\r\nCell: +33 (0) 7 69 22 72 54\r\nResearch web site: http://research.linagora.com ",
"attachments" : [{
"file_size" : "243534",
"content_name" : "presentation.pdf",
"content_type" : "APPLICATION/PDF"
}, {
"file_size": "243534",
"content_name" : "meeting.ics",
"content_type" : "APPLICATION/ICS"
}],
"emailFolder" : "INBOX",
"user" : "Zied Sellami",
"alternativeAddress" : [ "zsellami@linagora.com", "zied.sellami@linagora.com" ],
"X-Spam-Flag" : "NO"
}
\ No newline at end of file
{
"messageId" : "<9d0ed80f9e38222618c97793080275f7@linagora.com>",
"from" : {
"address" : "vstefenn@linagora.com",
"personal" : "Valérie Stefenn"
},
"to" : [ {
"address" : "zsellami@linagora.com",
"personal" : "Zied Sellami"
} ],
"cc" : [ {
"address" : "jplorre@linagora.com",
"personal" : "Jean-Pierre Lorré"
}],
"bcc" : [ ],
"Received" : "Mon, 28 Jan 2019 07:56:43 +0000",
"Date" : "Mon, 28 Jan 2019 07:56:43 +0000",
"in-Reply-To" : "<7f122e70-ed8f-b98b-4a14-fc37f0c79f26@linagora.com>",
"subject" : "Re: Linagora GSO : Convention de stage de Sonia Ratsiandavana : Pour vérification et signature",
"body" : "Bonjour,\n\nVoici le scan comportant des modifications.\n\nUn fois cela fait, Jean-Pierre peut signer en P/O\n\nMerci de nous renvoyer un scan avec les signatures.\n-- \nCordialement.\n\nValérie STEFFEN\nAssistante \n\n06.43.90.91.26\n\n------------------------------------------------\nLINAGORA\nTour Franklin\n100 Terrasse Boieldieu\n92042 Paris La Défense Cedex\nTel : +33(0)1 46 96 63 63\nfax : +33(0)1 46 96 63 64\n\n-------------------------------------------------\nhttp://www.linagora.com\n\n\"La présente transmission contient des informations confidentielles appartenant à Linagora, exclusivement destinées au(x) destinataire(s) identifié(s) ci-dessus. Si vous n'en faites pas partie, toute reproduction, distribution ou divulgation de tout ou partie des informations de cette transmission, ou toute action effectuée sur la base de celles-ci vous sont formellement interdites.\nSi vous avez reçu cette transmission par erreur, nous vous remercions de nous en avertir et de la détruire de votre système d'information.\n\nThe present transmission contains privileged and confidential information belonging to Linagora, exclusively intended for the recipient(s) thereabove identified. If you are not one of these aforementioned recipients, any reproduction, distribution, disclosure of said information in whole or in part, as well as any action undertaken on the basis of said information are strictly prohibited. If you received the present transmission by mistake, please inform us and destroy it from your messenging and information systems.\"\nLe 29/01/2019 à 10:21, Zied SELLAMI a écrit :\nBonjour Valérie,\n\nci-joint la convention de stage de Sonia Ratsiandavana que nous avons reçu de la part de son université.\n\n \n\nCordialement,\n\nZied Sellami\n\n-- \nZied Sellami\nIngénieur de Recherche - Linagora Grand Sud Ouest\n75 Route de Revel, 31500 Toulouse\nEmail : zsellami@linagora.com\nTéléphone : 05 62 19 24 91\nPortable : 06 64 53 93 84\nwww.linagora.com",
"attachments" : [{
"file_size" : "243534",
"content_name" : "convention.pdf",
"content_type" : "APPLICATION/PDF"
}],
"emailFolder" : "INBOX",
"user" : "Zied Sellami",
"alternativeAddress" : [ "zsellami@linagora.com", "zied.sellami@linagora.com" ],
"X-Spam-Flag" : "NO"
}
\ No newline at end of file
package org.linagora.priorityInbox;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.mail.internet.InternetAddress;
import org.apache.commons.io.FileUtils;
import org.joda.time.DateTime;
import org.joda.time.Hours;
import org.linagora.priorityInbox.data.CorpusType;
import org.linagora.priorityInbox.data.Edge;
import org.linagora.priorityInbox.data.Email;
import org.linagora.priorityInbox.data.LinkType;
import org.linagora.priorityInbox.emailReader.CSVReader;
import org.linagora.priorityInbox.text.Text;
import org.linagora.priorityInbox.text.TextCleaner;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.FrenchStemmer;
public class PriorityInboxLearner {
// Placeholder rule; initialised to null and not referenced anywhere in the visible part of this class.
private static String RULE = null;
// Case-insensitive regex listing French and English urgency phrases; main() searches email subjects with it.
private static String URGENT_RULE = "(?i)(très urgent|très important|very urgent|extremely important|extremely urgent|very important|extr[êe]mement important|important|rapidement|urgent|à lire attentivement|répondre rapidement|answer quickly|à traiter dans les plus brefs délais|très grave|grave)";
// Attachment type markers; in the visible code they are only used by commented-out attachment weighting.
// NOTE(review): "pttx" and "ptt" look like typos for "pptx" and "ppt" - confirm before re-enabling that code.
private static List<String> ATTACH_TYPES = Arrays.asList("docx", "doc", "odt", "pdf", "pttx", "ptt", "xls", "ods", "odp", "rtf", "csv", "opendocument", "msword");
// Stemmed variants of the mailbox owner's name (single words, space-joined and underscore-joined forms); filled by main().
private static Set<String> userNameComponents = new HashSet<String>();
/**
 * Learns the priority-inbox models for one mailbox owner from a CSV email corpus.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>fills {@code userNameComponents} with stemmed variants of the owner's name;</li>
 *   <li>parses the corpus with {@code CSVReader.parseEmails} and passes it through
 *       {@code standardizeUser};</li>
 *   <li>registers one set of TO/CC/BCC edges per email (plain, urgent, reply, urgent reply)
 *       and records which emails were answered;</li>
 *   <li>builds the keyword models from sent/answered emails and the "social" keyword
 *       models from channel emails;</li>
 *   <li>scores every contact linked to the owner and appends the ranking to
 *       {@code recipient.model}.</li>
 * </ol>
 *
 * <p>The corpus location, the owner's name and the owner's addresses are hard-coded.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    HashMap<String, Double> weightedContact = new HashMap<String, Double>();
    SnowballProgram stemmer = new FrenchStemmer();
    String DATA_FILE_PATH = "/home/zsellami/newCorpus/zied2/DATA.csv";
    String BODY_DIR_PATH = "/home/zsellami/newCorpus/zied2/emails_body/";
    String user = "Zied Sellami";

    // Whole name stemmed as a single token, joined with underscores.
    stemmer.setCurrent(user.toLowerCase());
    stemmer.stem();
    userNameComponents.add(stemmer.getCurrent().replace(" ", "_"));

    // Name parts stemmed one by one: first split on spaces only, then on spaces and hyphens
    // so that compound first names also contribute their individual parts.
    for (String separator : new String[] {" ", "[\\- ]"}) {
        StringBuilder stemmedName = new StringBuilder();
        for (String part : user.split(separator)) {
            stemmer.setCurrent(part.toLowerCase());
            stemmer.stem();
            userNameComponents.add(stemmer.getCurrent());
            stemmedName.append(stemmer.getCurrent()).append(' ');
        }
        String joined = stemmedName.toString().trim();
        userNameComponents.add(joined);
        userNameComponents.add(joined.replace(" ", "_"));
    }
    userNameComponents.remove("");
    System.out.println(userNameComponents);

    List<String> alternativeEmails = Arrays.asList("zsellami@linagora.com","zied.sellami@linagora.com");
    HashMap<String, Email> emails = CSVReader.parseEmails(DATA_FILE_PATH, BODY_DIR_PATH);
    emails = standardizeUser(user, alternativeEmails, emails);

    HashMap<String, Edge> edges = new HashMap<String, Edge>();
    HashMap<Email, List<Email>> inReplyToGraph = new HashMap<Email, List<Email>>();
    List<Email> sentEmails = new ArrayList<Email>();
    List<Email> socialEmails = new ArrayList<Email>();
    int urgentCounter = 0;

    // Compiled once: the rule is the same for every email.
    Pattern urgentPattern = Pattern.compile(URGENT_RULE);

    for (Email email : emails.values()) {
        InternetAddress from = email.getFrom();
        if (isAChannelEmail(from.getAddress())) {
            socialEmails.add(email);
        }

        boolean existUrgent = email.getSubject() != null
                && urgentPattern.matcher(email.getSubject()).find();
        if (existUrgent) {
            urgentCounter++;
        }

        boolean isReply = email.getInReplyTo() != null;
        final LinkType toType;
        final LinkType ccType;
        final LinkType bccType;
        if (!isReply) {
            toType = existUrgent ? LinkType.TO_URGENT : LinkType.TO;
            ccType = existUrgent ? LinkType.CC_URGENT : LinkType.CC;
            bccType = existUrgent ? LinkType.BCC_URGENT : LinkType.BCC;
        } else {
            toType = existUrgent ? LinkType.IN_REPLY_TO_URGENT : LinkType.IN_REPLY_TO;
            ccType = existUrgent ? LinkType.IN_REPLY_CC_URGENT : LinkType.IN_REPLY_CC;
            bccType = existUrgent ? LinkType.IN_REPLY_BCC_URGENT : LinkType.IN_REPLY_BCC;
        }
        toEdges(user, from, email.getTo(), toType, edges);
        toEdges(user, from, email.getCc(), ccType, edges);
        toEdges(user, from, email.getBcc(), bccType, edges);

        if (isReply) {
            // The referenced id may be stored with or without its angle brackets.
            Email originEmail = emails.get(email.getInReplyTo());
            if (originEmail == null) {
                originEmail = emails.get(email.getInReplyTo().replaceAll("[<>]", ""));
            }
            if (originEmail != null) {
                inReplyToGraph.computeIfAbsent(originEmail, key -> new ArrayList<Email>()).add(email);
            }
        }
    }

    // Emails sent by the owner that nobody answered; answered emails are added right after.
    // NOTE(review): the sender address is compared with the display name, so this presumably
    // relies on standardizeUser having rewritten the owner's addresses - confirm.
    for (Email email : emails.values()) {
        String senderAddress = email.getFrom().getAddress();
        if (senderAddress != null
                && senderAddress.equalsIgnoreCase(user)
                && !inReplyToGraph.containsKey(email)) {
            sentEmails.add(email);
        }
    }
    sentEmails.addAll(inReplyToGraph.keySet());
    buildKeyWordsModels(sentEmails);
    socialEmails.removeAll(sentEmails);
    buildSocialKeyWordsModels(socialEmails);

    System.out.println("Emails Size: " + emails.size());
    System.out.println("Edges Size: " +edges.size());
    System.out.println("Very urgent :"+urgentCounter);

    List<Edge> edgesList = new ArrayList<Edge>(edges.values());
    Collections.sort(edgesList);
    Edge.printAllExchanges();
    System.out.println();

    for (Edge edge : edgesList) {
        boolean involvesUser = edge.getSource().equals(user) || edge.getTarget().equals(user);
        if (!involvesUser || edge.getSource().equals(edge.getTarget())) {
            continue;
        }
        // Contacts reached through a channel address get a negative score.
        double weight = (isAChannelEmail(edge.getSource()) || isAChannelEmail(edge.getTarget()))
                ? -1d
                : 1d;
        String contact = edge.getSource().equals(user) ? edge.getTarget() : edge.getSource();
        weightedContact.put(contact, weight * edge.getScore());
    }

    // Disabled reply-delay weighting, kept for reference.
    /*HashMap<String, Double> emailWeights = new HashMap<String, Double>();
    Double TOTAL_REPLY_TIME = 0d;
    for(Entry<Email, List<Email>> key: inReplyToGraph.entrySet()) {
    //count++;
    List<Email> repliedEmails = key.getValue();
    String sender = key.getKey().getFrom();
    Double score = new Double(repliedEmails.size());
    //System.out.println(key.getKey().getFrom() + " ReplyTO number: " + repliedEmails.size());
    DateTime send = key.getKey().getEmailTime();
    int minTime = Integer.MAX_VALUE;
    for(Email replied: repliedEmails) {
    DateTime answered = replied.getEmailTime();
    if(send != null && answered != null) {
    //System.out.println("--Answered delay: " + Hours.hoursBetween(send, answered).getHours() + " hours");
    if(Hours.hoursBetween(send, answered).getHours() > 0) {
    minTime = Math.min(minTime, Hours.hoursBetween(send, answered).getHours());
    }
    if(Hours.hoursBetween(send, answered).getHours() < 0) {
    minTime = 0;
    }
    }
    }
    if(minTime > 0) {
    score = score / new Double(minTime) ;
    }
    TOTAL_REPLY_TIME = TOTAL_REPLY_TIME + score;
    Double currentScore = emailWeights.get(sender);
    if(currentScore == null) {
    emailWeights.put(sender, score);
    }else {
    emailWeights.replace(sender, score + currentScore);
    }
    }
    for(Entry<String, Double> entry: emailWeights.entrySet()) {
    entry.setValue(entry.getValue() / TOTAL_REPLY_TIME);
    }
    for(Entry<String, Double> entry: weightedContact.entrySet()) {
    Double replyScore = emailWeights.get(entry.getKey());
    if(replyScore != null) {
    entry.setValue(replyScore + entry.getValue());
    }else {
    }
    }*/

    Map<String,Double> sortedEmails =
            weightedContact.entrySet().stream()
            .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
            .collect(Collectors.toMap(Entry::getKey, Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));

    // Print the ranking and collect the model lines so the file is opened only once.
    StringBuilder recipientModel = new StringBuilder();
    for (Entry<String, Double> entry : sortedEmails.entrySet()) {
        System.out.println(entry.getKey() + " >>> " + entry.getValue());
        recipientModel.append(entry.getKey()).append('\t').append(entry.getValue()).append('\n');
    }
    if (recipientModel.length() > 0) {
        try {
            // Append mode, as before: an existing recipient.model is extended, not replaced.
            FileUtils.writeStringToFile(new File("recipient.model"), recipientModel.toString(), true);
        } catch (Exception e) {
            // Best effort, but no longer silent: a missing model file must be noticeable.
            System.err.println("Could not write recipient.model: " + e);
        }
    }
}
/**
 * Builds the keyword models of the given emails and appends them to
 * {@code body.model} and {@code subject.model}. Each n-gram is scored with its
 * share of all n-gram occurrences counted for that corpus (a positive value).
 *
 * @param emails emails whose subjects and bodies feed the models
 */
private static void buildKeyWordsModels(List<Email> emails) {
    buildAndWriteKeywordModels(emails, 1d, "body.model", "subject.model");
}

/**
 * Builds the keyword models of the given emails and appends them to
 * {@code bodySpam.model} and {@code subjectSpam.model}. Scores are computed as in
 * {@link #buildKeyWordsModels(List)} and then negated.
 *
 * @param emails emails whose subjects and bodies feed the models
 */
private static void buildSocialKeyWordsModels(List<Email> emails) {
    buildAndWriteKeywordModels(emails, -1d, "bodySpam.model", "subjectSpam.model");
}

/**
 * Shared implementation of both keyword model builders: counts n-grams per corpus,
 * normalises the counts, prints the models and appends them to the model files.
 *
 * @param emails           emails to analyse
 * @param sign             factor applied to every normalised score (1 or -1)
 * @param bodyModelFile    file receiving the body model
 * @param subjectModelFile file receiving the subject model
 */
private static void buildAndWriteKeywordModels(List<Email> emails, double sign,
        String bodyModelFile, String subjectModelFile) {
    HashMap<String, Double> subjectCounts = new HashMap<String, Double>();
    HashMap<String, Double> bodyCounts = new HashMap<String, Double>();
    double totalSubject = 0d;
    double totalBody = 0d;

    int size = emails.size();
    int counter = 1;
    for (Email email : emails) {
        System.out.println("building email keywords models... " + counter + "/" + size);
        counter++;

        String subject = email.getSubject();
        String body = email.getBody();
        if (body != null) {
            body = TextCleaner.cleanReplyBlock(body);
        }
        Text.setStopWordsDirectory("stopwords/");

        // A missing subject or body contributes nothing to language detection; plain
        // concatenation would have injected the literal text "null".
        String emailContent = (subject == null ? "" : subject) + "\n" + (body == null ? "" : body);
        Text.language = Text.detectLanguage(emailContent);

        if (subject != null) {
            totalSubject = addNGramCounts(subject, subjectCounts, totalSubject);
        }
        if (body != null) {
            totalBody = addNGramCounts(body, bodyCounts, totalBody);
        }
    }
    System.out.println("Email Corpus build !");

    Map<String, Double> bodyModel = rankAndNormalizeKeywords(bodyCounts, totalBody, sign);
    Map<String, Double> subjectModel = rankAndNormalizeKeywords(subjectCounts, totalSubject, sign);

    printAndAppendKeywordModel("Showing keywords BODY", bodyModel, bodyModelFile);
    printAndAppendKeywordModel("Showing keywords Subject", subjectModel, subjectModelFile);
}

/**
 * Counts the n-grams (up to length 3) of one text, merges them into {@code target}
 * and returns the running total increased by every count found in the text.
 */
private static double addNGramCounts(String text, HashMap<String, Double> target, double runningTotal) {
    String processedText = Text.process(text);
    List<String> ngrams = Text.toNGramsALL(3, processedText.toLowerCase());
    HashMap<String, Double> keys = Text.countTokens(String.join(" ", ngrams));
    addAllKeyWords(keys, target);
    for (Double value : keys.values()) {
        runningTotal = runningTotal + value;
    }
    return runningTotal;
}

/**
 * Orders the counts from most to least frequent and replaces each count by
 * {@code sign * (count / total)}.
 */
private static Map<String, Double> rankAndNormalizeKeywords(Map<String, Double> counts, double total, double sign) {
    Map<String, Double> ranked =
            counts.entrySet().stream()
            .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
            .collect(Collectors.toMap(Entry::getKey, Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));
    for (Entry<String, Double> entry : ranked.entrySet()) {
        entry.setValue(sign * (entry.getValue() / total));
    }
    return ranked;
}

/**
 * Prints the header and every model entry, then appends all entries to the model
 * file in a single write. Nothing is written for an empty model.
 */
private static void printAndAppendKeywordModel(String header, Map<String, Double> model, String fileName) {
    System.out.println(header);
    StringBuilder lines = new StringBuilder();
    for (Entry<String, Double> entry : model.entrySet()) {
        System.out.println(entry.getKey() + " >> " + entry.getValue());
        lines.append(entry.getKey()).append('\t').append(entry.getValue()).append('\n');
    }
    if (lines.length() == 0) {
        return;
    }
    try {
        // Append mode, as before: an existing model file is extended, not replaced.
        FileUtils.writeStringToFile(new File(fileName), lines.toString(), true);
    } catch (Exception e) {
        // Best effort, but no longer silent: a missing model file must be noticeable.
        System.err.println("Could not write keyword model " + fileName + ": " + e);
    }
}
private static void addAllKeyWords(HashMap<String, Double> keywordsToAdd, HashMap<String, Double> keywords ) {
for(Entry<String, Double> entry: keywordsToAdd.entrySet()) {
if(!entry.getKey().equals("")) {
Double score = keywords.get(entry.getKey());
if(score == null) {
keywords.put(entry.getKey(), entry.getValue());
}else {
keywords.replace(entry.getKey(), score + entry.getValue());
}