Commit e258900e authored by Zied SELLAMI's avatar Zied SELLAMI
Browse files

Update Priority-Inbox module

parent b6bee1cb
......@@ -9,6 +9,18 @@ The tools is a rest-api that compute the category and a score from a email json
3. **NOTIFICATION**: is returned if the email is a notification email, for instance a chat notification, a social media notification, or any email sent from a no-reply address.
4. **TO_READ**: if the email is not **IMPORTANT** and not a **NOTIFICATION**
### First results
A small dataset of **604** manually annotated emails (**234** important emails, **321** notification emails and **49** to_read emails) was created to test this approach.
Remember that this AI approach is not based on Machine Learning but only on generic rules that compute a priority for an email.
**The following table summarizes the results obtained:**
|Email Category|Precision|Recall|Fmeasure|
|--|--|--|--|
|Important|81.90%|73.50%|77.47%|
|To_Read|25.39%|65.30%|36.57%|
|Notification|99.62%|83.17%|90.66%|
# Running the service with Docker
## Building the image
......@@ -63,7 +75,7 @@ Sample of json file:
"Date" : "Mon, 28 Jan 2019 07:56:43 +0000",
"in-Reply-To" : "messageId_in_reply_to",
"subject" : "IMPORTANT: Testing The Priority Inbox",
"body" : "\nDear, \n\n this a test of the priority inbox\n regards,\nfirst_name last_name\nDirector Tester",
"body" : "\nDear, \n\n this a test of the priority inbox\n regards, first_name last_name of to1\nDirector Tester",
"attachments" : [{
"file_size" : "243534",
"content_name" : "presentation.pdf",
......@@ -83,6 +95,6 @@ Expected result:
```
{
"category" : "IMPORTANT",
"score" : 20.25
"score" : 14.916666666666668
}
```
This diff is collapsed.
......@@ -9,7 +9,7 @@ Député, Deputy
Conseiller, Advisor
Préfét, Prefet
Maire, Mayor
Adjoint au Maire, Deputy Mayor
Adjoint au Maire, Deputy Mayor, Adjoint au directeur, Adjoint de direction
Directeur Général, Directeur Associé, General Director, Associate Director
Directeur exécutif, Executive Director
Directeur, Director, Responsable de Direction, Directeur Adjoint, Chief Officier, CTO, Chief Technical Officier
......
package org.linagora.priorityInbox.feature;
/**
 * A single email sample: a message id together with a priority, a
 * {@link DocumentType} and, optionally, the {@link LabeledDocument} attached
 * to it through {@link #setLabeledDocument(LabeledDocument)}.
 *
 * <p>Samples are ordered by descending priority (see {@link #compareTo}).
 */
public class EmailSample implements Comparable<EmailSample> {

    /** Identifier of the email this sample refers to. */
    private String messageId;

    /** Priority of the sample; larger values sort first. */
    private long priority;

    /** Category attached to the sample. */
    private DocumentType documentType;

    /** Labeled document attached to the sample; {@code null} until one is set. */
    private LabeledDocument email;

    /** Creates an empty sample; every field is filled in through the setters. */
    public EmailSample() {
    }

    /** @return the message id of this sample */
    public String getMessageId() {
        return this.messageId;
    }

    /** @param messageId the message id of this sample */
    public void setMessageId(String messageId) {
        this.messageId = messageId;
    }

    /** @return the priority of this sample */
    public long getPriority() {
        return this.priority;
    }

    /** @param priority the priority of this sample */
    public void setPriority(long priority) {
        this.priority = priority;
    }

    /** @return the category attached to this sample */
    public DocumentType getDocumentType() {
        return this.documentType;
    }

    /** @param documentType the category attached to this sample */
    public void setDocumentType(DocumentType documentType) {
        this.documentType = documentType;
    }

    /** @return the labeled document attached to this sample, or {@code null} if none was set */
    public LabeledDocument getLabeledDocument() {
        return this.email;
    }

    /** @param email the labeled document to attach to this sample */
    public void setLabeledDocument(LabeledDocument email) {
        this.email = email;
    }

    /**
     * Orders samples by descending priority.
     *
     * @param o the sample to compare with
     * @return {@code -1} if this sample has the higher priority, {@code 1} if
     *         it has the lower one, {@code 0} when both priorities are equal
     */
    @Override
    public int compareTo(EmailSample o) {
        // Arguments are deliberately swapped: the higher priority sorts first.
        return Long.compare(o.getPriority(), getPriority());
    }
}
......@@ -31,7 +31,7 @@ public class FeatureExtractor {
private static String NOTIFICATION_REGEX = null;
private static String URGENT_REGEX = "(?i)(rappel|reminder|très urgent|très important|very urgent|extremely important|extremely urgent|very important|extr[êe]mement important|important|rapidement|urgent|à lire attentivement|répondre rapidement|answer quickly|à traiter dans les plus brefs délais|très grave|grave)";
private static String IMPORTANT_REGEX = "(?i)(rappel|reminder|très urgent|très important|very urgent|extremely important|extremely urgent|very important|extr[êe]mement important|important|rapidement|urgent|à lire attentivement|répondre rapidement|answer quickly|à traiter dans les plus brefs délais|très grave|grave|acceptée?|rejetée?|acceptation|accepted|refused|rejected|confirmée?|confirmed|undelivered)";
private static String SPAM_REGEX = "(?i)(\\bspam\\b|\\*+spam\\*+|quarantine report for)";
......@@ -57,7 +57,7 @@ public class FeatureExtractor {
private static String SPAM_FOLDER_NAME_REGEX = "(?i)(pourriels?|ignorés?|spams?|indésirables?)";
private static Pattern URGENT_PATTERN = Pattern.compile(URGENT_REGEX);
private static Pattern IMPORTANT_PATTERN = Pattern.compile(IMPORTANT_REGEX);
private static Pattern SPAM_PATTERN = Pattern.compile(SPAM_REGEX);
......@@ -112,8 +112,8 @@ public class FeatureExtractor {
int numberOfReply = numberOfReply(email);
int numberOfForward = numberOfForward(email);
boolean isUrgent = isUrgent(email);
// boolean isSpam = isSpam(email);
boolean isImportant = isImportant(email);
boolean isSpam = isSpam(email);
int numberDocAttachments = numberDocAttachments(email);
boolean containsDocAttachments = false;
......@@ -142,12 +142,12 @@ public class FeatureExtractor {
document.setAForward(isAForward);
document.setNumberOfForward(numberOfForward);
document.setNumberOfReply(numberOfReply);
document.setUrgent(isUrgent);
document.setImportant(isImportant);
document.setNumberDocAttachments(numberDocAttachments);
document.setContainsDocAttachments(containsDocAttachments);
document.setContainsMeetingInvitation(containsMeetingInvitation);
document.setANotification(isANotification);
// document.setSpam(isSpam);
document.setSpam(isSpam);
// document.toRead(toRead);
document.setNumberOfRecipient(numberOfRecipient);
document.setToRecipient(isToRecipient);
......@@ -215,8 +215,8 @@ public class FeatureExtractor {
int numberOfReply = numberOfReply(email);
int numberOfForward = numberOfForward(email);
boolean isUrgent = isUrgent(email);
// boolean isSpam = isSpam(email);
boolean isImportant = isImportant(email);
boolean isSpam = isSpam(email);
int numberDocAttachments = numberDocAttachments(email);
boolean containsDocAttachments = false;
......@@ -245,12 +245,12 @@ public class FeatureExtractor {
document.setAForward(isAForward);
document.setNumberOfForward(numberOfForward);
document.setNumberOfReply(numberOfReply);
document.setUrgent(isUrgent);
document.setImportant(isImportant);
document.setNumberDocAttachments(numberDocAttachments);
document.setContainsDocAttachments(containsDocAttachments);
document.setContainsMeetingInvitation(containsMeetingInvitation);
document.setANotification(isANotification);
// document.setSpam(isSpam);
document.setSpam(isSpam);
// document.toRead(toRead);
document.setNumberOfRecipient(numberOfRecipient);
document.setToRecipient(isToRecipient);
......@@ -349,7 +349,7 @@ public class FeatureExtractor {
double score = document.getScore();
if (document.isRecieved()) {
DateTime emailTime = document.getEmail().getEmailTime();
if (emailTime.getYear() == 2018 && emailTime.getDayOfMonth() == 20 && emailTime.getMonthOfYear() == 4) {
if (emailTime.getYear() == 2018 && emailTime.getDayOfMonth() == 13 && emailTime.getMonthOfYear() == 4) {
System.out.println(document.getEmail().getFrom().getPersonal() + " " + document.getEmail().getTo()
+ " " + document.getEmail().getEmailTime() + " " + document.getEmail().getSubject() + " "
+ score);
......@@ -572,10 +572,10 @@ public class FeatureExtractor {
return forwardCounter;
}
public static boolean isUrgent(Email email) {
public static boolean isImportant(Email email) {
if (email.getSubject() != null) {
return URGENT_PATTERN.matcher(email.getSubject()).find();
return IMPORTANT_PATTERN.matcher(email.getSubject()).find();
}
if (email.getEmailFolder() != null) {
......@@ -698,16 +698,16 @@ public class FeatureExtractor {
String signature = "";
double rankScore = 0d;
HashMap<String, List<PositionRank>> positionRanks = PositionRank.getPositionsRank(JOB_POSITION_RULES);
HashMap<String, List<PositionWeight>> positionRanks = PositionWeight.getPositionWeights(JOB_POSITION_RULES);
if (email.getBody() != null) {
if (email.getFrom().getPersonal() != null) {
signature = TextCleaner.detectSignature(email.getBody(), email.getFrom().getPersonal());
}
}
for (PositionRank rank : positionRanks.get("all.txt")) {
for (PositionWeight rank : positionRanks.get("all.txt")) {
boolean rankFind = rank.getRegex().matcher(signature).find();
if (rankFind) {
rankScore = rank.getRank() * 1d;
rankScore = rank.getWeight();
break;
}
}
......
......@@ -15,7 +15,7 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
private boolean isCcRecipient = false;
private boolean isBccRecipient = false;
private boolean isAForward = false;
private boolean isUrgent = false;
private boolean isImportant = false;
private boolean isSpam = false;
private boolean containsDocAttachments = false;
private boolean containsMeetingInvitation = false;
......@@ -40,6 +40,8 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
private boolean existReplyText;
public static double IMPORTANT_SCORE_THRESHOLD = 10d;
public LabeledDocument() {
}
......@@ -77,11 +79,11 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
public void setAForward(boolean isAForward) {
this.isAForward = isAForward;
}
public boolean isUrgent() {
return isUrgent;
public boolean isImportant() {
return isImportant;
}
public void setUrgent(boolean isUrgent) {
this.isUrgent = isUrgent;
public void setImportant(boolean isImportant) {
this.isImportant = isImportant;
}
public boolean isContainsDocAttachments() {
return containsDocAttachments;
......@@ -189,9 +191,11 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
public DocumentType getDocumentType() {
if(documentType == null) {
if(isANotification) {documentType = DocumentType.NOTIFICATION;}
else if(isAReply || isAForward || emailTriggeringAReply || isUrgent || this.getScore() >= 10d) {documentType = DocumentType.IMPORTANT;}
else {documentType = DocumentType.TO_READ;}
if(isAReply || isAForward || emailTriggeringAReply || isImportant || this.getScore() >= IMPORTANT_SCORE_THRESHOLD) {documentType = DocumentType.IMPORTANT;}
else if(isANotification && (isAReply || isAForward) && !isSpam) {documentType = DocumentType.TO_READ;}
else if(isANotification || isSpam) {documentType = DocumentType.NOTIFICATION;}
else {documentType = DocumentType.TO_READ;}
}
......@@ -208,7 +212,7 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
if(isAReply) {score = score + numberOfReply * 1d;} // rule 2
if(isAForward) {score = score + numberOfForward * 1d;} // rule 3
if((isAReply || isAForward) && existReplyText) {score = score + 1d;} // rule 4
if(isUrgent) {score = score + 1d;} // rule 5
if(isImportant) {score = score + 1d;} // rule 5
if(containsDocAttachments) {score = score + numberDocAttachments * 1d;} // rule 6
if(containsMeetingInvitation) {score = score + 1d;} // rule 7
......@@ -234,7 +238,7 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
public String toString() {
return "LabeledDocument [isSent=" + isSent + ", isRecieved=" + isRecieved + ", isAReply=" + isAReply
+ ", isToRecipient=" + isToRecipient + ", isCcRecipient=" + isCcRecipient + ", isBccRecipient="
+ isBccRecipient + ", isAForward=" + isAForward + ", isUrgent=" + isUrgent + ", isSpam=" + isSpam
+ isBccRecipient + ", isAForward=" + isAForward + ", isImportant=" + isImportant + ", isSpam=" + isSpam
+ ", containsDocAttachments=" + containsDocAttachments + ", containsMeetingInvitation="
+ containsMeetingInvitation + ", isANotification=" + isANotification + ", emailTriggeringAReply="
+ emailTriggeringAReply + ", numberOfRecipient=" + numberOfRecipient
......
......@@ -8,59 +8,38 @@ import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
public class PositionRank {
public class PositionWeight {
private Pattern regex = null;
private int rank = 0;
private double weight = 0;
private static HashMap<String, List<PositionRank>> positionRanks = new HashMap<String, List<PositionRank>>();
private static HashMap<String, List<PositionWeight>> positionWeights = new HashMap<String, List<PositionWeight>>();
public static double maxWeight = LabeledDocument.IMPORTANT_SCORE_THRESHOLD * 2d;
public static HashMap<String, List<PositionRank>> getPositionsRank(String directory){
if(positionRanks.isEmpty()) {
try {
File dir = new File(directory);
for(File f: dir.listFiles()) {
List<PositionRank> ranks = new ArrayList<PositionRank>();
positionRanks.put(f.getName(), ranks);
List<String> lines = FileUtils.readLines(f);
int size = lines.size();
for(String line: lines) {
List<String> elements = new ArrayList<String>();
for(String element: line.split(", ?")) {
elements.add(Pattern.quote(element.trim()));
}
PositionRank positionRank = new PositionRank();
positionRank.setRank(size);
positionRank.setRegex(Pattern.compile(String.join("|", elements), Pattern.CASE_INSENSITIVE));
size --;
ranks.add(positionRank);
}
}
}catch(Exception e) {
e.printStackTrace();
}
}
return positionRanks;
public static HashMap<String, List<PositionWeight>> getPositionWeights(String directory){
File dir = new File(directory);
return getPositionWeights(dir);
}
public static HashMap<String, List<PositionRank>> getPositionsRank(File directory){
if(positionRanks.isEmpty()) {
public static HashMap<String, List<PositionWeight>> getPositionWeights(File directory){
if(positionWeights.isEmpty()) {
try {
for(File f: directory.listFiles()) {
List<PositionRank> ranks = new ArrayList<PositionRank>();
positionRanks.put(f.getName(), ranks);
List<PositionWeight> ranks = new ArrayList<PositionWeight>();
positionWeights.put(f.getName(), ranks);
List<String> lines = FileUtils.readLines(f);
int size = lines.size();
double step = maxWeight / (double)size;
double allStep = 0d;
for(String line: lines) {
List<String> elements = new ArrayList<String>();
for(String element: line.split(", ?")) {
elements.add(Pattern.quote(element.trim()));
}
PositionRank positionRank = new PositionRank();
positionRank.setRank(size);
PositionWeight positionRank = new PositionWeight();
positionRank.setWeight(maxWeight - allStep);
allStep = allStep + step;
positionRank.setRegex(Pattern.compile(String.join("|", elements), Pattern.CASE_INSENSITIVE));
size --;
ranks.add(positionRank);
......@@ -71,15 +50,15 @@ public class PositionRank {
}
}
return positionRanks;
return positionWeights;
}
public PositionRank() {
public PositionWeight() {
}
public PositionRank(Pattern regex, int rank) {
public PositionWeight(Pattern regex, int rank) {
this.regex = regex;
this.rank = rank;
this.weight = rank;
}
......@@ -91,12 +70,12 @@ public class PositionRank {
this.regex = regex;
}
public int getRank() {
return rank;
public double getWeight() {
return weight;
}
public void setRank(int rank) {
this.rank = rank;
public void setWeight(double rank) {
this.weight = rank;
}
}
package org.linagora.priorityInbox.feature;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.linagora.priorityInbox.data.Email;
import org.linagora.priorityInbox.data.EmailFolder;
import org.linagora.priorityInbox.data.Inbox;
import org.linagora.priorityInbox.emailReader.CSVReader;
import org.linagora.priorityInbox.text.TextCleaner;
public class TestEmailSample {
/**
 * Evaluates the rule-based categorisation against the annotations stored in
 * {@code annotated.csv}.
 *
 * <p>Each line of {@code annotated.csv} is tab separated: message id, priority
 * (a long) and a label ({@code T}/{@code R} for TO_READ, {@code I} for
 * IMPORTANT, {@code N} for NOTIFICATION). The inbox is then parsed, every
 * annotated message id is looked up in every folder, and the category computed
 * by {@link LabeledDocument#getDocumentType()} is compared with the annotated
 * one. Precision, recall and F-measure are printed per category, followed by
 * the elapsed time in minutes.
 *
 * <p>Malformed annotation lines and annotated ids that are not found in the
 * inbox are reported on {@code System.err} and skipped instead of aborting the
 * run.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<EmailSample> emailSamples = new ArrayList<EmailSample>();
    int notif = 0;
    int important = 0;
    int read = 0;
    int correctRead = 0;
    int errorRead = 0;
    int allResultRead = 0;
    int correctNotification = 0;
    int errorNotification = 0;
    int allResultNotification = 0;
    int correctImportant = 0;
    int errorImportant = 0;
    int allResultImportant = 0;

    try {
        List<String> lines = FileUtils.readLines(new File("annotated.csv"));
        for (String line : lines) {
            String[] elements = line.split("\t");
            if (elements.length < 3) {
                // A short line used to raise ArrayIndexOutOfBoundsException and
                // silently drop every annotation after it.
                System.err.println("Skipping malformed annotation line: " + line);
                continue;
            }
            long priority;
            try {
                priority = Long.parseLong(elements[1]);
            } catch (NumberFormatException e) {
                System.err.println("Skipping annotation line with invalid priority: " + line);
                continue;
            }

            // An unrecognised label leaves the type null: such a sample matches
            // none of the "expected" checks below and only contributes to the
            // allResult* counters.
            DocumentType documentType = null;
            String label = elements[2];
            if (label.equals("T") || label.equals("R")) {
                documentType = DocumentType.TO_READ;
                read++;
            } else if (label.equals("I")) {
                documentType = DocumentType.IMPORTANT;
                important++;
            } else if (label.equals("N")) {
                documentType = DocumentType.NOTIFICATION;
                notif++;
            }

            EmailSample sample = new EmailSample();
            sample.setMessageId(elements[0]);
            sample.setDocumentType(documentType);
            sample.setPriority(priority);
            emailSamples.add(sample);
        }
        System.out.println("Important Email: " + important);
        System.out.println("Notification Email: " + notif);
        System.out.println("To_Read Email: " + read);
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println(emailSamples.size());

    String DATA_FILE_PATH = "/home/zsellami/newCorpus/zied2/DATA.csv";
    String BODY_DIR_PATH = "/home/zsellami/newCorpus/zied2/emails_body/";
    String user = "Zied Sellami";
    List<String> alternativeEmails = Arrays.asList("zsellami@linagora.com", "zied.sellami@linagora.com");

    long startTime = System.currentTimeMillis();
    Inbox inbox = CSVReader.parseInbox(DATA_FILE_PATH, BODY_DIR_PATH);
    inbox.setUser(user);
    inbox.setEmailAddresses(alternativeEmails);
    System.out.println("Personal email address standardized with the user name");

    FeatureExtractor.JOB_POSITION_RULES = new File("jobpositions/");
    FeatureExtractor.NOTIFICATION_CHANNELS = new File("channels/");
    TextCleaner.CLEANING_REGEX = new File("TextCleaner.regex");

    // Attach a labeled document to every annotated sample found in the inbox.
    // A sample present in several folders keeps the document of the last one.
    for (EmailFolder folder : inbox.getEmailFolders().values()) {
        for (EmailSample sample : emailSamples) {
            List<Email> emails = folder.searchEmail(sample.getMessageId());
            if (!emails.isEmpty()) {
                Email email = emails.get(0);
                email.setUser(user);
                email.setAlternativeAddress(alternativeEmails);
                sample.setLabeledDocument(FeatureExtractor.toLabeledDocument(email));
            }
        }
    }

    for (EmailSample sample : emailSamples) {
        LabeledDocument labeled = sample.getLabeledDocument();
        if (labeled == null) {
            // The id was not found in any folder; dereferencing it used to
            // throw a NullPointerException. The sample stays in the recall
            // denominator (it was annotated) but cannot be a hit or an error.
            System.err.println("No email found for annotated message id: " + sample.getMessageId());
            continue;
        }
        DocumentType expected = sample.getDocumentType();
        DocumentType predicted = labeled.getDocumentType();

        if (expected == DocumentType.TO_READ) {
            if (predicted == expected) {
                correctRead++;
            } else {
                errorRead++;
            }
        }
        if (predicted == DocumentType.TO_READ) {
            allResultRead++;
        }

        if (expected == DocumentType.NOTIFICATION) {
            if (predicted == expected) {
                correctNotification++;
            } else {
                errorNotification++;
            }
        }
        if (predicted == DocumentType.NOTIFICATION) {
            allResultNotification++;
        }

        if (expected == DocumentType.IMPORTANT) {
            if (predicted == expected) {
                correctImportant++;
            } else {
                errorImportant++;
            }
        }
        if (predicted == DocumentType.IMPORTANT) {
            allResultImportant++;
        }
    }

    printCategoryReport("Important email : ", allResultImportant, errorImportant, correctImportant, important);
    printCategoryReport("TO_READ ", allResultRead, errorRead, correctRead, read);
    printCategoryReport("Notification ", allResultNotification, errorNotification, correctNotification, notif);

    long endTime = System.currentTimeMillis();
    double duration = ((endTime - startTime) / 1000d) / 60d;
    System.out.println("Time execution duration: " + duration + "mn");
}

/**
 * Prints precision, recall and F-measure for one category on a single line.
 *
 * @param prefix    text printed before the counters (category name)
 * @param allResult number of samples the classifier assigned to the category
 * @param error     number of samples annotated with the category but classified otherwise
 * @param correct   number of samples annotated and classified with the category
 * @param annotated number of samples annotated with the category
 */
private static void printCategoryReport(String prefix, int allResult, int error, int correct, int annotated) {
    // Floating-point division: an empty category yields NaN rather than an exception.
    double recall = (double) correct / (double) annotated;
    double precision = (double) correct / (double) allResult;
    double fmeasure = 2d * ((precision * recall) / (precision + recall));
    System.out.println(prefix + "allResult: " + allResult + ", error: " + error + ", correctResult: " + correct
            + ", Precision :" + precision + ", rappel: " + recall + ", fmeasure: " + fmeasure);
}