Commit a53e50d5 authored by Zied SELLAMI's avatar Zied SELLAMI
Browse files

Integrate new rules

parent ba6aa0f7
......@@ -2,10 +2,11 @@
The priority-inbox is a Maven Java project. This module assigns a category (**IMPORTANT, TO_READ or NOTIFICATION**) and a score to an email. The priority-inbox is based on a set of generic rules.
The tool is a REST API that computes the category and a score from an email JSON file.
The tool is a REST API that computes the category and a score from an email JSON file. On average, the service is able to process 169 emails/second (result obtained by processing 7683 emails).
### Supported categories
1. **IMPORTANT**: is returned if the email is important. **14 generic rules** are implemented to decide whether an email is important. For instance, we analyse the signature of the email: an email is considered important if its sender (from) is the CEO of a company or the Director of a Department. An email is also considered important if its body contains many threads (many replies) or many forwards.
1. **IMPORTANT**: is returned if the email is important. **15 generic rules** are implemented to decide whether an email is important. For instance, we analyse the signature of the email: an email is considered important if its sender (from) is the CEO of a company or the Director of a Department. An email is also considered important if its body contains many threads (many replies) or many forwards.
3. **NOTIFICATION**: is returned if the email is a notification email. For instance, a chat notification, a social media notification, or any email sent from a no-reply address.
4. **TO_READ**: is returned if the email is neither **IMPORTANT** nor a **NOTIFICATION**.
......@@ -17,9 +18,9 @@ Remember that this AI approach are not based on Machine Learning but only on gen
| Email category | Precision | Recall | Fmeasure |
| -------- | -------- | -------- | -------- |
| Important | 84.50% | 77.41% | 80.80% |
| Important | 84.80% | 77.41% | 80.94% |
| To_Read | 33.76% | 69.33% | 45.41% |
| Notification | 99.62% | 83.17% | 90.66% |
| Notification | 99.62% | 83.48% | 90.84% |
# Running the service with Docker
......
package org.linagora.priorityInbox.data;
/**
 * Identifies which textual part of an email a corpus refers to.
 *
 * NOTE(review): no usage of this enum is visible in this change; presumably it
 * selects whether text is taken from the subject line or from the body —
 * confirm against the callers.
 */
public enum CorpusType {

    /** The subject part of an email. */
    SUBJECT,

    /** The body part of an email. */
    BODY
}
......@@ -8,18 +8,18 @@ public class Edge implements Comparable<Edge>{
private String source = null;
private String target = null;
private HashMap<LinkType, Long> exchanges = new HashMap<LinkType, Long>();
private HashMap<EdgeType, Long> exchanges = new HashMap<EdgeType, Long>();
private static HashMap<LinkType, Long> ALL_Exchanges = new HashMap<LinkType, Long>();
private static HashMap<EdgeType, Long> ALL_Exchanges = new HashMap<EdgeType, Long>();
public Edge() {
for(LinkType linkType : LinkType.values()) {
for(EdgeType linkType : EdgeType.values()) {
exchanges.put(linkType, 0L);
}
if(ALL_Exchanges.isEmpty()) {
for(LinkType linkType : LinkType.values()) {
for(EdgeType linkType : EdgeType.values()) {
ALL_Exchanges.put(linkType, 0L);
}
}
......@@ -29,28 +29,28 @@ public class Edge implements Comparable<Edge>{
public Edge(String source, String target) {
this.source = source;
this.target = target;
for(LinkType linkType : LinkType.values()) {
for(EdgeType linkType : EdgeType.values()) {
exchanges.put(linkType, 0L);
}
if(ALL_Exchanges.isEmpty()) {
for(LinkType linkType : LinkType.values()) {
for(EdgeType linkType : EdgeType.values()) {
ALL_Exchanges.put(linkType, 0L);
}
}
}
public Edge(String source, String target, LinkType linkType) {
public Edge(String source, String target, EdgeType linkType) {
this.source = source;
this.target = target;
for(LinkType linkType_ : LinkType.values()) {
for(EdgeType linkType_ : EdgeType.values()) {
exchanges.put(linkType_, 0L);
}
if(ALL_Exchanges.isEmpty()) {
for(LinkType linkType_ : LinkType.values()) {
for(EdgeType linkType_ : EdgeType.values()) {
ALL_Exchanges.put(linkType_, 0L);
}
}
......@@ -84,11 +84,11 @@ public class Edge implements Comparable<Edge>{
return frequency;
}
public long getFrequency(LinkType linkType) {
public long getFrequency(EdgeType linkType) {
return exchanges.get(linkType);
}
public void incrementFrequency(LinkType linkType) {
public void incrementFrequency(EdgeType linkType) {
Long currentValue = exchanges.get(linkType);
Long newValue = currentValue + 1L;
......@@ -103,7 +103,7 @@ public class Edge implements Comparable<Edge>{
public double getScore() {
double score = 0d;
double zeroValue = 0d;
for(Entry<LinkType, Long> entry: exchanges.entrySet()) {
for(Entry<EdgeType, Long> entry: exchanges.entrySet()) {
Double value = 0d;
Double totalLinkTypeEdge = new Double(ALL_Exchanges.get(entry.getKey()));
if(totalLinkTypeEdge > 0d) {
......@@ -120,7 +120,7 @@ public class Edge implements Comparable<Edge>{
return finalScore;
}
public HashMap<LinkType, Long> getExchanges(){
public HashMap<EdgeType, Long> getExchanges(){
return this.exchanges;
}
......
package org.linagora.priorityInbox.data;
public enum LinkType {
TO, CC, BCC, IN_REPLY_TO, IN_REPLY_CC, IN_REPLY_BCC, TO_URGENT, CC_URGENT, BCC_URGENT, IN_REPLY_TO_URGENT, IN_REPLY_CC_URGENT, IN_REPLY_BCC_URGENT;
public enum EdgeType {
TO, CC, BCC, IN_REPLY_TO, IN_REPLY_CC, IN_REPLY_BCC;
public static Double toWeight(LinkType linkType) {
public static Double toWeight(EdgeType linkType) {
switch(linkType) {
case TO: return 1d;
case CC: return 0.8d;
case BCC: return 0.6d;
case BCC: return 0.5d;
case IN_REPLY_TO: return 2d;
case IN_REPLY_CC: return 1.8d;
case IN_REPLY_BCC: return 1.6d;
case TO_URGENT: return 3d;
case CC_URGENT: return 2.8d;
case BCC_URGENT: return 2.6d;
case IN_REPLY_TO_URGENT: return 4d;
case IN_REPLY_CC_URGENT: return 3.8d;
case IN_REPLY_BCC_URGENT: return 3.6d;
case IN_REPLY_BCC: return 1.5d;
default: return 1d;
}
}
public static Double totalWeighting() {
Double total = 0d;
for(LinkType linkType: LinkType.values()) {
for(EdgeType linkType: EdgeType.values()) {
total = total + toWeight(linkType);
}
return total;
......
package org.linagora.priorityInbox.data;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
......
package org.linagora.priorityInbox.feature;
import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import javax.mail.internet.InternetAddress;
import org.apache.commons.io.FileUtils;
import org.linagora.priorityInbox.data.Email;
import org.linagora.priorityInbox.text.Text;
import org.linagora.priorityInbox.text.TextCleaner;
/**
 * Computes per-user, history-dependent ("dynamic") weights for an email.
 *
 * Two models are kept on disk for every user, under
 * {@code <modelDirectory>/<userId>/}:
 * <ul>
 *   <li>{@code senderModel} - one entry per sender address seen so far;</li>
 *   <li>{@code textModel} - one entry per word seen so far.</li>
 * </ul>
 * Each model line is parsed as four tab-separated fields: word, frequency,
 * total number of documents, number of documents containing the word (see
 * {@link #readModelFromFile(File)}).
 *
 * Every call to {@link #getSenderWeight(Email)} or
 * {@link #getEmailContentWeight(Email)} reads the model, updates it with the
 * given email, writes it back, and then derives a TF-IDF style weight from the
 * updated counters. No synchronization is performed around this
 * read-modify-write cycle.
 */
public class DynamicFeature {

    /** Root directory of the per-user models; must be set before any weight is computed. */
    public static String modelDirectory = null;

    /** File name of the per-user sender model. */
    private static final String SENDER_MODEL_FILE = "senderModel";

    /** File name of the per-user text model. */
    private static final String TEXT_MODEL_FILE = "textModel";

    /**
     * Returns the sender model file of a user, creating the user directory and
     * an empty file when they do not exist yet.
     *
     * @param userId identifier of the user (see {@link #toUserId(String, List)})
     * @return the model file, or {@code null} if it could not be prepared
     */
    private static File getSenderModel(String userId) {
        return getModelFile(userId, SENDER_MODEL_FILE);
    }

    /**
     * Returns the text model file of a user, creating the user directory and
     * an empty file when they do not exist yet.
     *
     * @param userId identifier of the user (see {@link #toUserId(String, List)})
     * @return the model file, or {@code null} if it could not be prepared
     */
    private static File getTextModel(String userId) {
        return getModelFile(userId, TEXT_MODEL_FILE);
    }

    /**
     * Resolves {@code <modelDirectory>/<userId>/<fileName>} and makes sure the
     * file exists (empty) so that it can be read right away.
     *
     * @param userId   identifier of the user owning the model
     * @param fileName name of the model file inside the user directory
     * @return the model file, or {@code null} if any step failed (the failure
     *         is reported on the standard error stream)
     */
    private static File getModelFile(String userId, String fileName) {
        File modelFile = null;
        try {
            Path userModelPath = Paths.get(modelDirectory, userId);
            File candidate = userModelPath.resolve(fileName).toFile();
            File directory = userModelPath.toFile();
            if (!directory.exists()) {
                directory.mkdirs();
            }
            if (!candidate.exists()) {
                FileUtils.writeStringToFile(candidate, "");
            }
            modelFile = candidate;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return modelFile;
    }

    /**
     * Builds the identifier under which the models of a user are stored:
     * {@code <hash of user>_<hash of the sorted alternative addresses>}.
     *
     * The addresses are sorted on a private copy, so the caller's list is left
     * untouched and immutable lists are accepted. Sorting makes the identifier
     * independent of the order in which the addresses are supplied.
     *
     * @param user               name of the user; must not be {@code null}
     * @param alternativeAddress addresses of the user; {@code null} is treated
     *                           as an empty list
     * @return the user identifier
     */
    private static String toUserId(String user, List<String> alternativeAddress) {
        List<String> sortedAddresses = new ArrayList<String>();
        if (alternativeAddress != null) {
            sortedAddresses.addAll(alternativeAddress);
        }
        Collections.sort(sortedAddresses);
        return user.hashCode() + "_" + sortedAddresses.hashCode();
    }

    /**
     * Returns the number of documents recorded in a model.
     *
     * Every entry is rewritten with the same total on each save, so the
     * entries normally agree. The maximum is taken instead of the value of
     * whichever entry the map happens to iterate first, which keeps the result
     * independent of the map's iteration order.
     *
     * @param models the model entries
     * @return the recorded document count, {@code 0} for an empty model
     */
    private static int currentDocumentCount(HashMap<String, TextDataModel> models) {
        int documentCount = 0;
        for (TextDataModel model : models.values()) {
            documentCount = Math.max(documentCount, model.getTotalDocumentSize());
        }
        return documentCount;
    }

    /**
     * Stamps every entry with the given document count and writes the whole
     * model to the given file, replacing its previous content.
     *
     * @param modelFile     destination file
     * @param models        the model entries
     * @param documentCount total number of documents to record on every entry
     * @throws Exception if the file cannot be written
     */
    private static void saveModels(File modelFile, HashMap<String, TextDataModel> models, int documentCount)
            throws Exception {
        StringBuilder serialized = new StringBuilder();
        for (TextDataModel model : models.values()) {
            model.setTotalDocumentSize(documentCount);
            serialized.append(model.toStringData());
        }
        FileUtils.write(modelFile, serialized.toString());
    }

    /**
     * Updates the user's sender model with the sender of the given email and
     * returns {@code log10(totalDocuments / documentsFromThisSender)}, i.e. an
     * IDF value with the term frequency fixed to 1.
     *
     * The model on disk is updated as a side effect of every call.
     *
     * @param email the email to score
     * @return the sender weight, or {@code 0} when the sender address is
     *         missing or when any step fails (the failure is reported on the
     *         standard error stream)
     */
    public static double getSenderWeight(Email email) {
        double senderWeight = 0d;
        try {
            String userId = toUserId(email.getUser(), email.getAlternativeAddress());
            File senderModel = getSenderModel(userId);
            HashMap<String, TextDataModel> models = readModelFromFile(senderModel);
            int documentCount = currentDocumentCount(models) + 1;

            TextDataModel senderEntry = null;
            String sender = email.getFrom().getAddress();
            if (sender != null) {
                // Look up and store under the same lower-cased key.
                String senderKey = sender.toLowerCase();
                senderEntry = models.get(senderKey);
                if (senderEntry == null) {
                    senderEntry = new TextDataModel(senderKey, 1, 0, 1);
                    models.put(senderKey, senderEntry);
                } else {
                    senderEntry.setFrequency(senderEntry.getFrequency() + 1);
                    senderEntry.setDocumentContainingWord(senderEntry.getDocumentContainingWord() + 1);
                }
            }

            saveModels(senderModel, models, documentCount);

            // Without a sender address there is nothing to weigh; keep 0.
            if (senderEntry != null) {
                double tf = 1d;
                double idf = Math.log10((double) senderEntry.getTotalDocumentSize()
                        / (double) senderEntry.getDocumentContainingWord());
                senderWeight = tf * idf;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return senderWeight;
    }

    /**
     * Updates the user's text model with the words of the given email and
     * returns the sum of the TF-IDF values of those words, where TF is the
     * word count divided by the total word count of the email and IDF is
     * {@code log10(totalDocuments / documentsContainingWord)}.
     *
     * The model on disk is updated as a side effect of every call.
     *
     * @param email the email to score
     * @return the content weight; {@code 0} (or the partial sum accumulated so
     *         far) when a step fails, the failure being reported on the
     *         standard error stream
     */
    public static double getEmailContentWeight(Email email) {
        double contentWeight = 0d;
        try {
            String userId = toUserId(email.getUser(), email.getAlternativeAddress());
            File textModel = getTextModel(userId);
            HashMap<String, TextDataModel> models = readModelFromFile(textModel);
            int documentCount = currentDocumentCount(models) + 1;

            String language = detectLanguage(email);
            HashMap<String, Integer> emailWords = toBagOfWords(email, language);

            // Update the model with the words of this email and remember the
            // entries that belong to it.
            List<TextDataModel> documentModel = new ArrayList<TextDataModel>();
            int emailWordFrequency = 0;
            for (Entry<String, Integer> entry : emailWords.entrySet()) {
                emailWordFrequency = emailWordFrequency + entry.getValue();
                TextDataModel wordEntry = models.get(entry.getKey());
                if (wordEntry == null) {
                    wordEntry = new TextDataModel(entry.getKey(), entry.getValue(), 0, 1);
                    models.put(entry.getKey(), wordEntry);
                } else {
                    wordEntry.setFrequency(wordEntry.getFrequency() + entry.getValue());
                    wordEntry.setDocumentContainingWord(wordEntry.getDocumentContainingWord() + 1);
                }
                documentModel.add(wordEntry);
            }

            saveModels(textModel, models, documentCount);

            for (TextDataModel wordEntry : documentModel) {
                String word = wordEntry.getWord();
                double tf = (double) emailWords.get(word) / (double) emailWordFrequency;
                double idf = Math.log10((double) wordEntry.getTotalDocumentSize()
                        / (double) wordEntry.getDocumentContainingWord());
                contentWeight = contentWeight + tf * idf;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return contentWeight;
    }

    /**
     * Reads a model file into a map keyed by word.
     *
     * Each line is expected to hold four tab-separated fields: word,
     * frequency, total number of documents, number of documents containing the
     * word. A line that does not match this layout is skipped on its own, so
     * one damaged line no longer discards every entry that follows it.
     *
     * @param file the model file
     * @return the entries read; empty when the file cannot be read (the
     *         failure is reported on the standard error stream)
     */
    public static HashMap<String, TextDataModel> readModelFromFile(File file) {
        HashMap<String, TextDataModel> models = new HashMap<String, TextDataModel>();
        try {
            List<String> lines = FileUtils.readLines(file);
            for (String line : lines) {
                if (line.isEmpty()) {
                    continue;
                }
                String[] elements = line.split("\t");
                if (elements.length < 4) {
                    System.err.println("Skipping malformed model line in " + file + ": " + line);
                    continue;
                }
                try {
                    String word = elements[0];
                    int frequency = Integer.parseInt(elements[1]);
                    int totalDocumentSize = Integer.parseInt(elements[2]);
                    int documentContainingWord = Integer.parseInt(elements[3]);
                    models.put(word,
                            new TextDataModel(word, frequency, totalDocumentSize, documentContainingWord));
                } catch (NumberFormatException e) {
                    System.err.println("Skipping malformed model line in " + file + ": " + line);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return models;
    }

    /**
     * Detects the language of an email from its subject followed by its body
     * (each part is used only when it is not {@code null}).
     *
     * @param email the email to inspect
     * @return whatever {@code Text.detectLanguage} returns for that text
     */
    public static String detectLanguage(Email email) {
        String content = "";
        if (email.getSubject() != null) {
            content = email.getSubject();
        }
        if (email.getBody() != null) {
            content = content + "\n" + email.getBody();
        }
        return Text.detectLanguage(content);
    }

    /**
     * Returns the body of an email after passing it through
     * {@code TextCleaner.cleanReplyBlock} and removing the text reported by
     * {@code TextCleaner.detectSignature}.
     *
     * NOTE(review): judging by their names these helpers drop quoted replies
     * and locate the sender's signature - confirm against TextCleaner.
     *
     * @param email an email whose body and sender are not {@code null}
     * @return the cleaned body
     */
    private static String cleanBody(Email email) {
        String cleanedText = TextCleaner.cleanReplyBlock(email.getBody());
        String signature = TextCleaner.detectSignature(email.getBody(), email.getFrom().getPersonal());
        return cleanedText.replace(signature, "");
    }

    /**
     * Runs {@code Text.process} on the given text, counts the tokens of the
     * lower-cased result and returns the counts truncated to integers.
     *
     * The language must already have been set with {@code Text.setLanguage}.
     *
     * @param text      the raw text
     * @param keyPrefix prefix prepended to every token; empty for none
     * @return token counts keyed by (prefixed) token
     */
    private static HashMap<String, Integer> countWords(String text, String keyPrefix) {
        HashMap<String, Integer> model = new HashMap<String, Integer>();
        String processed = Text.process(text);
        HashMap<String, Double> frequency = Text.countTokens(processed.toLowerCase());
        for (Entry<String, Double> entry : frequency.entrySet()) {
            String key = keyPrefix.isEmpty() ? entry.getKey() : keyPrefix + entry.getKey();
            model.put(key, entry.getValue().intValue());
        }
        return model;
    }

    /**
     * Builds the bag of words of the subject only, every key prefixed with
     * {@code "S_"}.
     *
     * Currently not called from this class; kept for a variant that weighs
     * subject and body separately.
     *
     * @param email    the email
     * @param language the language to process the text with
     * @return the word counts; empty when the email has no subject
     */
    private static HashMap<String, Integer> toSubjectBagOfWords(Email email, String language) {
        if (email.getSubject() == null) {
            return new HashMap<String, Integer>();
        }
        Text.setLanguage(language);
        return countWords(email.getSubject(), "S_");
    }

    /**
     * Builds the bag of words of the whole email: the subject followed by the
     * cleaned body (see {@link #cleanBody(Email)}).
     *
     * @param email    the email
     * @param language the language to process the text with
     * @return the word counts; empty when the email has neither subject nor body
     */
    private static HashMap<String, Integer> toBagOfWords(Email email, String language) {
        String content = "";
        if (email.getSubject() != null) {
            content = email.getSubject();
        }
        if (email.getBody() != null) {
            content = content + " \n" + cleanBody(email);
        }
        if (content.equals("")) {
            return new HashMap<String, Integer>();
        }
        Text.setLanguage(language);
        return countWords(content, "");
    }

    /**
     * Builds the bag of words of the cleaned body only (see
     * {@link #cleanBody(Email)}), every key prefixed with {@code "B_"}.
     *
     * Currently not called from this class; kept for a variant that weighs
     * subject and body separately.
     *
     * @param email    the email
     * @param language the language to process the text with
     * @return the word counts; empty when the email has no body
     */
    private static HashMap<String, Integer> toBodyBagOfWords(Email email, String language) {
        if (email.getBody() == null) {
            return new HashMap<String, Integer>();
        }
        // The language is set before cleaning, as in the original ordering.
        Text.setLanguage(language);
        return countWords(cleanBody(email), "B_");
    }

    /**
     * Manual smoke run: configures the text tools and the model directory with
     * relative paths, then computes both weights for a sample email. Model
     * files are created and updated under {@code models/}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Text.setStopWordsDirectory("stopwords/");
        TextCleaner.CLEANING_REGEX = new File("TextCleaner.regex");
        DynamicFeature.modelDirectory = "models/";
        DynamicFeature.getTextModel("abcdef");

        List<String> alternativeEmails = Arrays.asList("zsellami@linagora.com", "zied.sellami@linagora.com");
        Email email = new Email();
        email.setSubject("batman est revenu de la rue de bastia avec paris");
        email.setBody("Bonsoir, merci de voir le mail.");
        email.setUser("Zied Sellami");
        email.setAlternativeAddress(alternativeEmails);
        try {
            email.setFrom(new InternetAddress("jplorre@lingaora.com", "Jean-Pierre Lorré"));
        } catch (Exception e) {
            e.printStackTrace();
        }

        DynamicFeature.getSenderWeight(email);
        DynamicFeature.getEmailContentWeight(email);
    }
}
......@@ -6,6 +6,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;