Commit d63367ac authored by Zied SELLAMI
Browse files

Integrating DynamicFeature

parent 4c333593
......@@ -4,3 +4,5 @@ TMP_DIRECTORY = /home/zsellami/tmp
NOTIFICATION_CHANNEL = /home/zsellami/dev/git/priority-inbox/priorityInbox/channels/
JOB_POSITION_RULE = /home/zsellami/dev/git/priority-inbox/priorityInbox/jobpositions/
TEXT_CLEANER_RULE = /home/zsellami/dev/git/priority-inbox/priorityInbox/TextCleaner.regex
MODEL_DIRECTORY = /home/zsellami/dev/git/priority-inbox/priorityInbox/models/
STOP_WORDS_DIRECTORY = /home/zsellami/dev/git/priority-inbox/priorityInbox/stopwords/
\ No newline at end of file
......@@ -4,3 +4,5 @@ TMP_DIRECTORY = /priority-inbox/priorityInbox/tmp
NOTIFICATION_CHANNEL = /priority-inbox/priorityInbox/channels/
JOB_POSITION_RULE = /priority-inbox/priorityInbox/jobpositions/
TEXT_CLEANER_RULE = /priority-inbox/priorityInbox/TextCleaner.regex
MODEL_DIRECTORY = /priority-inbox/priorityInbox/models/
STOP_WORDS_DIRECTORY = /priority-inbox/priorityInbox/stopwords/
\ No newline at end of file
......@@ -105,7 +105,20 @@
<artifactId>tika-langdetect</artifactId>
<version>1.20</version>
</dependency>
</dependencies>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
</dependency>
</dependencies>
<build>
<plugins>
......
......@@ -58,6 +58,14 @@ public class Configuration {
/**
 * Looks up the {@code TMP_DIRECTORY} entry of the loaded configuration parameters.
 *
 * @return the value stored under the key {@code "TMP_DIRECTORY"}; presumably a
 *         directory path such as the ones in the sample property files. What is
 *         returned for a missing key depends on the type of {@code parameters},
 *         which is not visible here (TODO confirm).
 */
public String getTMPDirectory() {
return parameters.get("TMP_DIRECTORY");
}
/**
 * Looks up the {@code MODEL_DIRECTORY} entry of the loaded configuration parameters.
 *
 * @return the value stored under the key {@code "MODEL_DIRECTORY"}; presumably the
 *         directory holding the saved models. What is returned for a missing key
 *         depends on the type of {@code parameters}, which is not visible here
 *         (TODO confirm).
 */
public String getModelDirectory() {
return parameters.get("MODEL_DIRECTORY");
}
/**
 * Looks up the {@code STOP_WORDS_DIRECTORY} entry of the loaded configuration parameters.
 *
 * @return the value stored under the key {@code "STOP_WORDS_DIRECTORY"}; presumably
 *         the directory holding the stop-word lists. What is returned for a missing
 *         key depends on the type of {@code parameters}, which is not visible here
 *         (TODO confirm).
 */
public String getStopWordsDirectory() {
return parameters.get("STOP_WORDS_DIRECTORY");
}
@Override
public String toString() {
......
......@@ -22,8 +22,10 @@ import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
import org.glassfish.jersey.media.multipart.FormDataParam;
import org.linagora.priorityInbox.data.Email;
import org.linagora.priorityInbox.feature.DynamicFeature;
import org.linagora.priorityInbox.feature.FeatureExtractor;
import org.linagora.priorityInbox.feature.LabeledDocument;
import org.linagora.priorityInbox.text.Text;
import org.linagora.priorityInbox.text.TextCleaner;
import com.fasterxml.jackson.databind.ObjectMapper;
......@@ -41,6 +43,8 @@ public class WebService {
FeatureExtractor.JOB_POSITION_RULES = new File(config.getJobPositionRules());
FeatureExtractor.NOTIFICATION_CHANNELS = new File(config.getNotificationChannels());
TextCleaner.CLEANING_REGEX = new File(config.getTextCleanerRegex());
DynamicFeature.modelDirectory = config.getModelDirectory();
Text.setStopWordsDirectory(config.getStopWordsDirectory());
}
// https://stackoverflow.com/questions/30653012/multipart-form-data-no-injection-source-found-for-a-parameter-of-type-public-ja?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
......
package org.linagora.priorityInbox.api;
import java.net.URI;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.ws.rs.core.UriBuilder;
......@@ -8,6 +11,7 @@ import org.glassfish.grizzly.http.server.HttpServer;
import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory;
import org.glassfish.jersey.media.multipart.MultiPartFeature;
import org.glassfish.jersey.server.ResourceConfig;
import org.linagora.priorityInbox.feature.DynamicFeature;
public class WebServiceMain {
......@@ -17,8 +21,26 @@ public class WebServiceMain {
}
public static void main(String[] args) {
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
Runnable periodicSaving = new Runnable() {
public void run() {
DynamicFeature.saveModels();
}
};
if(args.length ==1) {
Thread app = new Thread() {
Runtime.getRuntime().addShutdownHook(new Thread() {
public void run() {
DynamicFeature.saveModels();
System.out.println("Program stopped. Models saved before stopping the service.");
executor.shutdown();
}
});
executor.scheduleWithFixedDelay(periodicSaving, 300, 600, TimeUnit.SECONDS);
Thread app = new Thread() {
public void run() {
Configuration config = new Configuration(args[0]);
......@@ -59,6 +81,7 @@ public class WebServiceMain {
System.out.println("Please set config argument. Usage: WebServiceMain configFilePath");
System.exit(0);
}
}
......
......@@ -37,6 +37,8 @@ public class FeatureExtractor {
private static String QUESTION_REGEX = "(?im)^(who|what|when|where|why|how|is|can|does|do|which|could|would|should|did|shall|are|has|have|will|at).+\\?$|^\\w+\\-(ce|vous|tu|nous|t\\-(on|il|elle)).+\\?$|^(qui|(à|avec|de) qui|(à|avec|de) quoi|où|comment|pourquoi|combien) .+\\?$|^(quelles|quels|quelle|quel) .+\\?$|^est ce (que|qu').+\\?$";
private static String EMAIL_FROM_SUBSCRIPTION = "(?im)(se désinscrire|se désabonner|unsubscribe)\\s?$";
private static String[] FWD_REGEX_LIST = {
"(-+ *\\w+ (d'origine|original|transf(é|é|.{0,5})r(é|é|.{0,5})) *-+|-+ *(forwarded|original) \\w+ *-+)" };
......@@ -76,6 +78,8 @@ public class FeatureExtractor {
private static Pattern QUESTION_PATTERN = Pattern.compile(QUESTION_REGEX);
private static Pattern EMAIL_FROM_SUBSCRIPTION_PATTERN = Pattern.compile(EMAIL_FROM_SUBSCRIPTION);
private static Pattern PUNCTUATION_PATTERN = Pattern.compile("(…|\\.\\.\\.|\\.|\\?|:|\\!|;)");
private static Pattern BLANK_LINE_PATTERN = Pattern.compile("(?im)\\s*");
......@@ -132,8 +136,19 @@ public class FeatureExtractor {
boolean containsMeetingInvitation = containsMeetingInvitation(email);
boolean isANotification = isANotification(email);
double senderScore = getSenderScore(email) ;//+ DynamicFeature.getSenderWeight(email);
// double contentWeight = DynamicFeature.getEmailContentWeight(email);
PositionWeight positionWeight = getSenderScore(email);
double senderScore = 0d;
String jobPosition = null;
if(positionWeight != null) {
senderScore = positionWeight.getWeight();
jobPosition = positionWeight.getName();
}//else {
double tfidfSenderScore = DynamicFeature.getSenderWeight(email);
senderScore = Math.max(senderScore, tfidfSenderScore);
//}
double contentWeight = DynamicFeature.getEmailContentWeight(email);
boolean existReplyText = existReplyText(email);
// System.out.println("Sender Score: " + senderScore);
// System.out.println("Content Weight: " + contentWeight);
......@@ -173,7 +188,11 @@ public class FeatureExtractor {
document.setExistReplyText(existReplyText);
document.setContainsQuestions(containsQuestions);
document.setNumberOfQuestions(numberOfQuestions);
// document.setContentWeight(contentWeight);
if(jobPosition != null) {
document.setJobPosition(jobPosition);
}
document.setContentWeight(contentWeight);
// document.setSubjectWords(subjectWords);
// document.setBodyWords(bodyWords);
......@@ -244,7 +263,16 @@ public class FeatureExtractor {
boolean containsMeetingInvitation = containsMeetingInvitation(email);
boolean isANotification = isANotification(email);
double senderScore = getSenderScore(email);// + DynamicFeature.getSenderWeight(email);
PositionWeight positionWeight = getSenderScore(email);
double senderScore = 0d;
String jobPosition = null;
if(positionWeight != null) {
senderScore = positionWeight.getWeight();
jobPosition = positionWeight.getName();
}else {
senderScore = DynamicFeature.getSenderWeight(email);
}
//double contentWeight = DynamicFeature.getEmailContentWeight(email);
boolean existReplyText = existReplyText(email);
......@@ -286,6 +314,9 @@ public class FeatureExtractor {
// document.setBodyWords(bodyWords);
document.setContainsQuestions(containsQuestions);
document.setNumberOfQuestions(numberOfQuestions);
if(jobPosition != null) {
document.setJobPosition(jobPosition);
}
// System.out.println(email.getMessageId()+ " - " + email.getSubject() + " - " + emailFolderName);
// System.out.println("isSent: " + isSent);
......@@ -626,6 +657,8 @@ public class FeatureExtractor {
}
public static boolean isANotification(Email email) {
boolean isANotification = false;
boolean existUnsubscription = false;
if (email.getFrom() == null) {
return false;
}
......@@ -648,8 +681,17 @@ public class FeatureExtractor {
}
}
return email.getFrom().getAddress().matches(NOTIFICATION_REGEX);
isANotification = email.getFrom().getAddress().matches(NOTIFICATION_REGEX);
if(!isANotification) {
if(email.getBody() != null) {
String content = email.getBody();
existUnsubscription = EMAIL_FROM_SUBSCRIPTION_PATTERN.matcher(content).find();
}
}
return isANotification || existUnsubscription;
}
......@@ -678,7 +720,7 @@ public class FeatureExtractor {
return false;
}
public static double getSenderScore(Email email) {
public static PositionWeight getSenderScore(Email email) {
String signature = "";
double rankScore = 0d;
......@@ -690,12 +732,20 @@ public class FeatureExtractor {
}
for (PositionWeight rank : positionRanks.get("all.txt")) {
boolean rankFind = rank.getRegex().matcher(signature).find();
if (rankFind) {
rankScore = rank.getWeight();
break;
Matcher match = rank.getRegex().matcher(signature);
while(match.find()) {
// String text = match.group();
// System.out.println("Matched name: " + rank.getName());
return rank;
//break;
}
// if (rankFind) {
// rankScore = rank.getWeight();
// break;
// }
}
return rankScore;
//return rankScore;
return null;
}
public static boolean existReplyText(Email email) {
......
package org.linagora.priorityInbox.feature;
import java.util.List;
import java.util.Random;
import org.linagora.priorityInbox.data.Email;
......@@ -42,6 +43,8 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
private boolean isSameOrganisation;
private boolean existReplyText;
private String jobPosition = null;
public static double IMPORTANT_SCORE_THRESHOLD = 10d;
......@@ -195,6 +198,7 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
if(documentType == null) {
if(isAReply || isAForward || emailTriggeringAReply || isImportant || (!isANotification && this.getScore() >= IMPORTANT_SCORE_THRESHOLD)) {documentType = DocumentType.IMPORTANT;}
else if((isANotification && (isAReply || isAForward) && !isSpam && !isImportant) || (!isANotification && containsQuestions)) {documentType = DocumentType.TO_READ;}
else if(isANotification || isSpam) {documentType = DocumentType.NOTIFICATION;}
......@@ -222,14 +226,14 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
if(isToRecipient) {score = score + (1d/(double)numberOfRecipient);} // rule 9
if(isCcRecipient) {score = score + (0.8d/(double)numberOfRecipient);} // rule 10
if(isBccRecipient) {score = score + (0.5d/(double)numberOfRecipient);} // rule 11
if(isBccRecipient) {score = score + (0.6d/(double)numberOfRecipient);} // rule 11
if(emailTriggeringAReply) {score = score + 1d;} // rule 12
if(!isSameOrganisation) {score = score + 1d;} // rule 13
if(!isToGroup) {score = score + 1d;} // rule 14
score = score + senderScore; // rule 15
//score = score + contentWeight; // rule 16
score = score + contentWeight; // rule 16
documentScore = new Double(score);
}
......@@ -319,6 +323,66 @@ public class LabeledDocument implements Comparable<LabeledDocument>{
public void setNumberOfQuestions(int numberOfQuestions) {
this.numberOfQuestions = numberOfQuestions;
}
/**
 * Stores the job-position label attached to this document.
 *
 * @param jobPosition the label to store; no validation is performed, so
 *                    {@code null} is stored as-is
 */
public void setJobPosition(String jobPosition) {
this.jobPosition = jobPosition;
}
/**
 * Returns the job-position label attached to this document.
 *
 * @return the stored label, or {@code null} when none has been set (the field
 *         is initialised to {@code null})
 */
public String getJobPosition() {
return this.jobPosition;
}
/**
 * Renders the boolean features of this document as text.
 *
 * <p>For every flag that is {@code true}, the flag's name is appended preceded by a
 * single space, in a fixed order. When a job position is set it is appended last,
 * also preceded by a space. A non-empty result therefore always starts with a space.
 *
 * @return the concatenated feature names, or an empty string when no flag is set
 *         and no job position is present
 */
public String getBooleanFeatureAsText() {
    final StringBuilder tokens = new StringBuilder();

    if (isSent) { tokens.append(" isSent"); }
    if (isRecieved) { tokens.append(" isRecieved"); }
    if (isAReply) { tokens.append(" isAReply"); }
    if (isToRecipient) { tokens.append(" isToRecipient"); }
    if (isCcRecipient) { tokens.append(" isCcRecipient"); }
    if (isBccRecipient) { tokens.append(" isBccRecipient"); }
    if (isAForward) { tokens.append(" isAForward"); }
    if (isImportant) { tokens.append(" isImportant"); }
    if (isSpam) { tokens.append(" isSpam"); }
    if (containsDocAttachments) { tokens.append(" containsDocAttachments"); }
    if (containsMeetingInvitation) { tokens.append(" containsMeetingInvitation"); }
    if (isANotification) { tokens.append(" isANotification"); }
    if (emailTriggeringAReply) { tokens.append(" emailTriggeringAReply"); }
    if (containsQuestions) { tokens.append(" containsQuestions"); }
    if (isSameOrganisation) { tokens.append(" isSameOrganisation"); }
    if (isToGroup) { tokens.append(" isToGroup"); }
    if (existReplyText) { tokens.append(" existReplyText"); }

    // The job position is free text rather than a flag name; it goes last.
    if (jobPosition != null) {
        tokens.append(" ").append(jobPosition);
    }
    return tokens.toString();
}
/**
 * Renders the boolean features of this document as a sequence of numeric tokens.
 *
 * <p>Each flag contributes {@code " 1.0"} when {@code true} and {@code " 0.0"} when
 * {@code false}, so the result always holds 17 tokens and starts with a space.
 * The flags are emitted in the same order as the names produced by
 * {@code getBooleanFeatureAsText()}; the job position is not encoded.
 *
 * @param startIndex not used by this method
 * @return the space-prefixed {@code 1.0}/{@code 0.0} tokens, one per flag
 */
public String getFeatureAsBinaryData(int startIndex) {
    final boolean[] flags = {
        isSent,
        isRecieved,
        isAReply,
        isToRecipient,
        isCcRecipient,
        isBccRecipient,
        isAForward,
        isImportant,
        isSpam,
        containsDocAttachments,
        containsMeetingInvitation,
        isANotification,
        emailTriggeringAReply,
        containsQuestions,
        isSameOrganisation,
        isToGroup,
        existReplyText
    };

    final StringBuilder encoded = new StringBuilder();
    for (boolean flag : flags) {
        encoded.append(flag ? " 1.0" : " 0.0");
    }
    return encoded.toString();
}
......
......@@ -12,6 +12,7 @@ public class PositionWeight {
private Pattern regex = null;
private double weight = 0;
private String name = null;
private static HashMap<String, List<PositionWeight>> positionWeights = new HashMap<String, List<PositionWeight>>();
public static double maxWeight = LabeledDocument.IMPORTANT_SCORE_THRESHOLD * 2d;
......@@ -40,6 +41,7 @@ public class PositionWeight {
PositionWeight positionRank = new PositionWeight();
positionRank.setWeight(maxWeight - allStep);
allStep = allStep + step;
positionRank.setName(elements.get(0).replace("\\Q","").replace("\\E","").replace(" ", "_").trim());
positionRank.setRegex(Pattern.compile(String.join("|", elements), Pattern.CASE_INSENSITIVE));
size --;
ranks.add(positionRank);
......@@ -60,6 +62,12 @@ public class PositionWeight {
this.regex = regex;
this.weight = rank;
}
/**
 * Creates a named position rule.
 *
 * @param name  label of the position
 * @param regex pattern used to detect the position
 * @param rank  weight of the position; widened from {@code int} to the
 *              {@code double} weight field
 */
public PositionWeight(String name, Pattern regex, int rank) {
    this.weight = (double) rank;
    this.regex = regex;
    this.name = name;
}
public Pattern getRegex() {
......@@ -77,5 +85,13 @@ public class PositionWeight {
/**
 * Replaces the weight of this position.
 *
 * @param rank the new weight; stored without validation
 */
public void setWeight(double rank) {
this.weight = rank;
}
/**
 * Returns the label of this position.
 *
 * @return the stored name, or {@code null} when none has been set (the field
 *         is initialised to {@code null})
 */
public String getName() {
return name;
}
/**
 * Replaces the label of this position.
 *
 * @param name the new label; stored without validation
 */
public void setName(String name) {
this.name = name;
}
}
package org.linagora.priorityInbox.feature;
public class TextDataModel {
public class TextDataModel implements Comparable<TextDataModel>{
private String word = null;
private int index = -1;
private int frequency = -1;
private int totalDocumentSize = -1;
private int documentContainingWord = -1;
......@@ -17,6 +18,14 @@ public class TextDataModel {
this.totalDocumentSize = totalDocumentSize;
this.documentContainingWord = documentContainingWord;
}
/**
 * Creates a fully populated entry, including its index.
 *
 * @param index                 position of the entry; used by {@code compareTo}
 *                              for ordering and written first by {@code toStringData}
 * @param word                  the word this entry describes
 * @param frequency             frequency recorded for the word
 * @param totalDocumentSize     total-document-size value recorded for the word
 * @param documentContainingWord document count recorded for the word
 */
public TextDataModel(int index, String word, int frequency, int totalDocumentSize, int documentContainingWord) {
    // Assign the field directly. The public setIndex() is overridable, and calling
    // it here would run subclass code before the subclass has been initialised.
    this.index = index;
    this.word = word;
    this.frequency = frequency;
    this.totalDocumentSize = totalDocumentSize;
    this.documentContainingWord = documentContainingWord;
}
public String getWord() {
return word;
......@@ -43,10 +52,24 @@ public class TextDataModel {
this.documentContainingWord = documentContainingWord;
}
public String toStringData() {
return word + "\t" + frequency + "\t" + totalDocumentSize + "\t" + documentContainingWord + "\n";
/**
 * Returns the index of this entry.
 *
 * @return the stored index; {@code -1} is the field's initial value
 */
public int getIndex() {
return index;
}
/**
 * Replaces the index of this entry.
 *
 * @param index the new index; stored without validation
 */
public void setIndex(int index) {
this.index = index;
}
/**
 * Serialises this entry as one tab-separated line.
 *
 * <p>Column order: index, word, frequency, total document size, number of
 * documents containing the word. The line is terminated by {@code "\n"}.
 *
 * @return the tab-separated record, including the trailing newline
 */
public String toStringData() {
    final StringBuilder row = new StringBuilder();
    row.append(index).append("\t");
    row.append(word).append("\t");
    row.append(frequency).append("\t");
    row.append(totalDocumentSize).append("\t");
    row.append(documentContainingWord).append("\n");
    return row.toString();
}
/**
 * Orders entries by ascending index.
 *
 * @param o the entry to compare with
 * @return {@code -1} when this index is smaller, {@code 0} when both indexes are
 *         equal, {@code 1} when this index is larger
 */
@Override
public int compareTo(TextDataModel o) {
    final int mine = this.index;
    final int theirs = o.index;
    return mine < theirs ? -1 : (mine == theirs ? 0 : 1);
}
}
......@@ -24,6 +24,7 @@ public class Text {
private static HashMap<String, Pattern> stopWordsRegexMap = new HashMap<String, Pattern>();
private static List<Pattern> regexStopWords = new ArrayList<Pattern>();
private static String stopWordsDirectory = null;
private static final String SENTENCE_BORDER = "SENTENCEBORDERSTOP";
public static String language = null;
private static LanguageDetector languageDetector = new OptimaizeLangDetector().loadModels();
......@@ -31,7 +32,10 @@ public class Text {
private static HashMap<String, Pattern> Combined_Patterns = new HashMap<String, Pattern>();
private static Pattern Number_Pattern = Pattern.compile("(" + String.join("|", "0123456789٠١٢٣٤٥٦٧٨٩〇零一壹二贰貳三叁叄四肆五伍六陸七柒八捌九玖十拾०१२३४५६७८९१०אבגדהוזחטי零一二三四五六七八九十๐๑๒๓๔๕๖๗๘๙๑๐".split("")) + ")");
private static Pattern Blank_Pattern = Pattern.compile("\\s+");
private static Pattern Space_Pattern = Pattern.compile("(\u00a0|\u202f|\u2007)");
private static Pattern Space_Pattern = Pattern.compile("(\u00a0|\u202f|\u2007|\t|\u200b)");
private static Pattern Sentence_Pattern_Splitter = Pattern.compile("([\\.\\?\\!\\\"]((?<=[a-zéèếùûàçî0-9][:\\.\\?\\!])|(?<=[a-zéèếùûàçî0-9][:\\.\\?\\!]\\\"))(\\s|\\r\\n|\\n){1,}(?=[\\\"\\(\\{\\[]?[A-Z]))|([\\(\\[\\{\"\\)\\}\\]][:\\.\\?\\!\\\"\\r\\n]{1,})|([:\\.\\;\\?\\!] ?[\\r\\n]+)");
private static Pattern Sentence_Border_Pattern = Pattern.compile(SENTENCE_BORDER.toLowerCase());
public static boolean removeSpecificPunctuation = true;
public static boolean removeRegexStopWords = true;
public static boolean removeStopWords = true;
......@@ -40,6 +44,14 @@ public class Text {
public static boolean stemm = true;
public static boolean removeAccent = true;
/**
 * Cuts a text into sentences.
 *
 * <p>The text is split on every match of {@code Sentence_Pattern_Splitter}; the
 * matched delimiter text is consumed by the split and does not appear in the
 * result. Blank fragments are dropped and the remaining ones are trimmed.
 *
 * @param text the text to split
 * @return the non-blank, trimmed fragments in their original order
 */
public static List<String> toSentences(String text){
    final String[] fragments = Sentence_Pattern_Splitter.split(text, 0);
    final List<String> result = new ArrayList<String>();
    for (int i = 0; i < fragments.length; i++) {
        final String fragment = fragments[i];
        if (StringUtils.isBlank(fragment)) {
            continue;
        }
        result.add(fragment.trim());
    }
    return result;
}
/**
 * Sets the class-wide stop-words directory.
 *
 * <p>The value is kept in a static field, so it applies to every use of this class.
 *
 * @param directory the directory to remember; stored without validation
 */
public static void setStopWordsDirectory(String directory) {
stopWordsDirectory = directory;
}
......@@ -113,6 +125,8 @@ public class Text {
String processedText = text;
processedText = Space_Pattern.matcher(processedText).replaceAll(" ");
//processedText = String.join(" " + SENTENCE_BORDER + "\n", toSentences(processedText));
List<Pattern> stopWords = new ArrayList<Pattern>();
List<Pattern> allStopWords = new ArrayList<Pattern>();
......@@ -194,7 +208,7 @@ public class Text {
private static String removeSingleChar(String text) {
StringBuffer processedText = new StringBuffer();
for(String word: Blank_Pattern.split(text, 0)) {
if(word.length() > 1) {
if(word.length() > 4) {
processedText.append(word + " ");
}
}
......@@ -296,16 +310,49 @@ public class Text {
//statistic
public static List<String> toNGrams(int n, String str) {
List<String> ngrams = new ArrayList<String>();
String[] words = Blank_Pattern.split(str, 0);
for (int i = 0; i < words.length - n + 1; i++)
ngrams.add(concat(words, i, i+n));
String [] sentences = Sentence_Border_Pattern.split(str,0);
for(String sentence: sentences) {
sentence = sentence.trim();
String[] words = Blank_Pattern.split(sentence, 0);
if(words.length < n) {
int diff = n - words.length;
String[] newWords = new String[n];
for(int i= 0; i< n; i++) {
if(i < diff) {
newWords[i] = "<empty>";
}else {
newWords[i] = words[i-diff];
}
}
words = newWords;
}
for (int i = 0; i < words.length - n + 1; i++) {
ngrams.add(concat(words, i, i+n));
}
}
return ngrams;
}
public static List<String> toNGramsALL(int n, String str){
List<String> ngrams = new ArrayList<String>();
for(int i=1; i<=n; i++) {
ngrams.addAll(toNGrams(i, str));
List<String> ngramsToAdd = toNGrams(i, str);
List<String> correctNGram = new ArrayList<String>();
for(String ngram:ngramsToAdd) {
int size = ngram.split("_").length;
if(size < n) {
int diff = n - size;
for(int j=0; j< diff;j++) {
ngram = "<empty>_" + ngram;
}
}
correctNGram.add(ngram);
}
ngrams.addAll(correctNGram);
}
return ngrams;