Commit d63367ac authored by Zied SELLAMI
Browse files

Integrating DynamicFeature

parent 4c333593
......@@ -4,3 +4,5 @@ TMP_DIRECTORY = /home/zsellami/tmp
NOTIFICATION_CHANNEL = /home/zsellami/dev/git/priority-inbox/priorityInbox/channels/
JOB_POSITION_RULE = /home/zsellami/dev/git/priority-inbox/priorityInbox/jobpositions/
TEXT_CLEANER_RULE = /home/zsellami/dev/git/priority-inbox/priorityInbox/TextCleaner.regex
MODEL_DIRECTORY = /home/zsellami/dev/git/priority-inbox/priorityInbox/models/
STOP_WORDS_DIRECTORY = /home/zsellami/dev/git/priority-inbox/priorityInbox/stopwords/
\ No newline at end of file
......@@ -4,3 +4,5 @@ TMP_DIRECTORY = /priority-inbox/priorityInbox/tmp
NOTIFICATION_CHANNEL = /priority-inbox/priorityInbox/channels/
JOB_POSITION_RULE = /priority-inbox/priorityInbox/jobpositions/
TEXT_CLEANER_RULE = /priority-inbox/priorityInbox/TextCleaner.regex
MODEL_DIRECTORY = /priority-inbox/priorityInbox/models/
STOP_WORDS_DIRECTORY = /priority-inbox/priorityInbox/stopwords/
\ No newline at end of file
......@@ -105,7 +105,20 @@
<artifactId>tika-langdetect</artifactId>
<version>1.20</version>
</dependency>
</dependencies>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
</dependency>
</dependencies>
<build>
<plugins>
......
......@@ -58,6 +58,14 @@ public class Configuration {
/**
 * Looks up the {@code TMP_DIRECTORY} entry of this configuration.
 *
 * @return whatever {@code parameters} holds under the key {@code TMP_DIRECTORY}
 */
public String getTMPDirectory() {
    final String tmpDirectory = parameters.get("TMP_DIRECTORY");
    return tmpDirectory;
}
/**
 * Looks up the {@code MODEL_DIRECTORY} entry of this configuration.
 *
 * @return whatever {@code parameters} holds under the key {@code MODEL_DIRECTORY}
 */
public String getModelDirectory() {
    final String modelDirectory = parameters.get("MODEL_DIRECTORY");
    return modelDirectory;
}
/**
 * Looks up the {@code STOP_WORDS_DIRECTORY} entry of this configuration.
 *
 * @return whatever {@code parameters} holds under the key {@code STOP_WORDS_DIRECTORY}
 */
public String getStopWordsDirectory() {
    final String stopWordsDirectory = parameters.get("STOP_WORDS_DIRECTORY");
    return stopWordsDirectory;
}
@Override
public String toString() {
......
......@@ -22,8 +22,10 @@ import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
import org.glassfish.jersey.media.multipart.FormDataParam;
import org.linagora.priorityInbox.data.Email;
import org.linagora.priorityInbox.feature.DynamicFeature;
import org.linagora.priorityInbox.feature.FeatureExtractor;
import org.linagora.priorityInbox.feature.LabeledDocument;
import org.linagora.priorityInbox.text.Text;
import org.linagora.priorityInbox.text.TextCleaner;
import com.fasterxml.jackson.databind.ObjectMapper;
......@@ -41,6 +43,8 @@ public class WebService {
FeatureExtractor.JOB_POSITION_RULES = new File(config.getJobPositionRules());
FeatureExtractor.NOTIFICATION_CHANNELS = new File(config.getNotificationChannels());
TextCleaner.CLEANING_REGEX = new File(config.getTextCleanerRegex());
DynamicFeature.modelDirectory = config.getModelDirectory();
Text.setStopWordsDirectory(config.getStopWordsDirectory());
}
// https://stackoverflow.com/questions/30653012/multipart-form-data-no-injection-source-found-for-a-parameter-of-type-public-ja?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
......
package org.linagora.priorityInbox.api;
import java.net.URI;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.ws.rs.core.UriBuilder;
......@@ -8,6 +11,7 @@ import org.glassfish.grizzly.http.server.HttpServer;
import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory;
import org.glassfish.jersey.media.multipart.MultiPartFeature;
import org.glassfish.jersey.server.ResourceConfig;
import org.linagora.priorityInbox.feature.DynamicFeature;
public class WebServiceMain {
......@@ -17,8 +21,26 @@ public class WebServiceMain {
}
public static void main(String[] args) {
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
Runnable periodicSaving = new Runnable() {
public void run() {
DynamicFeature.saveModels();
}
};
if(args.length ==1) {
Thread app = new Thread() {
Runtime.getRuntime().addShutdownHook(new Thread() {
public void run() {
DynamicFeature.saveModels();
System.out.println("Program stopped. Models saved before stopping the service.");
executor.shutdown();
}
});
executor.scheduleWithFixedDelay(periodicSaving, 300, 600, TimeUnit.SECONDS);
Thread app = new Thread() {
public void run() {
Configuration config = new Configuration(args[0]);
......@@ -59,6 +81,7 @@ public class WebServiceMain {
System.out.println("Please set config argument. Usage: WebServiceMain configFilePath");
System.exit(0);
}
}
......
package org.linagora.priorityInbox.feature;
import java.io.File;
import java.math.BigInteger;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import javax.mail.internet.InternetAddress;
import org.apache.commons.io.FileUtils;
import org.linagora.priorityInbox.data.Email;
import org.linagora.priorityInbox.text.Text;
import org.linagora.priorityInbox.text.TextCleaner;
public class DynamicFeature {
public static String modelDirectory = null;
private static File getSenderModel(String userId) {
File senderModel = null;
try {
Path userModelPath = Paths.get(modelDirectory, userId, "/");
Path senderModelPath = Paths.get(userModelPath.toString(), "senderModel");
File directory = userModelPath.toFile();
if (!directory.exists()) {
directory.mkdirs();
FileUtils.writeStringToFile(senderModelPath.toFile(), "");
}else if(!senderModelPath.toFile().exists()) {
FileUtils.writeStringToFile(senderModelPath.toFile(), "");
// In-memory model maps, keyed by user id. WORDS_DATA is filled by getWordsData and
// getEmailContentWeight, SENDER_DATA by getSendersData and getSenderWeight; saveModels()
// writes every entry of both maps to files under modelDirectory.
private static HashMap<String, HashMap<String, TextDataModel>> WORDS_DATA = new HashMap<String, HashMap<String, TextDataModel>>();
private static HashMap<String, HashMap<String, TextDataModel>> SENDER_DATA = new HashMap<String, HashMap<String, TextDataModel>>();
/**
 * Returns the word model of a user, reading it from disk on first use.
 *
 * Looks the user up in WORDS_DATA; on a miss, reads modelDirectory/userId/textModel
 * through readModelFromFile and stores the result in WORDS_DATA unless it is null.
 *
 * @param userId identifier of the user (see toUserId)
 * @return the cached or freshly read map; null if readModelFromFile returned null
 */
private static synchronized HashMap<String, TextDataModel> getWordsData(String userId) {
HashMap<String, TextDataModel> wordsData = WORDS_DATA.get(userId);
if (wordsData == null) {
Path wordsDataPath = Paths.get(modelDirectory, userId, "textModel");
wordsData = readModelFromFile(wordsDataPath.toFile());
// A null result is deliberately not cached, so the next call tries the file again.
if (wordsData != null) {
WORDS_DATA.put(userId, wordsData);
}
}
// NOTE(review): the next three lines use senderModel / senderModelPath, which this method
// never declares, and close a try that this method never opens. They look like removed-side
// diff lines left over from the old getSenderModel -- confirm and drop them.
senderModel = senderModelPath.toFile();
}catch (Exception e) {
e.printStackTrace();
return wordsData;
}
/**
 * Returns the sender model of a user, reading it from disk on first use.
 *
 * Looks the user up in SENDER_DATA; on a miss, reads modelDirectory/userId/senderModel
 * through readModelFromFile and stores the result in SENDER_DATA unless it is null.
 *
 * @param userId identifier of the user (see toUserId)
 * @return the cached or freshly read map; null if readModelFromFile returned null
 */
private static synchronized HashMap<String, TextDataModel> getSendersData(String userId) {
// The locals keep the "wordsData" naming of getWordsData but hold sender data here.
HashMap<String, TextDataModel> wordsData = SENDER_DATA.get(userId);
if (wordsData == null) {
Path wordsDataPath = Paths.get(modelDirectory, userId, "senderModel");
wordsData = readModelFromFile(wordsDataPath.toFile());
// A null result is deliberately not cached, so the next call tries the file again.
if (wordsData != null) {
SENDER_DATA.put(userId, wordsData);
}
}
// NOTE(review): senderModel is not declared in this method; this line looks like a
// removed-side diff line from the old getSenderModel -- confirm and drop it.
return senderModel;
return wordsData;
}
private static File getTextModel(String userId) {
File textModel = null;
try {
Path userModelPath = Paths.get(modelDirectory, userId, "/");
Path textModelPath = Paths.get(userModelPath.toString(), "textModel");
File directory = userModelPath.toFile();
if (!directory.exists()) {
directory.mkdirs();
FileUtils.writeStringToFile(textModelPath.toFile(), "");
}else if(!textModelPath.toFile().exists()) {
FileUtils.writeStringToFile(textModelPath.toFile(), "");
/**
 * Writes every in-memory model to disk and prints the size of the model directory.
 *
 * For each user in WORDS_DATA, concatenates toStringData() of all entries and writes the
 * result as UTF-8 to modelDirectory/userId/textModel; then does the same for SENDER_DATA
 * into modelDirectory/userId/senderModel. A failed write is reported with
 * printStackTrace() and the loop moves on to the next user.
 */
public static synchronized void saveModels() {
// NOTE(review): start (and end, further down) are assigned but never read in this method.
long start = System.currentTimeMillis();
// Saving words data
for (Entry<String, HashMap<String, TextDataModel>> entry : WORDS_DATA.entrySet()) {
String userId = entry.getKey();
StringBuffer modelBuffer = new StringBuffer();
for (TextDataModel textDataModel : entry.getValue().values()) {
modelBuffer.append(textDataModel.toStringData());
}
// NOTE(review): textModel / textModelPath are not declared in this method and the catch
// below has no matching try here. These three lines look like removed-side diff lines from
// the old getTextModel -- confirm and drop them.
textModel = textModelPath.toFile();
}catch (Exception e) {
e.printStackTrace();
try {
FileUtils.write(Paths.get(modelDirectory, userId, "textModel").toFile(),
modelBuffer.toString(), Charset.forName("UTF-8"));
} catch (Exception e) {
e.printStackTrace();
}
}
// Saving sender data
for (Entry<String, HashMap<String, TextDataModel>> entry : SENDER_DATA.entrySet()) {
String userId = entry.getKey();
StringBuffer modelBuffer = new StringBuffer();
for (TextDataModel textDataModel : entry.getValue().values()) {
modelBuffer.append(textDataModel.toStringData());
}
try {
FileUtils.write(Paths.get(modelDirectory, userId, "senderModel").toFile(),
modelBuffer.toString(), Charset.forName("UTF-8"));
} catch (Exception e) {
e.printStackTrace();
}
}
long end = System.currentTimeMillis();
// Report the on-disk size of modelDirectory in a human-readable unit.
String readableSize = "";
long size = 0;
try {
Path folder = Paths.get(modelDirectory);
size = FileUtils.sizeOfDirectory(folder.toFile());
// NOTE(review): unitIndex counts powers of 1000 (log10 / 3) but the divisor below is a
// power of 1024 (1 << 10*unitIndex), so the two disagree near unit boundaries.
// For size == 0, log10 yields -Infinity and units[unitIndex] throws; that exception lands
// in the catch below and produces the "NO MODEL BUILT YET" message.
int unitIndex = (int) (Math.log10(size) / 3);
// NOTE(review): "ZB" directly follows "TB"; PB and EB are missing -- confirm intent.
String[] units = new String[] {"B", "KB", "MB", "GB", "TB", "ZB"};
// NOTE(review): 1 is an int, and Java masks an int shift distance to its low 5 bits, so
// for unitIndex >= 4 (shift >= 40) the divisor is wrong; 1L << ... or Math.pow would not wrap.
double unitValue = 1 << (unitIndex * 10);
readableSize = new DecimalFormat("#,##0.#")
.format(size / unitValue) + " "
+ units[unitIndex];
System.out.println("SAVING MODELS IN " + modelDirectory + " - MODELS SIZE=" + readableSize);
}catch(Exception e) {
// Any failure while measuring or formatting the size is reported as "no model yet".
//e.printStackTrace();
System.err.println("SAVING MODELS... NO MODEL BUILT YET.");
}
// NOTE(review): this method is void and textModel is not declared here; the line looks
// like a removed-side diff line from the old getTextModel -- confirm and drop it.
return textModel;
}
/**
 * Builds the identifier under which a user's models are cached and stored.
 *
 * <p>The identifier is {@code user.hashCode() + "_" + sortedAddresses.hashCode()}. The
 * addresses are sorted first so that the same addresses give the same identifier
 * whatever order the caller supplies them in.
 *
 * <p>Sorting is done on a private copy: the caller's list is never reordered, and an
 * unmodifiable list is accepted instead of failing inside {@code Collections.sort}.
 *
 * @param user primary user name or address; must not be {@code null}
 * @param alternativeAddress the user's alternative addresses; {@code null} is treated
 *        as an empty list
 * @return the identifier, e.g. {@code "12345_-67890"}
 */
private static String toUserId(String user, List<String> alternativeAddress) {
    List<String> sortedAddresses = new ArrayList<String>();
    if (alternativeAddress != null) {
        sortedAddresses.addAll(alternativeAddress);
    }
    // Sort the copy, not the argument, so the Email object handed in stays untouched.
    Collections.sort(sortedAddresses);
    return user.hashCode() + "_" + sortedAddresses.hashCode();
}
/**
 * Computes a weight for the sender of an email and updates the user's sender model.
 *
 * Steps visible in this span: derive the user id with toUserId, fetch the user's sender map
 * with getSendersData (creating and registering an empty map in SENDER_DATA when it is
 * null), look up the sender address lower-cased, create a new TextDataModel for it or
 * increment its frequency and documentContainingWord, set totalDocumentSize of every entry
 * to maxDocumentSize + 1, and return log10(totalDocumentSize / documentContainingWord) of
 * the sender's entry (tf is fixed at 1).
 *
 * NOTE(review): several statements appear twice in slightly different formatting (for
 * example the two "sender != null" tests and the two "double idf" declarations). The span
 * looks like old and new diff lines interleaved -- confirm which variant is current.
 *
 * @param email the email whose sender is weighted
 * @return the computed weight, or 0 if any exception is caught
 */
public static double getSenderWeight(Email email) {
double senderWeight = 0d;
try {
HashMap<String, TextDataModel> models = new HashMap<String, TextDataModel>();
TextDataModel senderTextDataModel = null;
String userId = toUserId(email.getUser(), email.getAlternativeAddress());
// NOTE(review): file-based lookup (getSenderModel + readModelFromFile); the very next
// assignment overwrites models from the in-memory cache, so these two lines are
// presumably the removed variant.
File senderModel = getSenderModel(userId);
models = readModelFromFile(senderModel);
models = getSendersData(userId);// readModelFromFile(dataModelFile);
int maxDocumentSize = 0;
for(Entry<String, TextDataModel> entry: models.entrySet()) {
TextDataModel currentValue = entry.getValue();
maxDocumentSize = Math.max(maxDocumentSize, currentValue.getTotalDocumentSize());
break;
int currentIndex = 0;
if (models == null) {
// First email for this user: start an empty model and register it in the cache.
models = new HashMap<String, TextDataModel>();
SENDER_DATA.put(userId, models);
} else {
// The loop exits after its first iteration, so maxDocumentSize comes from one entry only.
// Every entry is given the same totalDocumentSize further down, which is presumably why
// one entry suffices -- confirm.
for (Entry<String, TextDataModel> entry : models.entrySet()) {
TextDataModel currentValue = entry.getValue();
maxDocumentSize = Math.max(maxDocumentSize, currentValue.getTotalDocumentSize());
break;
}
// Highest index already used in the model; new entries continue from there.
for (TextDataModel tdm : models.values()) {
currentIndex = Math.max(currentIndex, tdm.getIndex());
}
}
currentIndex = currentIndex + 1;
String sender = email.getFrom().getAddress();
if(sender != null) {
if (sender != null) {
TextDataModel currentValue = models.get(sender.toLowerCase());
if(currentValue == null) {
currentValue = new TextDataModel(sender.toLowerCase(), 1, 0, 1);
if (currentValue == null) {
currentValue = new TextDataModel(currentIndex, sender.toLowerCase(), 1, 0, 1);
currentIndex++;
// NOTE(review): the lookup above uses sender.toLowerCase() but this put uses sender as-is,
// so an address containing upper-case letters is stored under a key the lookup never finds.
models.put(sender, currentValue);
}else {
} else {
currentValue.setFrequency(currentValue.getFrequency() + 1);
currentValue.setDocumentContainingWord(currentValue.getDocumentContainingWord() + 1);
}
senderTextDataModel = currentValue;
}
StringBuffer modelBuffer = new StringBuffer();
for(Entry<String, TextDataModel> entry: models.entrySet()) {
for (Entry<String, TextDataModel> entry : models.entrySet()) {
TextDataModel currentValue = entry.getValue();
// One more document has been seen: bump the total on every entry.
currentValue.setTotalDocumentSize(maxDocumentSize + 1);
modelBuffer.append(entry.getValue().toStringData());
}
FileUtils.write(senderModel, modelBuffer.toString());
double tf = 1d;
// NOTE(review): when sender is null, senderTextDataModel is still null here; the resulting
// NullPointerException is caught below and the method returns 0.
double idf = Math.log10((double)senderTextDataModel.getTotalDocumentSize() / (double)senderTextDataModel.getDocumentContainingWord());
double idf = Math.log10((double) senderTextDataModel.getTotalDocumentSize()
/ (double) senderTextDataModel.getDocumentContainingWord());
double tfidf = tf * idf;
senderWeight = tfidf;
// System.out.println("Sender weight: " + tfidf);
}catch(Exception e) {
// System.out.println("Sender weight: " + tfidf);
} catch (Exception e) {
// Any failure is only printed; the caller receives the last value of senderWeight.
e.printStackTrace();
}
return senderWeight;
}
/**
 * Computes a content weight for an email and updates the user's word model.
 *
 * Steps visible in this span: derive the user id with toUserId, fetch the user's word map
 * with getWordsData (creating and registering an empty map in WORDS_DATA when it is null),
 * turn the email into a word -> count map with toBagOfWords, add each word to the model or
 * increment its frequency and documentContainingWord, set totalDocumentSize of every entry
 * to maxDocumentSize + 1, and sum tf * idf over the words of this email, where
 * tf = count of the word in the email / total word count of the email and
 * idf = log10(totalDocumentSize / documentContainingWord).
 *
 * NOTE(review): several statements appear twice in slightly different formatting. The span
 * looks like old and new diff lines interleaved -- confirm which variant is current.
 *
 * @param email the email whose subject/body content is weighted
 * @return the summed weight, or whatever had been accumulated when an exception was caught
 *         (0 if nothing had been accumulated yet)
 */
public static double getEmailContentWeight(Email email) {
double contentWeight = 0d;
try {
HashMap<String, TextDataModel> models = new HashMap<String, TextDataModel>();
// Entries of the model that correspond to words of this email, in iteration order.
List<TextDataModel> documentModel = new ArrayList<TextDataModel>();
String userId = toUserId(email.getUser(), email.getAlternativeAddress());
// NOTE(review): file-based lookup (getTextModel + readModelFromFile); models is reassigned
// from getWordsData a few lines below, so these two lines are presumably the removed variant.
File textModel = getTextModel(userId);
models = readModelFromFile(textModel);
int maxDocumentSize = 0;
for(Entry<String, TextDataModel> entry: models.entrySet()) {
TextDataModel currentValue = entry.getValue();
maxDocumentSize = Math.max(maxDocumentSize, currentValue.getTotalDocumentSize());
break;
int currentIndex = 0;
models = getWordsData(userId);// readModelFromFile(dataModelFile);
if (models == null) {
// First email for this user: start an empty model and register it in the cache.
models = new HashMap<String, TextDataModel>();
WORDS_DATA.put(userId, models);
} else {
// The loop exits after its first iteration, so maxDocumentSize comes from one entry only.
// Every entry is given the same totalDocumentSize further down, which is presumably why
// one entry suffices -- confirm.
for (Entry<String, TextDataModel> entry : models.entrySet()) {
TextDataModel currentValue = entry.getValue();
maxDocumentSize = Math.max(maxDocumentSize, currentValue.getTotalDocumentSize());
break;
}
// Highest index already used in the model; new entries continue from there.
for (TextDataModel tdm : models.values()) {
currentIndex = Math.max(currentIndex, tdm.getIndex());
}
}
currentIndex = currentIndex + 1;
// detectLanguage and toBagOfWords are defined outside this span; from their use here,
// toBagOfWords yields word -> occurrence count for this email.
String language = detectLanguage(email);
// HashMap<String, Integer> subjectWords = toSubjectBagOfWords(email, language);
// HashMap<String, Integer> bodyWords = toBodyBagOfWords(email, language);
HashMap<String, Integer> emailWords = toBagOfWords(email, language);
// int subjectWordFrequency = 0;
// int bodyWordFrequency = 0;
// Total number of word occurrences in this email (denominator of tf).
int emailWordFrequency = 0;
//update data model with new words
for(Entry<String, Integer> entry: emailWords.entrySet()) {
// update data model with new words
for (Entry<String, Integer> entry : emailWords.entrySet()) {
TextDataModel currentValue = models.get(entry.getKey());
emailWordFrequency = emailWordFrequency + entry.getValue();
if(currentValue == null) {
currentValue = new TextDataModel(entry.getKey(), entry.getValue(), 0, 1);
if (currentValue == null) {
currentValue = new TextDataModel(currentIndex, entry.getKey(), entry.getValue(), 0, 1);
currentIndex++;
models.put(entry.getKey(), currentValue);
}else {
} else {
currentValue.setFrequency(currentValue.getFrequency() + entry.getValue());
currentValue.setDocumentContainingWord(currentValue.getDocumentContainingWord() + 1);
}
documentModel.add(currentValue);
}
// //update data model for subjectWord
// for(Entry<String, Integer> entry: subjectWords.entrySet()) {
// TextDataModel currentValue = models.get(entry.getKey());
// subjectWordFrequency = subjectWordFrequency + entry.getValue();
// if(currentValue == null) {
// currentValue = new TextDataModel(entry.getKey(), entry.getValue(), 0, 1);
// models.put(entry.getKey(), currentValue);
// }else {
// currentValue.setFrequency(currentValue.getFrequency() + entry.getValue());
// currentValue.setDocumentContainingWord(currentValue.getDocumentContainingWord() + 1);
// }
// documentModel.add(currentValue);
//
// }
//
// //update data model for bodyWord
// for(Entry<String, Integer> entry: bodyWords.entrySet()) {
// TextDataModel currentValue = models.get(entry.getKey());
// bodyWordFrequency = bodyWordFrequency + entry.getValue();
// if(currentValue == null) {
//
// currentValue = new TextDataModel(entry.getKey(), entry.getValue(), 0, 1);
// models.put(entry.getKey(), currentValue);
// }else {
// currentValue.setFrequency(currentValue.getFrequency() + entry.getValue());
// currentValue.setDocumentContainingWord(currentValue.getDocumentContainingWord() + 1);
//
// }
// documentModel.add(currentValue);
// }
StringBuffer modelBuffer = new StringBuffer();
for(Entry<String, TextDataModel> entry: models.entrySet()) {
for (Entry<String, TextDataModel> entry : models.entrySet()) {
TextDataModel currentValue = entry.getValue();
// One more document has been seen: bump the total on every entry.
currentValue.setTotalDocumentSize(maxDocumentSize + 1);
modelBuffer.append(entry.getValue().toStringData());
}
FileUtils.write(textModel, modelBuffer.toString());
// Sum tf * idf over the words of this email.
for(TextDataModel textDataModel: documentModel) {
for (TextDataModel textDataModel : documentModel) {
String word = textDataModel.getWord();
int totalDocumentSize = textDataModel.getTotalDocumentSize();
int documentContainingWord = textDataModel.getDocumentContainingWord();
double tf = (double) emailWords.get(word) / (double) emailWordFrequency;
double idf = Math.log10((double) totalDocumentSize / (double) documentContainingWord);
double tfidf = tf * idf;
contentWeight = contentWeight + tfidf;
//System.out.println("TFIDF " + word + ": " + tfidf);
// NOTE(review): the four statements below repeat the four above (tf, idf, tfidf and the
// accumulation into contentWeight); if both copies were kept, each word would be counted
// twice. Presumably one set is the removed variant -- confirm.
double tf = (double)emailWords.get(word) / (double)emailWordFrequency;
double idf = Math.log10((double) totalDocumentSize/ (double)documentContainingWord);
double tfidf = tf * idf;
contentWeight = contentWeight + tfidf;
//System.out.println("TFIDF " + word + ": " + tfidf);
}
// for(TextDataModel textDataModel: documentModel) {
// String word = textDataModel.getWord();
// int totalDocumentSize = textDataModel.getTotalDocumentSize();
// int documentContainingWord = textDataModel.getDocumentContainingWord();
// if(word.startsWith("S_")) {
// double tf = (double)subjectWords.get(word) / (double)subjectWordFrequency;
// double idf = Math.log10((double) totalDocumentSize/ (double)documentContainingWord);
// double tfidf = tf * idf;
// contentWeight = contentWeight + tfidf;
// //System.out.println("TFIDF " + word + ": " + tfidf);
// }else {
// double tf = (double)bodyWords.get(word) / (double)bodyWordFrequency;
// double idf = Math.log10((double)totalDocumentSize / (double)documentContainingWord);
// double tfidf = tf * idf;
// //System.out.println("TFIDF " + word + ": " + tfidf);
// contentWeight = contentWeight + tfidf;
// }
//
// }
//System.out.println("Email contentWeight: " + contentWeight);
}catch(Exception e) {
} catch (Exception e) {
// Any failure is only printed; the caller receives the last value of contentWeight.
e.printStackTrace();
}
return contentWeight;
}
public static HashMap<String, TextDataModel> readModelFromFile(File file){
public static HashMap<String, TextDataModel> readModelFromFile(File file) {
HashMap<String, TextDataModel> models = new HashMap<String, TextDataModel>();