Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Zied SELLAMI
priority-inbox
Commits
d63367ac
Commit
d63367ac
authored
May 10, 2019
by
Zied SELLAMI
Browse files
Integrating DynamicFeature
parent
4c333593
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
496 additions
and
301 deletions
+496
-301
priorityInbox/CONFIG
priorityInbox/CONFIG
+2
-0
priorityInbox/CONFIG_Docker
priorityInbox/CONFIG_Docker
+2
-0
priorityInbox/pom.xml
priorityInbox/pom.xml
+14
-1
priorityInbox/src/main/java/org/linagora/priorityInbox/api/Configuration.java
...in/java/org/linagora/priorityInbox/api/Configuration.java
+8
-0
priorityInbox/src/main/java/org/linagora/priorityInbox/api/WebService.java
.../main/java/org/linagora/priorityInbox/api/WebService.java
+4
-0
priorityInbox/src/main/java/org/linagora/priorityInbox/api/WebServiceMain.java
...n/java/org/linagora/priorityInbox/api/WebServiceMain.java
+24
-1
priorityInbox/src/main/java/org/linagora/priorityInbox/feature/DynamicFeature.java
...va/org/linagora/priorityInbox/feature/DynamicFeature.java
+204
-267
priorityInbox/src/main/java/org/linagora/priorityInbox/feature/FeatureExtractor.java
.../org/linagora/priorityInbox/feature/FeatureExtractor.java
+61
-11
priorityInbox/src/main/java/org/linagora/priorityInbox/feature/LabeledDocument.java
...a/org/linagora/priorityInbox/feature/LabeledDocument.java
+66
-2
priorityInbox/src/main/java/org/linagora/priorityInbox/feature/PositionWeight.java
...va/org/linagora/priorityInbox/feature/PositionWeight.java
+16
-0
priorityInbox/src/main/java/org/linagora/priorityInbox/feature/TextDataModel.java
...ava/org/linagora/priorityInbox/feature/TextDataModel.java
+26
-3
priorityInbox/src/main/java/org/linagora/priorityInbox/text/Text.java
...x/src/main/java/org/linagora/priorityInbox/text/Text.java
+68
-15
priorityInbox/src/main/java/org/linagora/priorityInbox/text/TextCleaner.java
...ain/java/org/linagora/priorityInbox/text/TextCleaner.java
+1
-1
No files found.
priorityInbox/CONFIG
View file @
d63367ac
...
...
@@ -4,3 +4,5 @@ TMP_DIRECTORY = /home/zsellami/tmp
NOTIFICATION_CHANNEL = /home/zsellami/dev/git/priority-inbox/priorityInbox/channels/
JOB_POSITION_RULE = /home/zsellami/dev/git/priority-inbox/priorityInbox/jobpositions/
TEXT_CLEANER_RULE = /home/zsellami/dev/git/priority-inbox/priorityInbox/TextCleaner.regex
MODEL_DIRECTORY = /home/zsellami/dev/git/priority-inbox/priorityInbox/models/
STOP_WORDS_DIRECTORY = /home/zsellami/dev/git/priority-inbox/priorityInbox/stopwords/
\ No newline at end of file
priorityInbox/CONFIG_Docker
View file @
d63367ac
...
...
@@ -4,3 +4,5 @@ TMP_DIRECTORY = /priority-inbox/priorityInbox/tmp
NOTIFICATION_CHANNEL = /priority-inbox/priorityInbox/channels/
JOB_POSITION_RULE = /priority-inbox/priorityInbox/jobpositions/
TEXT_CLEANER_RULE = /priority-inbox/priorityInbox/TextCleaner.regex
MODEL_DIRECTORY = /priority-inbox/priorityInbox/models/
STOP_WORDS_DIRECTORY = /priority-inbox/priorityInbox/stopwords/
\ No newline at end of file
priorityInbox/pom.xml
View file @
d63367ac
...
...
@@ -105,6 +105,19 @@
<artifactId>
tika-langdetect
</artifactId>
<version>
1.20
</version>
</dependency>
<dependency>
<groupId>
com.thoughtworks.paranamer
</groupId>
<artifactId>
paranamer
</artifactId>
<version>
2.8
</version>
</dependency>
<dependency>
<groupId>
com.google.guava
</groupId>
<artifactId>
guava
</artifactId>
<version>
15.0
</version>
</dependency>
</dependencies>
<build>
...
...
priorityInbox/src/main/java/org/linagora/priorityInbox/api/Configuration.java
View file @
d63367ac
...
...
@@ -59,6 +59,14 @@ public class Configuration {
return
parameters
.
get
(
"TMP_DIRECTORY"
);
}
/**
 * Looks up the {@code "MODEL_DIRECTORY"} entry of the loaded configuration parameters.
 *
 * @return whatever {@code parameters} holds under {@code "MODEL_DIRECTORY"}
 */
public String getModelDirectory() {
    final String modelDirectory = parameters.get("MODEL_DIRECTORY");
    return modelDirectory;
}
/**
 * Looks up the {@code "STOP_WORDS_DIRECTORY"} entry of the loaded configuration parameters.
 *
 * @return whatever {@code parameters} holds under {@code "STOP_WORDS_DIRECTORY"}
 */
public String getStopWordsDirectory() {
    final String stopWordsDirectory = parameters.get("STOP_WORDS_DIRECTORY");
    return stopWordsDirectory;
}
@Override
public
String
toString
()
{
return
"Configuration [parameters="
+
parameters
+
"]"
;
...
...
priorityInbox/src/main/java/org/linagora/priorityInbox/api/WebService.java
View file @
d63367ac
...
...
@@ -22,8 +22,10 @@ import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
import
org.glassfish.jersey.media.multipart.FormDataParam
;
import
org.linagora.priorityInbox.data.Email
;
import
org.linagora.priorityInbox.feature.DynamicFeature
;
import
org.linagora.priorityInbox.feature.FeatureExtractor
;
import
org.linagora.priorityInbox.feature.LabeledDocument
;
import
org.linagora.priorityInbox.text.Text
;
import
org.linagora.priorityInbox.text.TextCleaner
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
...
...
@@ -41,6 +43,8 @@ public class WebService {
FeatureExtractor
.
JOB_POSITION_RULES
=
new
File
(
config
.
getJobPositionRules
());
FeatureExtractor
.
NOTIFICATION_CHANNELS
=
new
File
(
config
.
getNotificationChannels
());
TextCleaner
.
CLEANING_REGEX
=
new
File
(
config
.
getTextCleanerRegex
());
DynamicFeature
.
modelDirectory
=
config
.
getModelDirectory
();
Text
.
setStopWordsDirectory
(
config
.
getStopWordsDirectory
());
}
// https://stackoverflow.com/questions/30653012/multipart-form-data-no-injection-source-found-for-a-parameter-of-type-public-ja?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
...
...
priorityInbox/src/main/java/org/linagora/priorityInbox/api/WebServiceMain.java
View file @
d63367ac
package
org.linagora.priorityInbox.api
;
import
java.net.URI
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.ScheduledExecutorService
;
import
java.util.concurrent.TimeUnit
;
import
javax.ws.rs.core.UriBuilder
;
...
...
@@ -8,6 +11,7 @@ import org.glassfish.grizzly.http.server.HttpServer;
import
org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory
;
import
org.glassfish.jersey.media.multipart.MultiPartFeature
;
import
org.glassfish.jersey.server.ResourceConfig
;
import
org.linagora.priorityInbox.feature.DynamicFeature
;
public
class
WebServiceMain
{
...
...
@@ -17,7 +21,25 @@ public class WebServiceMain {
}
public
static
void
main
(
String
[]
args
)
{
final
ScheduledExecutorService
executor
=
Executors
.
newSingleThreadScheduledExecutor
();
Runnable
periodicSaving
=
new
Runnable
()
{
public
void
run
()
{
DynamicFeature
.
saveModels
();
}
};
if
(
args
.
length
==
1
)
{
Runtime
.
getRuntime
().
addShutdownHook
(
new
Thread
()
{
public
void
run
()
{
DynamicFeature
.
saveModels
();
System
.
out
.
println
(
"Program stopped. Models saved before stopping the service."
);
executor
.
shutdown
();
}
});
executor
.
scheduleWithFixedDelay
(
periodicSaving
,
300
,
600
,
TimeUnit
.
SECONDS
);
Thread
app
=
new
Thread
()
{
public
void
run
()
{
...
...
@@ -59,6 +81,7 @@ public class WebServiceMain {
System
.
out
.
println
(
"Please set config argument. Usage: WebServiceMain configFilePath"
);
System
.
exit
(
0
);
}
}
...
...
priorityInbox/src/main/java/org/linagora/priorityInbox/feature/DynamicFeature.java
View file @
d63367ac
package
org.linagora.priorityInbox.feature
;
import
java.io.File
;
import
java.math.BigInteger
;
import
java.nio.charset.Charset
;
import
java.nio.file.Files
;
import
java.nio.file.Path
;
import
java.nio.file.Paths
;
import
java.text.DecimalFormat
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map.Entry
;
import
javax.mail.internet.InternetAddress
;
import
org.apache.commons.io.FileUtils
;
import
org.linagora.priorityInbox.data.Email
;
import
org.linagora.priorityInbox.text.Text
;
...
...
@@ -21,52 +22,92 @@ public class DynamicFeature {
public
static
String
modelDirectory
=
null
;
private
static
File
getSenderModel
(
String
userId
)
{
File
senderModel
=
null
;
try
{
Path
userModelPath
=
Paths
.
get
(
modelDirectory
,
userId
,
"/"
);
Path
senderModelPath
=
Paths
.
get
(
userModelPath
.
toString
(),
"senderModel"
);
private
static
HashMap
<
String
,
HashMap
<
String
,
TextDataModel
>>
WORDS_DATA
=
new
HashMap
<
String
,
HashMap
<
String
,
TextDataModel
>>();
private
static
HashMap
<
String
,
HashMap
<
String
,
TextDataModel
>>
SENDER_DATA
=
new
HashMap
<
String
,
HashMap
<
String
,
TextDataModel
>>();
File
directory
=
userModelPath
.
toFile
();
/**
 * Returns the word model of a user, loading it from disk on the first request.
 * <p>
 * {@code WORDS_DATA} is consulted first. On a miss the model is read from the
 * {@code textModel} file located under {@code modelDirectory/userId}; a non-null
 * result is stored in {@code WORDS_DATA} before being returned.
 *
 * @param userId key into {@code WORDS_DATA}, also used as the user's sub-directory name
 * @return the cached model, or on a cache miss whatever {@code readModelFromFile} returned
 */
private static synchronized HashMap<String, TextDataModel> getWordsData(String userId) {
    HashMap<String, TextDataModel> cached = WORDS_DATA.get(userId);
    if (cached != null) {
        return cached;
    }

    // Cache miss: fall back to the model persisted on disk for this user.
    File textModelFile = Paths.get(modelDirectory, userId, "textModel").toFile();
    HashMap<String, TextDataModel> loaded = readModelFromFile(textModelFile);
    if (loaded != null) {
        WORDS_DATA.put(userId, loaded);
    }
    return loaded;
}
if
(!
directory
.
exists
())
{
directory
.
mkdirs
();
FileUtils
.
writeStringToFile
(
senderModelPath
.
toFile
(),
""
);
}
else
if
(!
senderModelPath
.
toFile
().
exists
())
{
FileUtils
.
writeStringToFile
(
senderModelPath
.
toFile
(),
""
);
/**
 * Returns the sender model of a user, loading it from disk on the first request.
 * <p>
 * {@code SENDER_DATA} is consulted first. On a miss the model is read from the
 * {@code senderModel} file located under {@code modelDirectory/userId}; a non-null
 * result is stored in {@code SENDER_DATA} before being returned.
 *
 * @param userId key into {@code SENDER_DATA}, also used as the user's sub-directory name
 * @return the cached model, or on a cache miss whatever {@code readModelFromFile} returned
 */
private static synchronized HashMap<String, TextDataModel> getSendersData(String userId) {
    HashMap<String, TextDataModel> cached = SENDER_DATA.get(userId);
    if (cached != null) {
        return cached;
    }

    // Cache miss: fall back to the model persisted on disk for this user.
    File senderModelFile = Paths.get(modelDirectory, userId, "senderModel").toFile();
    HashMap<String, TextDataModel> loaded = readModelFromFile(senderModelFile);
    if (loaded != null) {
        SENDER_DATA.put(userId, loaded);
    }
    return loaded;
}
public
static
synchronized
void
saveModels
()
{
long
start
=
System
.
currentTimeMillis
();
// Saving words data
for
(
Entry
<
String
,
HashMap
<
String
,
TextDataModel
>>
entry
:
WORDS_DATA
.
entrySet
())
{
String
userId
=
entry
.
getKey
();
StringBuffer
modelBuffer
=
new
StringBuffer
();
for
(
TextDataModel
textDataModel
:
entry
.
getValue
().
values
())
{
modelBuffer
.
append
(
textDataModel
.
toStringData
());
}
senderModel
=
senderModelPath
.
toFile
();
}
catch
(
Exception
e
)
{
try
{
FileUtils
.
write
(
Paths
.
get
(
modelDirectory
,
userId
,
"textModel"
).
toFile
(),
modelBuffer
.
toString
(),
Charset
.
forName
(
"UTF-8"
));
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
return
senderModel
;
// Saving sender data
for
(
Entry
<
String
,
HashMap
<
String
,
TextDataModel
>>
entry
:
SENDER_DATA
.
entrySet
())
{
String
userId
=
entry
.
getKey
();
StringBuffer
modelBuffer
=
new
StringBuffer
();
for
(
TextDataModel
textDataModel
:
entry
.
getValue
().
values
())
{
modelBuffer
.
append
(
textDataModel
.
toStringData
());
}
private
static
File
getTextModel
(
String
userId
)
{
File
textModel
=
null
;
try
{
Path
userModelPath
=
Paths
.
get
(
modelDirectory
,
userId
,
"/"
);
Path
textModelPath
=
Paths
.
get
(
userModelPath
.
toString
(),
"textModel"
);
File
directory
=
userModelPath
.
toFile
();
FileUtils
.
write
(
Paths
.
get
(
modelDirectory
,
userId
,
"senderModel"
).
toFile
(),
modelBuffer
.
toString
(),
Charset
.
forName
(
"UTF-8"
));
if
(!
directory
.
exists
())
{
directory
.
mkdirs
();
FileUtils
.
writeStringToFile
(
textModelPath
.
toFile
(),
""
);
}
else
if
(!
textModelPath
.
toFile
().
exists
())
{
FileUtils
.
writeStringToFile
(
textModelPath
.
toFile
(),
""
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
textModel
=
textModelPath
.
toFile
();
long
end
=
System
.
currentTimeMillis
();
String
readableSize
=
""
;
long
size
=
0
;
try
{
Path
folder
=
Paths
.
get
(
modelDirectory
);
size
=
FileUtils
.
sizeOfDirectory
(
folder
.
toFile
());
int
unitIndex
=
(
int
)
(
Math
.
log10
(
size
)
/
3
);
String
[]
units
=
new
String
[]
{
"B"
,
"KB"
,
"MB"
,
"GB"
,
"TB"
,
"ZB"
};
double
unitValue
=
1
<<
(
unitIndex
*
10
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
readableSize
=
new
DecimalFormat
(
"#,##0.#"
)
.
format
(
size
/
unitValue
)
+
" "
+
units
[
unitIndex
];
System
.
out
.
println
(
"SAVING MODELS IN "
+
modelDirectory
+
" - MODELS SIZE="
+
readableSize
);
}
catch
(
Exception
e
)
{
//e.printStackTrace();
System
.
err
.
println
(
"SAVING MODELS... NO MODEL BUILT YET."
);
}
return
textModel
;
}
private
static
String
toUserId
(
String
user
,
List
<
String
>
alternativeAddress
)
{
...
...
@@ -77,7 +118,6 @@ public class DynamicFeature {
return
userId
;
}
public
static
double
getSenderWeight
(
Email
email
)
{
double
senderWeight
=
0
d
;
try
{
...
...
@@ -85,48 +125,59 @@ public class DynamicFeature {
TextDataModel
senderTextDataModel
=
null
;
String
userId
=
toUserId
(
email
.
getUser
(),
email
.
getAlternativeAddress
());
File
senderModel
=
getSenderModel
(
userId
);
models
=
readModelFromFile
(
senderModel
);
models
=
getSendersData
(
userId
);
// readModelFromFile(dataModelFile);
int
maxDocumentSize
=
0
;
int
currentIndex
=
0
;
if
(
models
==
null
)
{
models
=
new
HashMap
<
String
,
TextDataModel
>();
SENDER_DATA
.
put
(
userId
,
models
);
for
(
Entry
<
String
,
TextDataModel
>
entry:
models
.
entrySet
())
{
}
else
{
for
(
Entry
<
String
,
TextDataModel
>
entry
:
models
.
entrySet
())
{
TextDataModel
currentValue
=
entry
.
getValue
();
maxDocumentSize
=
Math
.
max
(
maxDocumentSize
,
currentValue
.
getTotalDocumentSize
());
break
;
}
for
(
TextDataModel
tdm
:
models
.
values
())
{
currentIndex
=
Math
.
max
(
currentIndex
,
tdm
.
getIndex
());
}
}
currentIndex
=
currentIndex
+
1
;
String
sender
=
email
.
getFrom
().
getAddress
();
if
(
sender
!=
null
)
{
if
(
sender
!=
null
)
{
TextDataModel
currentValue
=
models
.
get
(
sender
.
toLowerCase
());
if
(
currentValue
==
null
)
{
currentValue
=
new
TextDataModel
(
sender
.
toLowerCase
(),
1
,
0
,
1
);
if
(
currentValue
==
null
)
{
currentValue
=
new
TextDataModel
(
currentIndex
,
sender
.
toLowerCase
(),
1
,
0
,
1
);
currentIndex
++;
models
.
put
(
sender
,
currentValue
);
}
else
{
}
else
{
currentValue
.
setFrequency
(
currentValue
.
getFrequency
()
+
1
);
currentValue
.
setDocumentContainingWord
(
currentValue
.
getDocumentContainingWord
()
+
1
);
}
senderTextDataModel
=
currentValue
;
}
StringBuffer
modelBuffer
=
new
StringBuffer
();
for
(
Entry
<
String
,
TextDataModel
>
entry:
models
.
entrySet
())
{
for
(
Entry
<
String
,
TextDataModel
>
entry
:
models
.
entrySet
())
{
TextDataModel
currentValue
=
entry
.
getValue
();
currentValue
.
setTotalDocumentSize
(
maxDocumentSize
+
1
);
modelBuffer
.
append
(
entry
.
getValue
().
toStringData
());
}
FileUtils
.
write
(
senderModel
,
modelBuffer
.
toString
());
double
tf
=
1
d
;
double
idf
=
Math
.
log10
((
double
)
senderTextDataModel
.
getTotalDocumentSize
()
/
(
double
)
senderTextDataModel
.
getDocumentContainingWord
());
double
idf
=
Math
.
log10
((
double
)
senderTextDataModel
.
getTotalDocumentSize
()
/
(
double
)
senderTextDataModel
.
getDocumentContainingWord
());
double
tfidf
=
tf
*
idf
;
senderWeight
=
tfidf
;
// System.out.println("Sender weight: " + tfidf);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
return
senderWeight
;
...
...
@@ -140,36 +191,43 @@ public class DynamicFeature {
List
<
TextDataModel
>
documentModel
=
new
ArrayList
<
TextDataModel
>();
String
userId
=
toUserId
(
email
.
getUser
(),
email
.
getAlternativeAddress
());
File
textModel
=
getTextModel
(
userId
);
models
=
readModelFromFile
(
textModel
);
int
maxDocumentSize
=
0
;
int
currentIndex
=
0
;
models
=
getWordsData
(
userId
);
// readModelFromFile(dataModelFile);
if
(
models
==
null
)
{
models
=
new
HashMap
<
String
,
TextDataModel
>();
WORDS_DATA
.
put
(
userId
,
models
);
for
(
Entry
<
String
,
TextDataModel
>
entry:
models
.
entrySet
())
{
}
else
{
for
(
Entry
<
String
,
TextDataModel
>
entry
:
models
.
entrySet
())
{
TextDataModel
currentValue
=
entry
.
getValue
();
maxDocumentSize
=
Math
.
max
(
maxDocumentSize
,
currentValue
.
getTotalDocumentSize
());
break
;
}
for
(
TextDataModel
tdm
:
models
.
values
())
{
currentIndex
=
Math
.
max
(
currentIndex
,
tdm
.
getIndex
());
}
}
currentIndex
=
currentIndex
+
1
;
String
language
=
detectLanguage
(
email
);
// HashMap<String, Integer> subjectWords = toSubjectBagOfWords(email, language);
// HashMap<String, Integer> bodyWords = toBodyBagOfWords(email, language);
HashMap
<
String
,
Integer
>
emailWords
=
toBagOfWords
(
email
,
language
);
// int subjectWordFrequency = 0;
// int bodyWordFrequency = 0;
int
emailWordFrequency
=
0
;
//update data model with new words
for
(
Entry
<
String
,
Integer
>
entry:
emailWords
.
entrySet
())
{
//
update data model with new words
for
(
Entry
<
String
,
Integer
>
entry
:
emailWords
.
entrySet
())
{
TextDataModel
currentValue
=
models
.
get
(
entry
.
getKey
());
emailWordFrequency
=
emailWordFrequency
+
entry
.
getValue
();
if
(
currentValue
==
null
)
{
currentValue
=
new
TextDataModel
(
entry
.
getKey
(),
entry
.
getValue
(),
0
,
1
);
if
(
currentValue
==
null
)
{
currentValue
=
new
TextDataModel
(
currentIndex
,
entry
.
getKey
(),
entry
.
getValue
(),
0
,
1
);
currentIndex
++;
models
.
put
(
entry
.
getKey
(),
currentValue
);
}
else
{
}
else
{
currentValue
.
setFrequency
(
currentValue
.
getFrequency
()
+
entry
.
getValue
());
currentValue
.
setDocumentContainingWord
(
currentValue
.
getDocumentContainingWord
()
+
1
);
}
...
...
@@ -177,109 +235,53 @@ public class DynamicFeature {
}
// //update data model for subjectWord
// for(Entry<String, Integer> entry: subjectWords.entrySet()) {
// TextDataModel currentValue = models.get(entry.getKey());
// subjectWordFrequency = subjectWordFrequency + entry.getValue();
// if(currentValue == null) {
// currentValue = new TextDataModel(entry.getKey(), entry.getValue(), 0, 1);
// models.put(entry.getKey(), currentValue);
// }else {
// currentValue.setFrequency(currentValue.getFrequency() + entry.getValue());
// currentValue.setDocumentContainingWord(currentValue.getDocumentContainingWord() + 1);
// }
// documentModel.add(currentValue);
//
// }
//
// //update data model for bodyWord
// for(Entry<String, Integer> entry: bodyWords.entrySet()) {
// TextDataModel currentValue = models.get(entry.getKey());
// bodyWordFrequency = bodyWordFrequency + entry.getValue();
// if(currentValue == null) {
//
// currentValue = new TextDataModel(entry.getKey(), entry.getValue(), 0, 1);
// models.put(entry.getKey(), currentValue);
// }else {
// currentValue.setFrequency(currentValue.getFrequency() + entry.getValue());
// currentValue.setDocumentContainingWord(currentValue.getDocumentContainingWord() + 1);
//
// }
// documentModel.add(currentValue);
// }
StringBuffer
modelBuffer
=
new
StringBuffer
();
for
(
Entry
<
String
,
TextDataModel
>
entry:
models
.
entrySet
())
{
for
(
Entry
<
String
,
TextDataModel
>
entry
:
models
.
entrySet
())
{
TextDataModel
currentValue
=
entry
.
getValue
();
currentValue
.
setTotalDocumentSize
(
maxDocumentSize
+
1
);
modelBuffer
.
append
(
entry
.
getValue
().
toStringData
());
}
FileUtils
.
write
(
textModel
,
modelBuffer
.
toString
());
for
(
TextDataModel
textDataModel:
documentModel
)
{
for
(
TextDataModel
textDataModel
:
documentModel
)
{
String
word
=
textDataModel
.
getWord
();
int
totalDocumentSize
=
textDataModel
.
getTotalDocumentSize
();
int
documentContainingWord
=
textDataModel
.
getDocumentContainingWord
();
double
tf
=
(
double
)
emailWords
.
get
(
word
)
/
(
double
)
emailWordFrequency
;
double
idf
=
Math
.
log10
((
double
)
totalDocumentSize
/
(
double
)
documentContainingWord
);
double
tf
=
(
double
)
emailWords
.
get
(
word
)
/
(
double
)
emailWordFrequency
;
double
idf
=
Math
.
log10
((
double
)
totalDocumentSize
/
(
double
)
documentContainingWord
);
double
tfidf
=
tf
*
idf
;
contentWeight
=
contentWeight
+
tfidf
;
//System.out.println("TFIDF " + word + ": " + tfidf);
}
// for(TextDataModel textDataModel: documentModel) {
// String word = textDataModel.getWord();
// int totalDocumentSize = textDataModel.getTotalDocumentSize();
// int documentContainingWord = textDataModel.getDocumentContainingWord();
// if(word.startsWith("S_")) {
// double tf = (double)subjectWords.get(word) / (double)subjectWordFrequency;
// double idf = Math.log10((double) totalDocumentSize/ (double)documentContainingWord);
// double tfidf = tf * idf;
// contentWeight = contentWeight + tfidf;
// //System.out.println("TFIDF " + word + ": " + tfidf);
// }else {
// double tf = (double)bodyWords.get(word) / (double)bodyWordFrequency;
// double idf = Math.log10((double)totalDocumentSize / (double)documentContainingWord);
// double tfidf = tf * idf;
// //System.out.println("TFIDF " + word + ": " + tfidf);
// contentWeight = contentWeight + tfidf;
// }
//
// }
//System.out.println("Email contentWeight: " + contentWeight);
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
return
contentWeight
;
}
public
static
HashMap
<
String
,
TextDataModel
>
readModelFromFile
(
File
file
){
public
static
HashMap
<
String
,
TextDataModel
>
readModelFromFile
(
File
file
)
{
HashMap
<
String
,
TextDataModel
>
models
=
new
HashMap
<
String
,
TextDataModel
>();
try
{
List
<
String
>
lines
=
FileUtils
.
readLines
(
file
);
for
(
String
line:
lines
)
{
String
[]
elements
=
line
.
split
(
"\t"
);
String
word
=
elements
[
0
];
int
frequency
=
Integer
.
parseInt
(
elements
[
1
]);
int
totalDocumentSize
=
Integer
.
parseInt
(
elements
[
2
]);
int
documentContainingWord
=
Integer
.
parseInt
(
elements
[
3
]);
TextDataModel
textDataModel
=
new
TextDataModel
(
word
,
frequency
,
totalDocumentSize
,
documentContainingWord
);
List
<
String
>
lines
=
FileUtils
.
readLines
(
file
,
Charset
.
forName
(
"UTF-8"
));
for
(
String
line
:
lines
)
{
String
[]
elements
=
line
.
split
(
"\t"
);
int
index
=
Integer
.
parseInt
(
elements
[
0
]);
String
word
=
elements
[
1
];
int
frequency
=
Integer
.
parseInt
(
elements
[
2
]);
int
totalDocumentSize
=
Integer
.
parseInt
(
elements
[
3
]);
int
documentContainingWord
=
Integer
.
parseInt
(
elements
[
4
]);
TextDataModel
textDataModel
=
new
TextDataModel
(
index
,
word
,
frequency
,
totalDocumentSize
,
documentContainingWord
);
models
.
put
(
word
,
textDataModel
);
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
catch
(
Exception
e
)
{
//e.printStackTrace();
System
.
err
.
println
(
"READING MODEL FROM DISK. NO MODEL SAVED YET."
);
}
return
models
;
...
...
@@ -296,32 +298,14 @@ public class DynamicFeature {
return
Text
.
detectLanguage
(
content
);
}
/**
 * Builds a bag of words from the subject line of an email.
 * <p>
 * The subject is run through {@code Text.process} after {@code Text.setLanguage(language)},
 * lower-cased and counted with {@code Text.countTokens}. Every token is stored under the
 * key {@code "S_" + token}, with its count narrowed to an {@code int}.
 *
 * @param email    the email whose subject is read
 * @param language the language handed to {@code Text.setLanguage} before processing
 * @return the prefixed token counts; an empty map when the email has no subject
 */
private static HashMap<String, Integer> toSubjectBagOfWords(Email email, String language) {
    HashMap<String, Integer> bag = new HashMap<String, Integer>();
    if (email.getSubject() == null) {
        return bag;
    }

    Text.setLanguage(language);
    String processedSubject = Text.process(email.getSubject());
    HashMap<String, Double> tokenCounts = Text.countTokens(processedSubject.toLowerCase());
    for (Entry<String, Double> tokenCount : tokenCounts.entrySet()) {
        String prefixedToken = "S_" + tokenCount.getKey();
        bag.put(prefixedToken, tokenCount.getValue().intValue());
    }
    return bag;
}
private
static
HashMap
<
String
,
Integer
>
toBagOfWords
(
Email
email
,
String
language
)
{
HashMap
<
String
,
Integer
>
model
=
new
HashMap
<
String
,
Integer
>();
String
content
=
""
;
if
(
email
.
getSubject
()
!=
null
)
{
if
(
email
.
getSubject
()
!=
null
)
{
content
=
email
.
getSubject
();
}
if
(
email
.
getBody
()
!=
null
)
{
if
(
email
.
getBody
()
!=
null
)
{
String
cleanedText
=
TextCleaner
.
cleanReplyBlock
(
email
.
getBody
());
String
signature
=
TextCleaner
.
detectSignature
(
email
.
getBody
(),
email
.
getFrom
().
getPersonal
());
content
=
content
+
" \n"
+
cleanedText
.
replace
(
signature
,
""
);
...
...
@@ -330,62 +314,15 @@ public class DynamicFeature {
if
(!
content
.
equals
(
""
))
{
<