Commit ba6aa0f7 authored by Zied SELLAMI
Browse files

Add reading html email body

parent 61737666
......@@ -9,7 +9,7 @@ import org.joda.time.DateTime;
public class Email {
//"messageId\tfrom\tto\tcc\tbcc\treceivedTime\tsentTime\tinReplyTo\tsubject\tbodyFile\tattachment\tpriority\temailFolder\n"
//"messageId\tfrom\tto\tcc\tbcc\treceivedTime\tsentTime\tinReplyTo\tsubject\tbodyFile\thtmlBodyFile\tattachment\tpriority\temailFolder\n"
private String messageId = null;
private InternetAddress from = null;
private List<InternetAddress> to = null;
......@@ -20,7 +20,9 @@ public class Email {
private String inReplyTo = null;
private String subject = null;
private String bodyFile = null;
private String htmlBodyFile = null;
private String body = null;
private String htmlBody = null;
private List<EmailAttachment> attachments = null;
private Integer priority = null;
private boolean isASpam = false;
......@@ -181,5 +183,21 @@ public class Email {
/**
 * Replaces the list of alternative addresses held by this email.
 *
 * @param alternativeAddress the new list; stored by reference, not copied
 */
public void setAlternativeAddress(List<String> alternativeAddress) {
this.alternativeAddress = alternativeAddress;
}
/**
 * Returns the name of the file holding the HTML version of the body.
 *
 * @return the stored HTML body file name, or {@code null} if none was set
 */
public String getHtmlBodyFile() {
return htmlBodyFile;
}
/**
 * Sets the name of the file holding the HTML version of the body.
 *
 * @param htmlBodyFile the HTML body file name; may be {@code null}
 */
public void setHtmlBodyFile(String htmlBodyFile) {
this.htmlBodyFile = htmlBodyFile;
}
/**
 * Returns the HTML content of the email body.
 *
 * @return the stored HTML body, or {@code null} if none was set
 */
public String getHtmlBody() {
return htmlBody;
}
/**
 * Sets the HTML content of the email body.
 *
 * @param htmlBody the HTML body text; may be {@code null}
 */
public void setHtmlBody(String htmlBody) {
this.htmlBody = htmlBody;
}
}
......@@ -49,14 +49,14 @@ public class CSVReader {
// CSV file header
private static final String[] FILE_HEADER = { "messageId", "from", "to", "cc", "bcc", "receivedTime", "sentTime",
"inReplyTo", "subject", "bodyFile", "attachment", "priority", "emailFolder", "spamFlag" };
"inReplyTo", "subject", "bodyFile", "htmlBodyFile","attachment", "priority", "emailFolder", "spamFlag" };
private static Pattern Blank_Pattern = Pattern.compile("(\u00a0|\u202f|\u2007| )");
private static Pattern Email_Section_Pattern = Pattern.compile("<([^<]+@[^>]+)>");
public static HashMap<String, Email> parseEmails(String csvPath, String bodyPath) {
public static HashMap<String, Email> parseEmails(String csvPath, String bodyPath, String htmBodyPath) {
HashMap<String, Email> emails = new HashMap<String, Email>();
int counter = 0;
try {
......@@ -101,6 +101,15 @@ public class CSVReader {
body = Blank_Pattern.matcher(body).replaceAll(" ");
email.setBody(body);
}
email.setHtmlBodyFile(csvRecord.get("htmlBodyFile"));
if (csvRecord.get("htmlBodyFile") != null) {
String htmlBody = FileUtils.readFileToString(new File(htmBodyPath + csvRecord.get("htmlBodyFile")));
htmlBody = Blank_Pattern.matcher(htmlBody).replaceAll(" ");
email.setHtmlBody(htmlBody);
}
email.setAttachments(parseAttachments(csvRecord.get("attachment")));
email.setPriority(parseStringInteger(csvRecord.get("priority")));
......@@ -134,7 +143,7 @@ public class CSVReader {
return emails;
}
public static Inbox parseInbox(String csvPath, String bodyPath) {
public static Inbox parseInbox(String csvPath, String bodyPath, String htmBodyPath) {
HashMap<String, EmailFolder> emailFolders = new HashMap<String, EmailFolder>();
Inbox inbox = null;
int counter = 0;
......@@ -180,6 +189,14 @@ public class CSVReader {
body = Blank_Pattern.matcher(body).replaceAll(" ");
email.setBody(body);
}
email.setHtmlBodyFile(csvRecord.get("htmlBodyFile"));
if (csvRecord.get("htmlBodyFile") != null) {
String htmlBody = FileUtils.readFileToString(new File(htmBodyPath + csvRecord.get("htmlBodyFile")));
htmlBody = Blank_Pattern.matcher(htmlBody).replaceAll(" ");
email.setHtmlBody(htmlBody);
}
email.setAttachments(parseAttachments(csvRecord.get("attachment")));
email.setPriority(parseStringInteger(csvRecord.get("priority")));
......
......@@ -32,10 +32,10 @@ import org.apache.commons.io.FileUtils;
public class CheckEmail {
private static final String email_id = "";
private static final String email_id = "zsellami@linagora.com";
private static final String password = "";
private static String directory = "";
private static String directory = "/home/zsellami/newCorpus/zied2/";
/**
......@@ -60,7 +60,7 @@ public class CheckEmail {
private static final String NULL = "null";
//CSV file header
private static final String [] FILE_HEADER = {"messageId","from","to","cc","bcc","receivedTime","sentTime","inReplyTo","subject","bodyFile","attachment","priority","emailFolder", "spamFlag"};
private static final String [] FILE_HEADER = {"messageId","from","to","cc","bcc","receivedTime","sentTime","inReplyTo","subject","bodyFile","htmlBodyFile","attachment","priority","emailFolder", "spamFlag"};
static FileWriter fileWriter = null;
......@@ -165,11 +165,17 @@ private static void readMessage(MimeMessage message) {
String subject = getSubject(message);
String body = getTextFromMessage(message);
String htmlBody = getHtmlFromMessage(message);
String bodyFile = null;
if(body != null && !body.equals("")) {
bodyFile = UUID.randomUUID() + "_" + System.nanoTime() + ".txt";
FileUtils.write(new File(directory + "emails_body/" + bodyFile), body);
}
String htmlBodyFile = null;
if(htmlBody != null && !htmlBody.equals("")) {
htmlBodyFile = UUID.randomUUID() + "_" + System.nanoTime() + ".txt";
FileUtils.write(new File(directory + "emails_htmlBody/" + htmlBodyFile), htmlBody);
}
String attachments = getAttachments(message);
String priority = "0";
......@@ -195,6 +201,7 @@ private static void readMessage(MimeMessage message) {
emailRecord.add(inReplyTo);
emailRecord.add(subject);
emailRecord.add(bodyFile);
emailRecord.add(htmlBodyFile);
emailRecord.add(attachments);
emailRecord.add(priority);
emailRecord.add(emailFolder);
......@@ -223,6 +230,7 @@ private static void readMessage(MimeMessage message) {
}
private static String getSpamFlag(MimeMessage message) {
// TODO Auto-generated method stub
try {
......@@ -298,6 +306,34 @@ private static String getTextFromMessage(Message message) throws MessagingExcept
return result;
}
/**
 * Extracts the HTML body of a message.
 *
 * A message whose own type is {@code text/html} yields the string form of
 * its content; a {@code multipart/*} message is delegated to
 * {@link #getHtmlFromMimeMultipart(MimeMultipart)}; any other type yields
 * {@code null}.
 *
 * @param message the message to inspect
 * @return the HTML content, or {@code null} when the message is neither
 *         {@code text/html} nor multipart
 * @throws MessagingException if the message type or content cannot be read
 * @throws IOException if reading the content fails
 */
private static String getHtmlFromMessage(MimeMessage message) throws MessagingException, IOException{
    // Single-part HTML message: the content itself is the body.
    if (message.isMimeType("text/html")) {
        return message.getContent().toString();
    }
    // Anything that is not a multipart container carries no HTML for us.
    if (!message.isMimeType("multipart/*")) {
        return null;
    }
    MimeMultipart container = (MimeMultipart) message.getContent();
    return getHtmlFromMimeMultipart(container);
}
/**
 * Searches a multipart container for HTML content.
 *
 * Parts are visited in order. The first part that is itself
 * {@code text/html} wins: its content is returned immediately and anything
 * gathered from earlier nested containers is discarded. Otherwise, HTML
 * found inside nested multipart parts is joined with {@code "\n"}.
 *
 * Unlike a naive {@code html + "\n" + nested} accumulation, nested results
 * that are {@code null} are skipped, so the returned text never contains
 * the literal string {@code "null"} and a container without any HTML
 * yields {@code null} rather than {@code "null\nnull"}.
 *
 * @param mimeMultipart the container to search
 * @return the HTML found, or {@code null} if the container holds none
 * @throws MessagingException if a part or its type cannot be read
 * @throws IOException if reading a part's content fails
 */
private static String getHtmlFromMimeMultipart(
        MimeMultipart mimeMultipart) throws MessagingException, IOException{
    // Lazily created so that "nothing found" can still be reported as null.
    StringBuilder nestedHtml = null;
    int count = mimeMultipart.getCount();
    for (int i = 0; i < count; i++) {
        BodyPart bodyPart = mimeMultipart.getBodyPart(i);
        if (bodyPart.isMimeType("text/html")) {
            // A direct HTML part takes precedence over nested findings.
            return (String) bodyPart.getContent();
        }
        // Fetch the content once; it is needed for both the type check and the recursion.
        Object content = bodyPart.getContent();
        if (content instanceof MimeMultipart) {
            String nested = getHtmlFromMimeMultipart((MimeMultipart) content);
            if (nested != null) {
                if (nestedHtml == null) {
                    nestedHtml = new StringBuilder(nested);
                } else {
                    nestedHtml.append("\n").append(nested);
                }
            }
        }
    }
    return nestedHtml == null ? null : nestedHtml.toString();
}
private static String getTextFromMimeMultipart(
MimeMultipart mimeMultipart) throws MessagingException, IOException{
String result = "";
......
......@@ -20,6 +20,7 @@ public class Test {
//Text.setStopWordsDirectory("stopwords");
String DATA_FILE_PATH = "/home/zsellami/newCorpus/zied2/DATA.csv";
String BODY_DIR_PATH = "/home/zsellami/newCorpus/zied2/emails_body/";
String HTML_BODY_DIR_PATH = "/home/zsellami/newCorpus/zied2/emails_htmlBody/";
String user = "Zied Sellami";
List<String> alternativeEmails = Arrays.asList("zsellami@linagora.com", "zied.sellami@linagora.com");
......@@ -29,7 +30,7 @@ public class Test {
long startTime = System.currentTimeMillis();
Inbox inbox = CSVReader.parseInbox(DATA_FILE_PATH, BODY_DIR_PATH);
Inbox inbox = CSVReader.parseInbox(DATA_FILE_PATH, BODY_DIR_PATH, HTML_BODY_DIR_PATH);
inbox.setUser(user);
inbox.setEmailAddresses(alternativeEmails);
......
......@@ -68,6 +68,7 @@ public class TestEmailSample {
String DATA_FILE_PATH = "/home/zsellami/newCorpus/zied2/DATA.csv";
String BODY_DIR_PATH = "/home/zsellami/newCorpus/zied2/emails_body/";
String HTML_BODY_DIR_PATH = "/home/zsellami/newCorpus/zied2/emails_htmlBody/";
String user = "Zied Sellami";
List<String> alternativeEmails = Arrays.asList("zsellami@linagora.com", "zied.sellami@linagora.com");
......@@ -77,7 +78,7 @@ public class TestEmailSample {
long startTime = System.currentTimeMillis();
Inbox inbox = CSVReader.parseInbox(DATA_FILE_PATH, BODY_DIR_PATH);
Inbox inbox = CSVReader.parseInbox(DATA_FILE_PATH, BODY_DIR_PATH, HTML_BODY_DIR_PATH);
inbox.setUser(user);
inbox.setEmailAddresses(alternativeEmails);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment