Commit c2f09a87 authored by Rhadia STG_HARMASSI's avatar Rhadia STG_HARMASSI

initial commit

parents

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

import sys
#sys.path.insert(0,"/home/rhermassi/CRF_suiteV8/CRFSuite")
sys.path.insert(0,"/data/nlp/CRF")
sys.path.insert(0,"/usr/lib/python3.6/site-packages")
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from feature_extraction import FeatureExtractor
import re
import eli5
import nltk
from nltk import pos_tag
import scipy.stats
from six.moves import zip
from token_features import *
import numpy as np
import os
# --- Training data -------------------------------------------------------
# sys.argv[1] is a directory of whitespace-separated CoNLL-like files:
# column 0 = token, column 1 = label, columns 3-6 = extra address tags.
FichList = [f for f in os.listdir(sys.argv[1])
            if os.path.isfile(os.path.join(sys.argv[1], f))]
import pandas as pd
# names=range(100) pads ragged rows with NaN; sep must be a RAW string so
# "\s+" is the whitespace regex, not an invalid string escape.
a = [pd.read_table(os.path.join(sys.argv[1], f), sep=r"\s+",
                   names=range(100), engine='python')
     for f in FichList]
training_set = [s.astype("str") for s in a]
# Keep only columns 0-6.
training_Set = [t.drop(t.columns[7:100], axis=1) for t in training_set]
train_set = []
for training in training_Set:
    # Column 2 <- NLTK POS tag of each token (column 0).
    training[2] = [tagged[1] for tagged in nltk.pos_tag(training.loc[:, 0])]
    # Column 3 <- tuple of the four address-tag columns 3-6.
    training[3] = training[[3, 4, 5, 6]].apply(tuple, axis=1)
    train_set.append(training)
# Drop the now-redundant columns 4-6 (merged into the tuple in column 3).
train_Set = [tr.drop(tr.columns[4:8], axis=1) for tr in train_set]
# --- Test data (same pipeline as the training data) ----------------------
FichList = [f for f in os.listdir(sys.argv[2])
            if os.path.isfile(os.path.join(sys.argv[2], f))]
# Raw string for the whitespace-regex separator (was "\s+", an invalid escape).
a = [pd.read_table(os.path.join(sys.argv[2], f), sep=r"\s+",
                   names=range(100), engine='python')
     for f in FichList]
testing_set = [s.astype("str") for s in a]
testing_Set = [tes.drop(tes.columns[7:100], axis=1) for tes in testing_set]
test_set = []
for test in testing_Set:
    # Column 2 <- NLTK POS tag of each token (column 0).
    test[2] = [tagged[1] for tagged in nltk.pos_tag(test.loc[:, 0])]
    # Column 3 <- tuple of the four address-tag columns 3-6.
    test[3] = test[[3, 4, 5, 6]].apply(tuple, axis=1)
    test_set.append(test)
test_Set = [ts.drop(ts.columns[4:8], axis=1) for ts in test_set]
# Shared feature extractor: turns a [word, pos_tag, address_tag] segment into
# a dict of string-keyed features via the token_features helper callables.
feature_extractor = FeatureExtractor(
    token_features = [
        token_identity,
        #pos_tag,                 # disabled: POS is added separately in word2features
        is_title,
        is_lower,
        is_digit,
        is_ponct,
        looks_like_street_part,
        looks_like_house_part,
        # looks_like_postcode_part,
        # looks_like_city_part,
    ]
)
def word2features(sent, i):
    """Build the CRF feature dict for token ``i`` of sentence DataFrame ``sent``.

    ``sent`` columns: 0 = token, 1 = label, 2 = POS tag (added upstream),
    3 = tuple of address-tag columns.  Besides the token's own features, the
    dict carries features of up to three neighbours on each side, keyed
    "Prefix1".."Prefix3" and "Suffix1".."Suffix3", plus 'BOS'/'EOS' markers
    at the sentence boundaries.
    """
    word = sent.loc[i, 0]
    pos_tag = sent.loc[i, 2]
    tag_address = sent.loc[i, 3]
    features_str = feature_extractor.transform_single([word, pos_tag, tag_address])
    features_str['postag'] = pos_tag

    lengths = (1, 2, 3)
    if i > 0:
        # Features of up to three tokens to the left (zip truncates to `end`).
        end = min(lengths[-1], i)
        windows_size = dict(zip(["Prefix%s" % j for j in range(1, end + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i - size, 0]
            pos_tag1 = sent.loc[i - size, 2]
            # NOTE(review): always the *immediately* preceding address tag,
            # even for size 2/3 — asymmetric with word1/pos_tag1; confirm
            # whether sent.loc[i - size, 3] was intended.
            address_tag1 = sent.loc[i - 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' street': looks_like_street_part(seg1),
                key + ' house': looks_like_house_part(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        # First token of the sentence.
        features_str['BOS'] = True

    if i < len(sent) - 1:
        # Features of up to three tokens to the right.
        end1 = min(lengths[-1], len(sent) - i - 1)
        windows_size = dict(zip(["Suffix%s" % j for j in range(1, end1 + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i + size, 0]
            pos_tag1 = sent.loc[i + size, 2]
            # NOTE(review): same asymmetry as above (i + 1 vs i + size).
            address_tag1 = sent.loc[i + 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        # BUG FIX: the last token was marked 'BOS' (copy-paste from the branch
        # above); the end-of-sentence marker is 'EOS'.
        features_str['EOS'] = True
    return features_str
def sent2features(sent):
    """Return the per-token feature dicts of ``sent``, left to right."""
    return [word2features(sent, position) for position in range(len(sent))]
def sent2labels(sent):
    """Return the first character (B/I/E/O, ...) of each label in column 1."""
    return [lab[0] for lab in sent.loc[:, 1]]
def sent2tokens(sent):
    """Return the token strings of ``sent`` (column 0) as a flat list.

    The original wrapped the whole pandas column in a one-element list
    (``[Series]``); flattened here so it parallels sent2labels, which yields
    one entry per token.
    """
    return list(sent.loc[:, 0])
# --- Train and evaluate the CRF ------------------------------------------
y_train = [sent2labels(s) for s in train_Set]
X_train = [sent2features(s) for s in train_Set]
X_test = [sent2features(s) for s in test_Set]
y_test = [sent2labels(s) for s in test_Set]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,                         # L1 regularisation coefficient
    c2=0.1,                         # L2 regularisation coefficient
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Only score the address chunk labels; 'O' (if present) is excluded.
labels = ['B', 'I', 'E']
y_pred = crf.predict(X_test)
# BUG FIX: the weighted F1 was computed and discarded (notebook residue,
# together with a bare no-op `labels` expression); print it instead.
f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print("weighted F1:", f1)

# Sort so B/I/E variants of the same entity group together in the report.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))
from token_features import *
# Example configuration for the tagger: the full tag inventory and the full
# set of token-feature callables exposed by token_features.
EXAMPLE_TAGSET = {'B','I' , 'E','O','ORG','STREET','CITY','POS','STATE','LOCATION','ZIPCODE', 'EMAIL', 'TEL', 'FAX', 'SUBJ', 'FUNC', 'HOURS'}
# Every feature helper, in one list (word2features above uses only a subset).
EXAMPLE_TOKEN_FEATURES = [
    bias,
    pos_tag,
    token_identity,
    is_lower,
    is_digit,
    is_title,
    is_ponct,
    looks_like_year,
    looks_like_month,
    looks_like_email,
    looks_like_street_part,
    looks_like_postcode_part,
    looks_like_house_part,
    looks_like_city_part,
    looks_like_time,
    looks_like_weekday,
]
# Date/time notation tables (Dutch and English spellings).
# NOTE(review): __all__ exports MONTHS/WEEKDAYS, which are not defined in the
# visible part of this file — presumably built from the _NL_* tables below;
# confirm against the rest of the module.
__all__ = ['MONTHS', 'WEEKDAYS']
# One tuple per month, January..December; each tuple lists the accepted
# spellings/abbreviations (English and Dutch forms mixed).
_NL_MONTHS = [
    ("Jan", "January", "Januari", "jan."),
    ("Feb", "February", "Februari", "feb."),
    ("Mar", "Mrt", "Maart", "March", "mrt.", "maa.", "maa"),
    ("Apr", "April", "apr."),
    ("May", "Mei"),
    ("Jun", "June", "Juni", "jun."),
    ("Jul", "July", "Juli", "jul."),
    ("Aug", "August", "Augustus", "aug."),
    ("Sep", "September", "sep."),
    ("Okt", "Oct", "October", "Oktober", "okt."),
    ("Nov", "November", "nov."),
    ("Dec", "December", "dec.")
]
_NL_WEEKDAYS = [
("Mon", "Monday", "Maandag", "ma.", "ma"),
("Tue", "Tuesday", "Dinsdag", "di.", "di"),
("Wed", "Wednesday", "Woensdag", "wo.", "wo"),
("Thu", "Thursday", "Donderdag", "do.", "do"),
("Fri", "Friday", "Vrijdag", 'vr.', "vr"),