Commit 393e7413 authored by Rhadia STG_HARMASSI's avatar Rhadia STG_HARMASSI

predictor learner

parent 9ee3031f
import sys
#sys.path.insert(0,"/home/rhermassi/CRF_suiteV8/CRFSuite")
sys.path.insert(0,"/data/nlp/CRF")
sys.path.insert(0,"/usr/lib/python3.6/site-packages")
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from feature_extraction import FeatureExtractor
import re
import eli5
import nltk
from nltk import pos_tag
import scipy.stats
from six.moves import zip
from token_features import *
import numpy as np
import os
# --- Load and preprocess the training data (directory given as argv[1]) ---
# Each file is a whitespace-separated token table; rows are padded to 100
# columns on read, then only columns 0-6 are kept.
FichList = [f for f in os.listdir(sys.argv[1]) if os.path.isfile(os.path.join(sys.argv[1], f))]
a = [pd.read_table(sys.argv[1] + "/" + f, sep=r"\s+", names=range(100), engine='python')
     for f in FichList]
training_set = [s.astype("str") for s in a]
training_Set = [t.drop(t.columns[7:100], axis=1) for t in training_set]
train_set = []
for training in training_Set:
    # Column 2 <- NLTK POS tag of the token in column 0.
    tag = nltk.pos_tag(training.loc[:, 0])
    l = []
    for t in tag:
        l.append(t[1])
    training[2] = l
    # Column 3 <- tuple of the four annotation columns 3..6.
    training[3] = training[[3, 4, 5, 6]].apply(tuple, axis=1)
    train_set.append(training)
train_Set = [tr.drop(tr.columns[4:8], axis=1) for tr in train_set]

# --- Same preprocessing for the test data (directory given as argv[2]) ---
FichList = [f for f in os.listdir(sys.argv[2]) if os.path.isfile(os.path.join(sys.argv[2], f))]
a = [pd.read_table(sys.argv[2] + "/" + f, sep=r"\s+", names=range(100), engine='python')
     for f in FichList]
testing_set = [s.astype("str") for s in a]
testing_Set = [tes.drop(tes.columns[7:100], axis=1) for tes in testing_set]
test_set = []
# BUGFIX: Python 2 print statement ('print len(...)') is a SyntaxError on
# Python 3, which this file targets (python3.6 site-packages on sys.path).
print(len(test_set))
for test in testing_Set:
    tag_test = nltk.pos_tag(test.loc[:, 0])
    l = []
    for t_test in tag_test:
        l.append(t_test[1])
    test[2] = l
    test[3] = test[[3, 4, 5, 6]].apply(tuple, axis=1)
    test_set.append(test)
test_Set = [ts.drop(ts.columns[4:8], axis=1) for ts in test_set]
# Token-level feature extractor shared by word2features below.
# Each entry is a callable from token_features that receives a
# [word, pos_tag, address_tag] segment and returns one feature value.
# NOTE(review): postcode/city features are commented out here but enabled
# in the second copy of this configuration later in the file — confirm
# which set is intended.
feature_extractor = FeatureExtractor(
token_features = [
token_identity,
#pos_tag,
is_title,
is_lower,
is_digit,
is_ponct,
looks_like_street_part,
looks_like_house_part,
# looks_like_postcode_part,
# looks_like_city_part,
]
)
def word2features(sent, i):
    """Build the CRF feature dict for token ``i`` of sentence DataFrame ``sent``.

    ``sent`` columns: 0 = token, 2 = POS tag, 3 = address-tag tuple.
    Adds window features for up to 3 preceding tokens (``Prefix*``) and up
    to 3 following tokens (``Suffix*``), plus ``BOS``/``EOS`` markers at the
    sentence boundaries.
    """
    word = sent.loc[i, 0]
    pos_tag = sent.loc[i, 2]
    tag_address = sent.loc[i, 3]
    seg = [word, pos_tag, tag_address]
    features_str = feature_extractor.transform_single(seg)
    features_str.update({'postag': pos_tag})
    if i > 0:
        lengths = (1, 2, 3)
        featname = "Prefix"
        # Clamp the window so it never reaches before the first token.
        end = min(lengths[-1], i)
        windows_size = dict(zip(["%s%s" % (featname, j) for j in range(1, end + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i - size, 0]
            pos_tag1 = sent.loc[i - size, 2]
            # NOTE(review): uses i-1 (not i-size) for the address tag, so every
            # prefix feature sees the immediately-previous tag — confirm intended.
            address_tag1 = sent.loc[i - 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' street': looks_like_street_part(seg1),
                key + ' house': looks_like_house_part(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        features_str['BOS'] = True
    if i < len(sent) - 1:
        lengths = (1, 2, 3)
        end1 = min(lengths[-1], len(sent) - i - 1)
        windows_size = dict(zip(["%s%s" % ("Suffix", j) for j in range(1, end1 + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i + size, 0]
            pos_tag1 = sent.loc[i + size, 2]
            address_tag1 = sent.loc[i + 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        # BUGFIX: the last token was marked 'BOS'; it is the End-Of-Sentence
        # boundary, which the CRF expects as a distinct 'EOS' feature.
        features_str['EOS'] = True
    return features_str
def sent2features(sent):
    """Return the feature dict for every token position in ``sent``."""
    return [word2features(sent, idx) for idx in range(len(sent))]
def sent2labels(sent):
    """Return the first character of each label in column 1 (e.g. 'B-x' -> 'B')."""
    return [entry[0] for entry in sent.loc[:, 1]]
def sent2tokens(sent):
    """Return the token column (column 0) wrapped in a one-element list."""
    tokens = sent.loc[:, 0]
    return [tokens]
# --- Extract features and labels for training and test sets ---
y_train = [sent2labels(s) for s in train_Set]
X_train = [sent2features(s) for s in train_Set]
X_test = [sent2features(s) for s in test_Set]
y_test = [sent2labels(s) for s in test_Set]

# --- Train an L-BFGS CRF with L1 (c1) and L2 (c2) regularization ---
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Address chunk labels: Begin / Inside / End.
labels = ['B', 'I', 'E']

# --- Evaluate on the held-out test set ---
y_pred = crf.predict(X_test)
# BUGFIX: the weighted f1 score was computed and silently discarded
# (a leftover notebook cell); print it so the run reports it.
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from feature_extraction import FeatureExtractor
import re
import nltk
from nltk import pos_tag
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from six.moves import zip
from token_features import *
# Token-level feature extractor for the crf module; each entry is a callable
# from token_features that receives a [word, pos_tag, address_tag] segment.
# Unlike the first copy of this configuration earlier in the file, the
# postcode and city features are enabled here.
feature_extractor = FeatureExtractor(
token_features = [
token_identity,
#pos_tag,
is_title,
is_lower,
is_digit,
is_ponct,
looks_like_street_part,
looks_like_house_part,
looks_like_postcode_part,
looks_like_city_part,
]
)
def word2features(sent, i):
    """Build the CRF feature dict for token ``i`` of sentence DataFrame ``sent``.

    ``sent`` columns: 0 = token, 2 = POS tag, 3 = address-tag tuple.
    Adds window features for up to 3 preceding tokens (``Prefix*``) and up
    to 3 following tokens (``Suffix*``), plus ``BOS``/``EOS`` markers at the
    sentence boundaries.
    """
    word = sent.loc[i, 0]
    pos_tag = sent.loc[i, 2]
    tag_address = sent.loc[i, 3]
    seg = [word, pos_tag, tag_address]
    features_str = feature_extractor.transform_single(seg)
    features_str.update({'postag': pos_tag})
    if i > 0:
        lengths = (1, 2, 3)
        featname = "Prefix"
        # Clamp the window so it never reaches before the first token.
        end = min(lengths[-1], i)
        windows_size = dict(zip(["%s%s" % (featname, j) for j in range(1, end + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i - size, 0]
            pos_tag1 = sent.loc[i - size, 2]
            # NOTE(review): uses i-1 (not i-size) for the address tag, so every
            # prefix feature sees the immediately-previous tag — confirm intended.
            address_tag1 = sent.loc[i - 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' street': looks_like_street_part(seg1),
                key + ' house': looks_like_house_part(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        features_str['BOS'] = True
    if i < len(sent) - 1:
        lengths = (1, 2, 3)
        end1 = min(lengths[-1], len(sent) - i - 1)
        windows_size = dict(zip(["%s%s" % ("Suffix", j) for j in range(1, end1 + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i + size, 0]
            pos_tag1 = sent.loc[i + size, 2]
            address_tag1 = sent.loc[i + 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' street': looks_like_street_part(seg1),
                key + ' house': looks_like_house_part(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        # BUGFIX: the last token was marked 'BOS'; it is the End-Of-Sentence
        # boundary, which the CRF expects as a distinct 'EOS' feature.
        features_str['EOS'] = True
    return features_str
def sent2features(sent):
    """Return the feature dict for every token position in ``sent``."""
    return [word2features(sent, idx) for idx in range(len(sent))]
def sent2labels(sent):
    """Return the first character of each label in column 1 (e.g. 'B-x' -> 'B')."""
    return [entry[0] for entry in sent.loc[:, 1]]
def sent2tokens(sent):
    """Return the tokens (column 0) of ``sent`` as a plain list.

    Idiom fix: the manual loop-and-append over the Series is replaced by
    ``list(...)``, which produces the identical list.
    """
    return list(sent.loc[:, 0])
import sys
#sys.path.insert(0,"/home/rhermassi/CRF_suiteV8/CRFSuite")
sys.path.insert(0,"/data/nlp/CRF")
sys.path.insert(0,"/usr/lib/python3.6/site-packages")
sys.path.insert(0,"/usr/lib/python3.5/site-packages2")
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import sys
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from crf import*
from joblib import dump, load
import os
import pandas as pd
def learner(a, b):
    """Train the address-chunking CRF and serialize it.

    a: directory containing whitespace-separated training files
       (one token per row; columns 0 = token, 1 = label, 3..6 = annotations).
    b: output path for the joblib-serialized model.
    """
    FichList = [f for f in os.listdir(a) if os.path.isfile(os.path.join(a, f))]
    # Pad every row to 100 columns on read, then keep only columns 0-6.
    # BUGFIX: the original rebound parameter `a` to this list, shadowing the
    # directory path; a distinct name keeps both meanings readable.
    frames = [pd.read_table(os.path.join(a, f), sep=r"\s+", names=range(100), engine='python')
              for f in FichList]
    training_set = [s.astype("str") for s in frames]
    training_Set = [t.drop(t.columns[7:100], axis=1) for t in training_set]
    train_set = []
    for training in training_Set:
        # Column 2 <- NLTK POS tag of the token in column 0.
        tag = nltk.pos_tag(training.loc[:, 0])
        l = []
        for t in tag:
            l.append(t[1])
        training[2] = l
        # Column 3 <- tuple of the four annotation columns 3..6.
        training[3] = training[[3, 4, 5, 6]].apply(tuple, axis=1)
        train_set.append(training)
    train_Set = [tr.drop(tr.columns[4:8], axis=1) for tr in train_set]
    y_train = [sent2labels(s) for s in train_Set]
    X_train = [sent2features(s) for s in train_Set]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)
    dump(crf, b)
learner(sys.argv[1],sys.argv[2])
import sys
#sys.path.insert(0,"/home/rhermassi/CRF_suiteV8/CRFSuite")
sys.path.insert(0,"/data/nlp/CRF")
sys.path.insert(0,"/usr/lib/python3.6/site-packages")
sys.path.insert(0,"/usr/lib/python3.5/site-packages2")
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import sys
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from crf import*
from joblib import dump, load
import os
import pandas as pd
def predictor(a, b):
    """Extract address strings from the token table ``a`` using the CRF model ``b``.

    a: path to a whitespace-separated token table (one token per row).
    b: path to the joblib-serialized CRF model (the file `learner` dumps).
    Returns the list of detected address strings (one per B/I/E run).
    """
    predict = pd.read_table(a, sep=r"\s+", names=range(100), engine='python')
    predict = predict.astype("str")
    predict = predict.drop(predict.columns[7:100], axis=1)
    # Column 2 <- NLTK POS tag of the token in column 0.
    tag_pred = nltk.pos_tag(predict.loc[:, 0])
    l = []
    for t_pred in tag_pred:
        l.append(t_pred[1])
    predict[2] = l
    predict[3] = predict[[3, 4, 5, 6]].apply(tuple, axis=1)
    # BUGFIX: DataFrame.drop returns a NEW frame; the original discarded the
    # result, so columns 4..7 were never dropped. (Harmless for the feature
    # functions, which read only columns 0/2/3, but the intent was to drop.)
    predict = predict.drop(predict.columns[4:8], axis=1)
    X_test = sent2features(predict)
    T_test = sent2tokens(predict)
    # BUGFIX: the model-path parameter `b` was silently ignored in favour of a
    # hard-coded 'crf.joblib'; `learner` dumps to `b`, so load from `b`.
    crf = load(b)
    # NOTE(review): X_test is a single sequence (list of dicts); CRF.predict
    # expects a list of sequences — confirm whether [X_test] was intended.
    y_pred = crf.predict(X_test)
    # Collect each maximal run of B/I/E-tagged tokens as one address string.
    addr = []
    j = 0
    while j < len(y_pred):
        found = 0
        address = ""
        while j < len(y_pred) and (y_pred[j] in ['B', 'I', 'E']):
            found = 1
            address = address + " " + str(T_test[j])
            j = j + 1
        if found == 1:
            addr.append(address)
        j = j + 1
    return addr
if __name__ == "__main__":
    # Guard the CLI entry point so importing this module has no side effects:
    # argv[1] = input token table, argv[2] = model path.
    addr = predictor(sys.argv[1], sys.argv[2])
    print("%s" % addr)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment