address-extractor, commit 21273398
Authored Dec 21, 2018 by Zied SELLAMI
Parent: 9ee3031f

ADD python CRF learner and predictor

Showing 3 changed files with 208 additions and 0 deletions:
python/crf.py           +103  -0
python/crf_learner.py    +52  -0
python/crf_predictor.py  +53  -0
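Taken together, the three files form a small train-then-extract pipeline: crf.py turns a token table into sklearn_crfsuite features, crf_learner.py fits and saves a CRF model, and crf_predictor.py loads that model and prints the address spans it finds. A rough sketch of how the two scripts are meant to be run; the paths below are placeholders, not part of the commit.

# python crf_learner.py   data/train_dir   crf.joblib   # fit on every file in data/train_dir, save the model
# python crf_predictor.py data/tokens.txt  unused_arg   # tag data/tokens.txt and print the extracted addresses
#
# Note: crf_predictor.py loads the model from ./crf.joblib in the working
# directory; its second command-line argument is not used by the script.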
python/crf.py (new file, mode 100644)
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from feature_extraction import FeatureExtractor
import re
import nltk
from nltk import pos_tag
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from six.moves import zip
from token_features import *
# Token-level feature functions (from token_features) applied to every token.
feature_extractor = FeatureExtractor(
    token_features=[
        token_identity,
        # pos_tag,
        is_title,
        is_lower,
        is_digit,
        is_ponct,
        looks_like_street_part,
        looks_like_house_part,
        looks_like_postcode_part,
        looks_like_city_part,
    ]
)
def word2features(sent, i):
    # sent is a DataFrame: column 0 = token, column 2 = POS tag, column 3 = address tag.
    word = sent.loc[i, 0]
    pos_tag = sent.loc[i, 2]
    tag_address = sent.loc[i, 3]
    seg = [word, pos_tag, tag_address]
    features_str = feature_extractor.transform_single(seg)
    features_str.update({'postag': pos_tag})
    if i > 0:
        # Features of up to three preceding tokens (Prefix1..Prefix3).
        lengths = (1, 2, 3)
        featname = "Prefix"
        end = min(lengths[-1], i)
        windows_size = dict(zip(["%s%s" % (featname, j) for j in range(1, end + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i - size, 0]
            pos_tag1 = sent.loc[i - size, 2]
            address_tag1 = sent.loc[i - 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' street': looks_like_street_part(seg1),
                key + ' house': looks_like_house_part(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        features_str['BOS'] = True  # beginning of sentence
    if i < len(sent) - 1:
        # Features of up to three following tokens (Suffix1..Suffix3).
        lengths = (1, 2, 3)
        end1 = min(lengths[-1], len(sent) - i - 1)
        windows_size = dict(zip(["%s%s" % ("Suffix", j) for j in range(1, end1 + 1)], lengths))
        for key, size in windows_size.items():
            word1 = sent.loc[i + size, 0]
            pos_tag1 = sent.loc[i + size, 2]
            address_tag1 = sent.loc[i + 1, 3]
            seg1 = [word1, pos_tag1, address_tag1]
            features_str.update({
                key + ' word': word1,
                key + ' postag': pos_tag1,
                key + ' lower': is_lower(seg1),
                key + ' title': is_title(seg1),
                key + ' digit': is_digit(seg1),
                key + ' ponct': is_ponct(seg1),
                key + ' street': looks_like_street_part(seg1),
                key + ' house': looks_like_house_part(seg1),
                key + ' code_postal': looks_like_postcode_part(seg1),
                key + ' city_part': looks_like_city_part(seg1),
            })
    else:
        features_str['EOS'] = True  # end of sentence
    return features_str
def sent2features(sent):
    return [word2features(sent, i) for i in range(0, len(sent))]


def sent2labels(sent):
    # Column 1 holds the gold label; keep only its first character.
    label = sent.loc[:, 1]
    l = []
    for lab in label:
        l.append(lab[0])
    return l


def sent2tokens(sent):
    # Column 0 holds the raw tokens.
    label = sent.loc[:, 0]
    l = []
    for lab in label:
        l.append(lab)
    return l
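For context, the helpers above produce exactly what sklearn_crfsuite expects: one dict of named features per token from sent2features, and one label character per token from sent2labels. Below is a minimal sketch of that shape; the three-token DataFrame is invented for illustration and assumes the project's feature_extraction and token_features modules are importable alongside crf.py.

import pandas as pd
from crf import sent2features, sent2labels

# Hypothetical three-token "sentence" in the column layout used above:
# 0 = token, 1 = label, 2 = POS tag, 3 = tuple built from the extra columns.
sent = pd.DataFrame({
    0: ["12", "rue", "Pasteur"],
    1: ["B", "I", "E"],
    2: ["CD", "NN", "NNP"],
    3: [("0", "0", "0", "0")] * 3,
})

X = sent2features(sent)   # list of per-token feature dicts
y = sent2labels(sent)     # ['B', 'I', 'E']

# Each X[i] is a plain dict; X[0], for example, contains 'postag': 'CD',
# 'BOS': True and 'Suffix1 word': 'rue'. sklearn_crfsuite.CRF.fit takes lists
# of such sequences, as in crf.fit([X], [y]).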
python/crf_learner.py (new file, mode 100644)
import sys
# sys.path.insert(0,"/home/rhermassi/CRF_suiteV8/CRFSuite")
sys.path.insert(0, "/data/nlp/CRF")
sys.path.insert(0, "/usr/lib/python3.6/site-packages")
sys.path.insert(0, "/usr/lib/python3.5/site-packages2")
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import nltk
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from crf import *
from joblib import dump, load
import os
import pandas as pd
def learner(a, b):
    # a = sys.argv[1]: directory containing the training files
    # b = sys.argv[2]: path where the trained model (crf.joblib) is written
    FichList = [f for f in os.listdir(a) if os.path.isfile(os.path.join(a, f))]
    a = [pd.read_table(a + "/" + f, sep=r"\s+", names=range(100), engine='python')
         for f in FichList]
    # training_set, testing_set = train_test_split(a, test_size=0.33)
    training_set = [s.astype("str") for s in a]
    training_Set = [t.drop(t.columns[7:100], axis=1) for t in training_set]
    train_set = []
    for training in training_Set:
        # Column 2: POS tag of each token; column 3: tuple built from columns 3-6.
        tag = nltk.pos_tag(training.loc[:, 0])
        l = []
        for t in tag:
            l.append(t[1])
        training[2] = l
        training[3] = training[[3, 4, 5, 6]].apply(tuple, axis=1)
        train_set.append(training)
    train_Set = [tr.drop(tr.columns[4:8], axis=1) for tr in train_set]
    y_train = [sent2labels(s) for s in train_Set]
    X_train = [sent2features(s) for s in train_Set]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)
    dump(crf, b)


learner(sys.argv[1], sys.argv[2])
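crf_learner.py saves the model without measuring it, although train_test_split and sklearn_crfsuite.metrics are already imported. Below is a hedged sketch of how a hold-out evaluation could be bolted onto the X_train and y_train lists built inside learner(); the split and the report call are assumptions, not part of the commit.

import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import metrics

# Hypothetical hold-out evaluation; X_train and y_train are the lists built in learner().
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.33, random_state=0)
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(X_tr, y_tr)
y_hat = crf.predict(X_te)
# Per-label precision, recall and F1 over the address tags used by the predictor.
print(metrics.flat_classification_report(y_te, y_hat, labels=['B', 'I', 'E']))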
python/crf_predictor.py (new file, mode 100644)
import sys
# sys.path.insert(0,"/home/rhermassi/CRF_suiteV8/CRFSuite")
sys.path.insert(0, "/data/nlp/CRF")
sys.path.insert(0, "/usr/lib/python3.6/site-packages")
sys.path.insert(0, "/usr/lib/python3.5/site-packages2")
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import nltk
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from crf import *
from joblib import dump, load
import os
import pandas as pd
def predictor(a, b):
    # a = sys.argv[1]: file of tokens to tag
    # b = sys.argv[2]: currently unused; the model is loaded from ./crf.joblib
    # FichList = [f for f in os.listdir(a) if os.path.isfile(os.path.join(a, f))]
    a = pd.read_table(a, sep=r"\s+", names=range(100), engine='python')
    predict = a.astype("str")
    predict = predict.drop(predict.columns[7:100], axis=1)
    tag_pred = nltk.pos_tag(predict.loc[:, 0])
    l = []
    for t_pred in tag_pred:
        l.append(t_pred[1])
    predict[2] = l
    predict[3] = predict[[3, 4, 5, 6]].apply(tuple, axis=1)
    predict = predict.drop(predict.columns[4:8], axis=1)
    X_test = sent2features(predict)
    T_test = sent2tokens(predict)
    labels = ['B', 'I', 'E']
    crf = load('crf.joblib')
    # X_test is a single token sequence, so wrap it in a list for predict().
    y_pred = crf.predict([X_test])[0]
    addr = []
    j = 0
    while j < len(y_pred):
        found = 0
        address = ""
        # Collect consecutive tokens tagged B/I/E into one address string.
        while j < len(y_pred) and y_pred[j] in labels:
            found = 1
            address = address + " " + str(T_test[j])
            j = j + 1
        if found == 1:
            addr.append(address)
        j = j + 1
    return addr


addr = predictor(sys.argv[1], sys.argv[2])
print("%s" % addr)
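The script prints the raw list returned by predictor: one string per run of consecutive tokens tagged B, I or E, each starting with the space added by the concatenation loop. A small usage sketch follows; the file name and the example address are invented.

# Hypothetical call; the second argument is ignored and the model is read
# from ./crf.joblib in the current working directory.
addresses = predictor("tokens.txt", "unused")
for address in addresses:
    # Strip the leading space introduced by `address + " " + token`.
    print(address.strip())
# Depending on the trained model, this might print, for example:
#   12 rue Pasteur 75015 Paris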