Commit 3ad59a3d authored by Abdelwahab HEBA
Browse files

Normalize Text

parent 1748ca3f
...@@ -27,7 +27,7 @@ for b in $(cat $in_list); do ...@@ -27,7 +27,7 @@ for b in $(cat $in_list); do
echo "Start processing $id at $(date '+%T %F')" echo "Start processing $id at $(date '+%T %F')"
in_file=$b/$id.trs in_file=$b/$id.trs
[[ -f "$in_file" ]] || { echo "WARNING: $in_file does not exists"; continue; } [[ -f "$in_file" ]] || { echo "WARNING: $in_file does not exists"; continue; }
$PYTHON local/lm/parseText.py $in_file |\ python3 local/lm/parseText.py $in_file |\
$PYTHON local/lm/pre_filter.py /dev/stdin $out_root/corpus_train.txt $PYTHON local/lm/pre_filter.py /dev/stdin $out_root/corpus_train.txt
processed=$((processed + 1)) processed=$((processed + 1))
echo "Processing of $id has finished at $(date '+%T %F') [$processed texts ready so far]" echo "Processing of $id has finished at $(date '+%T %F') [$processed texts ready so far]"
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: latin-1 -*- # -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
from unicodedata import normalize from unicodedata import normalize
from sys import argv from sys import argv
from num2words import num2words from num2words import num2words
from unidecode import unidecode
import re import re
import os.path import os.path
import sys
# ( in text # ( in text
# ) in text # ) in text
def transformation_text(text): def transformation_text(text):
bool=True bool=True
if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \ if "###" in text or len(re.findall(r"\[.+\]", text)) > 0 or \
...@@ -22,7 +22,7 @@ def transformation_text(text): ...@@ -22,7 +22,7 @@ def transformation_text(text):
else: else:
# 4x4 # 4x4
# Remove noise sound (BIP) over Name of places and person # Remove noise sound (BIP) over Name of places and person
#text = re.sub(r"[^ ]+|[^ ]+|", "", text.strip()) #text = re.sub(r"¤[^ ]+|[^ ]+¤|¤", "", text.strip())
if len(re.findall(r"\dx\d",text))>0: if len(re.findall(r"\dx\d",text))>0:
text=re.sub(r"x"," ",text) text=re.sub(r"x"," ",text)
if len(re.findall("\d+h\d+",text))>0: if len(re.findall("\d+h\d+",text))>0:
...@@ -50,12 +50,12 @@ def transformation_text(text): ...@@ -50,12 +50,12 @@ def transformation_text(text):
text=re.sub(r'\.',' ',text) text=re.sub(r'\.',' ',text)
#text=re.sub(r"{[^{]+}"," ",text.strip()) #text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK # Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|<\p{L}+[ ]|<\p{L}+$ #<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r":|\?|/|\!|<|>|#+","",text) text=re.sub(r":|\?|/|\!|<|>|#+","",text)
# replace silence character with <sil> : OK # replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text) #text=re.sub(r"(\+)", "<sil>", text)
text=re.sub(r"(\+)", "", text) text=re.sub(r"(\+)", "!SIL", text)
text=re.sub(r"(///)", "", text) text=re.sub(r"(///)", "!SIL", text)
#text=re.sub(r"(///)", "<long-sil>", text) #text=re.sub(r"(///)", "<long-sil>", text)
if len(re.findall(r"/.+/", text)) > 0: if len(re.findall(r"/.+/", text)) > 0:
#print "AVANT***********"+text #print "AVANT***********"+text
...@@ -68,8 +68,8 @@ def transformation_text(text): ...@@ -68,8 +68,8 @@ def transformation_text(text):
choosen_word = choosen_word.replace('/', '') choosen_word = choosen_word.replace('/', '')
text = text.replace(unchoosen_text, choosen_word) text = text.replace(unchoosen_text, choosen_word)
#print "Apres************"+text #print "Apres************"+text
# Remove noise sound (BIP) over Name of places and person # Remove noise sound (BIP) over Name of places and person
text=re.sub(r"(.+)",'',text) text=re.sub(r"(¤.+¤)",'',text)
# replace unkown syllable # replace unkown syllable
text=re.sub(r"\*+","",text) text=re.sub(r"\*+","",text)
# cut of recording : OK # cut of recording : OK
...@@ -80,7 +80,6 @@ def transformation_text(text): ...@@ -80,7 +80,6 @@ def transformation_text(text):
text = re.sub(r"[ ]\'", " ", text) text = re.sub(r"[ ]\'", " ", text)
text = re.sub(r"\'", "\' ", text) text = re.sub(r"\'", "\' ", text)
# convert number if exist : OK # convert number if exist : OK
num_list = re.findall(" \d+| \d+$", text) num_list = re.findall(" \d+| \d+$", text)
if len(num_list) > 0: if len(num_list) > 0:
#print text #print text
...@@ -96,21 +95,19 @@ def transformation_text(text): ...@@ -96,21 +95,19 @@ def transformation_text(text):
# change bounding | to < and > : OK # change bounding | to < and > : OK
#balise=set(re.findall(r"\|\w+_?\w+\|",text)) #balise=set(re.findall(r"\|\w+_?\w+\|",text))
#if len(balise)>0: #if len(balise)>0:
# print(balise) #print(balise)
# for b in balise: # for b in balise:
# new_balise='<'+b[1:len(b)-1]+'>' # new_balise='<'+b[1:len(b)-1]+'>'
# text=text.replace(b,new_balise) # text=text.replace(b,new_balise)
# print(text) #print(text)
# c'est l'essaim .... # c'est l'essaim ....
text=text.lower() text=text.lower()
return bool,text return bool,text
if __name__=="__main__": if __name__=="__main__":
# Inputs # Inputs
file_trs=argv[1] file_trs=argv[1]
basename=os.path.basename(file_trs.split('.')[0]) #print(file_trs)
# MetaData File #print file_trs
file_meta = file_trs.split('.')[0] + '.xml'
#print file_trs.split('.')[0]
# Read Trans File # Read Trans File
tree_trs = ET.parse(file_trs) tree_trs = ET.parse(file_trs)
trsdoc= tree_trs.getroot() trsdoc= tree_trs.getroot()
...@@ -132,7 +129,7 @@ if __name__=="__main__": ...@@ -132,7 +129,7 @@ if __name__=="__main__":
# File text # File text
# File speaker_gender # File speaker_gender
if bool and text!="": if bool and text!="":
print text print(text)
#for spk_tuple in speaker_gender: #for spk_tuple in speaker_gender:
# if spk_tuple[0]==spkr: # if spk_tuple[0]==spkr:
# print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1]) # print >> spk2gender,'%s %s' % (seg_id, spk_tuple[1])
...@@ -144,24 +141,16 @@ if __name__=="__main__": ...@@ -144,24 +141,16 @@ if __name__=="__main__":
if count>0: if count>0:
bool, text = transformation_text(text) bool, text = transformation_text(text)
if bool and text!="": if bool and text!="":
print text print(text)
text=Element.tail.replace('\n', '') text=Element.tail.replace('\n', '')
count=count+1 count=count+1
elif Element.tag=="Comment" and has_attrib_speaker and not Element.tail is None: elif Element.tag=="Comment" and has_attrib_speaker and not Element.tail is None:
text=text+" "+Element.tail.replace('\n', '') text=text+" "+Element.tail.replace('\n', '')
elif Element.tag=="Event" and has_attrib_speaker and not Element.tail is None : elif Element.tag=="Event" and has_attrib_speaker and not Element.tail is None :
if Element.get('type')=='noise': text=text+" "+Element.tail.replace('\n', '')
if Element.get('desc')=='rire':
text=text+" "+Element.tail.replace('\n', '')
else:
text=text+" "+Element.tail.replace('\n', '')
elif Element.get('type')=='pronounce':
text=text+" "+Element.tail.replace('\n', '')
else:
text=text+" "+Element.tail.replace('\n', '')
elif Element.tag=="Who" and has_attrib_speaker and not Element.tail is None: elif Element.tag=="Who" and has_attrib_speaker and not Element.tail is None:
text=text+" "+Element.tail.replace('\n', '') text=text+" "+Element.tail.replace('\n', '')
if count > 0 and has_attrib_speaker and not Element.tail is None: if count > 0 and has_attrib_speaker and not Element.tail is None:
bool, text = transformation_text(text) bool, text = transformation_text(text)
if bool and text != "": if bool and text != "":
print text print(text)
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment