Commit 2316fe66 authored by Abdelwahab HEBA
Browse files

Update Tcof Normalization

parent 3185ce05
......@@ -65,6 +65,7 @@ done
# Sort all files
# text
#export LC_ALL=C
cat $trans | sort -k1 > $trans.txt
rm $trans
mv $trans.txt $trans
......@@ -92,7 +93,7 @@ utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt #|| exit 1
cat $spk2utt | sort -k1 > $spk2utt.txt
rm $spk2utt
mv $spk2utt.txt $spk2utt
#export LC_ALL=fr_FR.UTF-8
ntrans=$(wc -l <$trans)
nutt2spk=$(wc -l <$utt2spk)
! [ "$ntrans" -eq "$nutt2spk" ] && \
......
......@@ -50,8 +50,8 @@ def transformation_text(text):
text=re.sub(r'\.',' ',text)
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r"\?|/|\!|<|>","",text)
#<[^\p{L}]|[^\p{L}]>|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r":|\?|/|\!|<|>|#+","",text)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
text=re.sub(r"(\+)", "", text)
......
......@@ -62,7 +62,7 @@ def transformation_text(text):
#text=re.sub(r"{[^{]+}"," ",text.strip())
# Remove ? ! < > : OK
#<[^\p{L}]|[^\p{L}]>|#+|<\p{L}+[ ]|<\p{L}+$
text=re.sub(r"\?|/|\!|<|>","",text)
text=re.sub(r":|\?|/|\!|<|>|#+","",text)
# replace silence character with <sil> : OK
#text=re.sub(r"(\+)", "<sil>", text)
text=re.sub(r"(\+)", "!SIL", text)
......
......@@ -138,7 +138,7 @@ if [ $stage -le 3 ]; then
fi
if [ $stage -le 4 ]; then
(echo '!sil SIL'; echo '<spoken_noise> SPN'; echo '<unk> SPN'; echo '<laugh> LAU'; echo '<noise> NSN') |\
(echo '!sil SIL'; echo '<spoken_noise> SPN'; echo '<UNK> SPN'; echo '<laugh> LAU'; echo '<noise> NSN') |\
cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment