Commit f02e73c9 authored by Yoann HOUPERT's avatar Yoann HOUPERT

Merge branch 'linstt-offline-dispatch-dockerisation'

parents d2e80fb3 bd05d6f7
FROM debian:9
# MAINTAINER is deprecated (hadolint DL4000); use a label instead.
LABEL maintainer="Yoann Houpert <yhoupert@linagora.com>"

# Install all our dependencies and set some required build changes
RUN apt-get update && apt-get install -y \
    autoconf \
    automake \
    bzip2 \
    default-jre \
    g++ \
    git \
    gzip \
    libatlas3-base \
    libtool-bin \
    make \
    python2.7 \
    python3 \
    python-pip \
    sox \
    subversion \
    wget \
    zlib1g-dev && \
    apt-get clean autoclean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/* && \
    ln -s /usr/bin/python2.7 /usr/bin/python ; ln -s -f bash /bin/sh

# Speaker diarization
RUN cd /opt && wget http://www-lium.univ-lemans.fr/diarization/lib/exe/fetch.php/lium_spkdiarization-8.4.1.jar.gz && \
    gzip -d lium_spkdiarization-8.4.1.jar.gz

# Build kaldi
RUN git clone https://ci.linagora.com/aheba/kaldi_2015 /opt/kaldi && \
    cd /opt/kaldi/tools && \
    make && \
    cd /opt/kaldi/src && ./configure --shared && make depend && make

ENV BASE_DIR /opt/speech-to-text
RUN mkdir -p $BASE_DIR
WORKDIR $BASE_DIR

# Install Flask and the other Python dependencies first so this layer is
# cached when only the application source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Deploy our offline server
COPY . .
RUN ./deploy-offline-decoding.sh /opt/kaldi /opt/lium_spkdiarization-8.4.1.jar /opt/models
COPY modules/server/server.cfg .
RUN pip install --no-cache-dir -r modules/server/requirements.txt

# Documented ports: 8888 for the dispatch server, 5000 for the legacy Flask service.
EXPOSE 8888
EXPOSE 5000

# Set the default command (exec form so the process is PID 1 and receives SIGTERM)
CMD ["./modules/server/master_server.py"]
Copyright (c) 2014, alumae
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
#! /usr/bin/python
# -*- coding:utf-8 -*-
"""Minimal Flask web service: accepts a wav upload, runs the offline decoder
and returns the transcription as JSON."""
from StringIO import StringIO
from flask import Flask, request
from flask import send_file
from flask import jsonify
import linecache
import os
import subprocess
import wave

app = Flask(__name__)
app.debug = True
app.secret_key = "Abdel"


@app.route('/upload', methods=['GET', 'POST'])
def upload():
    """POST: save the uploaded 'wavFile', decode it and return the transcript.

    GET: return a small HTML upload form.
    """
    if request.method == 'POST':
        fichier = request.files['wavFile']
        # basename() strips any directory components from the client-supplied
        # name, preventing path traversal outside ./wavs/.
        nom_fichier = os.path.basename(fichier.filename)
        fichier.save('./wavs/' + nom_fichier)
        # SECURITY FIX: list-form argv without shell=True — the original
        # interpolated the upload filename into a shell command line, which
        # allowed shell injection. cwd replaces the former "cd scripts;".
        subprocess.call(['./decode.sh', '../systems/models', nom_fichier],
                        cwd='scripts')
        data = {}
        # Concatenate the stripped lines of the decoder log as the transcript.
        with open('trans/decode_' + nom_fichier.split('.')[0] + '.log', "r") as fp:
            data['transcript'] = ''.join(line.strip() for line in fp)
        return jsonify(data)
    return '<form action="" method="post" enctype="multipart/form-data"><input type="file" name="wavFile"/><input type="submit" value="Envoyer" /></form>'


if __name__ == '__main__':
    # NB_PROCESS switches between a forking server and a threaded one.
    if "NB_PROCESS" in os.environ:
        app.run(host='0.0.0.0', processes=int(os.environ['NB_PROCESS']))
    else:
        app.run(host='0.0.0.0', threaded=True)
# linstt-offline-dispatch
This project aims to build a speech-to-text transcriber web service based on kaldi-offline-decoding.
## Getting Started
These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.
The project is divided into 3 modules:
- [worker_offline] is the module in charge of the ASR (automatic speech recognition).
- [master_server] is the webserver that provide the ASR service.
- [client] is a simple client meant to transcribe an audio file.
### Prerequisites
#### Python 2.7
This project runs on python 2.7.
In order to run the [master_server] and the [client] you will need to install those python libraries:
- tornado>=4.5.2
- ws4py
```
pip install ws4py
pip install tornado
```
Or
```
pip install -r requirements.txt
```
within the modules/server folder.
#### Kaldi model
The ASR server that will be set up here requires a kaldi model; note that the model is not included in the repository.
You must have this model on your machine. You must also check that the model has the specific files below:
- final.alimdl
- final.mat
- final.mdl
- splice_opts
- tree
- Graph/HCLG.fst
- Graph/disambig_tid.int
- Graph/num_pdfs
- Graph/phones.txt
- Graph/words.txt
- Graph/phones/*
#### Docker
You must install docker on your machine. Refer to [docker doc](https://docs.docker.com/engine/installation)
```
apt-get install docker
yaourt -S docker
```
### Installing
You need to build the docker image first.
Go to modules/worker_offline and build the container.
```
cd modules/worker_offline
docker build -t linagora/stt-offline .
```
## Running the tests
To run an automated test go to the test folder
```
cd tests
```
And run the test script:
```
./deployement_test.sh <langageModelPath>
```
The test should display "Test successful"
## Deployment
#### 1- Server
* Configure the server options by editing the server.conf file.
* Launch the server
```
./master_server.py
```
#### 2- Worker
You can launch as many workers as you want on any machine that you want.
* Configure the worker by editing the server.conf file, providing the server IP address and server port.
* Launch the worker using the start_docker.sh command
```
cd modules/worker_offline
./start_docker.sh <langageModelPath>
```
For example if your model is located at ~/speech/models/mymodel
With mymodel folder containing the following files:
- final.alimdl
- final.mat
- final.mdl
- splice_opts
- tree
- graphs/
```
cd modules/worker_offline
./start_docker.sh ~/speech/models/mymodel/
```
## Built With
* [tornado](http://www.tornadoweb.org/en/stable/index.html) - The web framework used
* [ws4py](https://ws4py.readthedocs.io/en/latest/) - WebSocket interfaces for python
## Authors
* **Abdelwahab Aheba** - *linstt-Offline-Decoding* - [Linagora](https://linagora.com/)
* **Rudy Baraglia** - *linstt-dispatch* - [Linagora](https://linagora.com/)
## License
See the [LICENSE.md](LICENSE.md) file for details.
## Acknowledgments
* The project has been vastly inspired by [Alumae](https://github.com/alumae)'s project [kaldi-gstreamer-server](https://github.com/alumae/kaldi-gstreamer-server) and use chunk of his code.
Speech-to-Text Offline Decoding
--------
This project aims to build an automatic process for speech recognition from audio file (offline mode) using:
- Speaker Diarization: for speech activity detection, speech segmentation and speaker identification
- Fmllr decoding: for using speaker information to adapt acoustical model
DockerFile for LinSTT Service
--------
Dockerfile for [Offline-LinSTT](https://ci.linagora.com/aheba/offline-decoding).
This dockerfile automatically builds offline Speech-to-Text server using [Kaldi](kaldi-asr.org/doc/about.html)
Using this project, you will be able to run an offline Automatic Speech Recognition (ASR) server in a few minutes.
Attention
--------
The ASR server that will be set up here requires a kaldi model. In the docker image detailed below, there is no kaldi model included.
You must have this model on your machine. You must also check that the model has the specific files below:
- final.alimdl
- final.mat
- final.mdl
- splice_opts
- tree
- Graph/HCLG.fst
- Graph/disambig_tid.int
- Graph/num_pdfs
- Graph/phones.txt
- Graph/words.txt
- Graph/phones/*
Install docker
---------
Please, refer to [docker doc](https://docs.docker.com/engine/installation).
Get the image
---------
Currently, the docker image is about 4GB and based on debian8; it has not yet been pushed to DockerHub.
You need to build your own image:
```
docker build -t linagora/stt-offline .
```
How to use
----------
`start_docker.sh` allow to build and create the container assuming that your kaldi model is located at `<Path_model>`
```
./start_docker.sh <Path_model> <Port>
```
The `<Port>` param publish a container's port to the host, you should use POST method to send wav file to the server for transcription.
Run Example
----------
Simple call using curl:
```
curl -F "wav_file=@<wav_path>" http://<IP:PORT_service>/upload > <output_trans>
```
The attribute `wav_file` is needed to submit the wav file to the server using the POST method
Client script is available and allow to connect to the server located at `http://localhost:<Port>/upload`
```
./client/client <wav_path> <IP_server>:<PORT> <Output>
```
\ No newline at end of file
# Run example
# Args: $1=<Path_wav> $2=<Ip:port_LinSTT_service> $3=<Output_dir>
# BUG FIX: all expansions are quoted; the original broke on paths containing
# spaces or shell metacharacters (unquoted $wav, $IP_service and $3).
wav="$1"
IP_service="$2"
output="$3"
curl -F "wav_file=@$wav" "http://$IP_service/upload" > "$output"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 4 11:10:18 2018
@author: rbaraglia

Command-line client for linstt-dispatch: POSTs a wav file to the server and
prints the returned transcript.
"""
import requests
import json
import logging
import argparse

SERVER_IP = u"localhost"
SERVER_PORT = u"8888"
SERVER_TARGET = u"/client/post/speech"


def main():
    """Parse arguments, send the wav file and print the transcript."""
    parser = argparse.ArgumentParser(description='Client for linstt-dispatch')
    parser.add_argument('-u', '--uri', default="http://"+SERVER_IP+":"+SERVER_PORT+SERVER_TARGET, dest="uri", help="Server adress")
    parser.add_argument('audioFile', help="The .wav file to be transcripted" )
    args = parser.parse_args()
    with open(args.audioFile, 'rb') as f:
        print("Sendind request to transcribe file %s to server at %s" % (args.audioFile, "http://"+SERVER_IP+":"+SERVER_PORT+SERVER_TARGET))
        r = requests.post(args.uri, files={'wavFile': f})
        # BUG FIX: the original was missing the closing parenthesis on this
        # print() call, which made the whole script a SyntaxError.
        print(r.json()['transcript'])


if __name__ == '__main__':
    main()
ws4py
configparser
\ No newline at end of file
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 3 16:53:16 2018
@author: rbaraglia
"""
import os
import json
import functools
import threading
import uuid
import logging
import configparser
import tornado.ioloop
import tornado.web
import tornado.websocket
from tornado import gen
from tornado.locks import Condition
#LOADING CONFIGURATION
# Runtime settings are read from server.cfg (sections [server_params] and
# [machine_params]); see the sample config shipped with the server.
server_settings = configparser.ConfigParser()
server_settings.read('server.cfg')
SERVER_PORT = server_settings.get('server_params', 'listening_port')
TEMP_FILE_PATH = server_settings.get('machine_params', 'temp_file_location')  # where uploaded wavs are written
# Boolean flags are stored as the literal strings 'true'/'false' in the cfg;
# anything else is treated as false / INFO respectively.
KEEP_TEMP_FILE = True if server_settings.get('server_params', 'keep_temp_files') == 'true' else False
LOGGING_LEVEL = logging.DEBUG if server_settings.get('server_params', 'debug') == 'true' else logging.INFO
# An OFFLINE_PORT environment variable (e.g. injected by Docker) overrides the
# configured listening port.
if "OFFLINE_PORT" in os.environ:
    SERVER_PORT = os.environ['OFFLINE_PORT']
class Application(tornado.web.Application):
    """Tornado application wiring client uploads to transcription workers.

    Keeps the pools of available workers and waiting clients as well as a few
    counters used for status logging.
    """
    def __init__(self):
        settings = dict(
            cookie_secret="43oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            template_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), "templates"),
            static_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), "static"),
            xsrf_cookies=False,
            autoescape=None,
        )
        handlers = [
            (r"/", MainHandler),
            (r"/client/post/speech", DecodeRequestHandler),
            (r"/upload", DecodeRequestHandler),
            (r"/worker/ws/speech", WorkerWebSocketHandler)
        ]
        tornado.web.Application.__init__(self, handlers, **settings)
        self.connected_worker = 0        # number of worker websockets currently open
        self.available_workers = set()   # workers ready to take a job
        self.waiting_client = set()      # DecodeRequestHandlers waiting for a worker
        self.num_requests_processed = 0

    #TODO: Abort request when the client is waiting for a determined amount of time
    def check_waiting_clients(self):
        """Wake one waiting client, if any; called whenever a worker frees up."""
        if len(self.waiting_client) > 0:
            try:
                client = self.waiting_client.pop()
            except KeyError:
                # BUG FIX: only swallow the empty-set race (pop on an empty
                # set); the original bare `except:` hid every other error.
                pass
            else:
                client.waitWorker.notify()

    def display_server_status(self):
        """Log a snapshot of the worker/client/request counters."""
        logging.info('#'*50)
        logging.info("Connected workers: %s (Available: %s)" % (str(self.connected_worker),str(len(self.available_workers))))
        logging.info("Waiting clients: %s" % str(len(self.waiting_client)))
        logging.info("Requests processed: %s" % str(self.num_requests_processed))
# Serves the project README at the server root.
class MainHandler(tornado.web.RequestHandler):
    """GET / renders the README.md sitting one directory above this module."""
    def get(self):
        # README.md lives in the parent directory of this file.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        readme_path = os.path.join(base_dir, os.pardir, "README.md")
        self.render(readme_path)
# Handler for decoding requests.
class DecodeRequestHandler(tornado.web.RequestHandler):
    """Receives a wav file by POST, dispatches it to a worker over its
    websocket and replies with the transcription."""
    SUPPORTED_METHOD = ('POST')

    #Called at the beginning of a request before get/post/etc
    def prepare(self):
        """Validate the request and store the uploaded file before post() runs."""
        self.worker = None
        self.filePath = None
        self.uuid = str(uuid.uuid4())
        self.set_status(200, "Initial statut")
        self.waitResponse = Condition()
        self.waitWorker = Condition()

        if self.request.method != 'POST' :
            logging.debug("Received a non-POST request")
            self.set_status(403, "Wrong request, server handles only POST requests")
            self.finish()
            # BUG FIX: stop here — the original fell through and kept
            # processing an already-finished request.
            return

        #File Retrieval
        # TODO: Adapt input to existing controller API
        if 'wavFile' not in self.request.files.keys():
            # BUG FIX: the messages named a 'file_to_transcript' field while
            # the code actually expects 'wavFile'; also log before finishing
            # and return instead of falling through.
            logging.debug("POST request does not contain a 'wavFile' field.")
            self.set_status(403, "POST request must contain a 'wavFile' field.")
            self.finish()
            return

        temp_file = self.request.files['wavFile'][0]['body']
        self.temp_file = temp_file
        #Writing file
        try:
            f = open(TEMP_FILE_PATH+self.uuid+'.wav', 'wb')
        except IOError:
            logging.error("Could not write file.")
            self.set_status(500, "Server error: Counldn't write file on server side.")
            self.finish()
            return
        else:
            # BUG FIX: close the handle (the original leaked the open file).
            with f:
                f.write(temp_file)
            self.filePath = TEMP_FILE_PATH+self.uuid+'.wav'
            logging.debug("File correctly received from client")

    @gen.coroutine
    def post(self, *args, **kwargs):
        """Allocate a worker, forward the file and wait for the transcription."""
        logging.debug("Allocating Worker to %s" % self.uuid)
        yield self.allocate_worker()
        # The wav content is base64-encoded into the websocket JSON message
        # (Python 2 str.encode('base64')).
        self.worker.write_message(json.dumps({'uuid':self.uuid, 'file': self.temp_file.encode('base64')}))
        yield self.waitResponse.wait()
        self.finish()

    @gen.coroutine
    def allocate_worker(self):
        """Take an available worker, or register as waiting until one frees up."""
        while self.worker is None:
            try:
                self.worker = self.application.available_workers.pop()
            except KeyError:
                # No worker available: queue up and wait to be notified by
                # Application.check_waiting_clients().
                self.worker = None
                self.application.waiting_client.add(self)
                self.application.display_server_status()
                yield self.waitWorker.wait()
            else:
                self.worker.client_handler = self
                logging.debug("Worker allocated to client %s" % self.uuid)
                self.application.display_server_status()

    @gen.coroutine
    def receive_response(self, message):
        """Called by the worker socket handler; forwards the transcript to the client."""
        logging.debug("Forwarding transcription to client")
        self.write({'transcript': message})
        os.remove(TEMP_FILE_PATH+self.uuid+'.wav')
        self.set_status(200, "Transcription succeded")
        self.application.num_requests_processed += 1
        self.waitResponse.notify()

    def on_finish(self):
        #CLEANUP
        pass
# WebSocket used for server <-> worker communication.
class WorkerWebSocketHandler(tornado.websocket.WebSocketHandler):
    def check_origin(self, origin):
        # Accept connections from any origin: workers may run on other hosts.
        return True

    def open(self):
        # A freshly connected worker has no client and is immediately
        # available; wake a waiting client if there is one.
        self.client_handler = None
        self.application.available_workers.add(self)
        self.application.connected_worker += 1
        self.application.check_waiting_clients()
        logging.debug("Worker connected")
        self.application.display_server_status()

    def on_message(self, message):
        # Worker messages are expected to be JSON; anything unparsable is
        # just logged and ignored.
        try:
            json_msg = json.loads(str(message))
        except:
            logging.debug("Message received from worker:" + message)
        else:
            if 'transcription' in json_msg.keys(): #Receive the file path to process
                response = json.dumps({'transcript':json_msg['transcription'].encode('utf-8')})
                logging.debug("Response send by worker : %s" % response)
                # Forward the transcript to the client whose job this worker
                # was serving, then release the worker back into the pool.
                # NOTE(review): assumes client_handler is set — a transcription
                # message from an unallocated worker would raise here; confirm.
                self.client_handler.receive_response(json.dumps({'transcript':json_msg['transcription']}))
                self.client_handler = None
                self.application.available_workers.add(self)
                self.application.display_server_status()
                self.application.check_waiting_clients()
            elif 'error' in json_msg.keys():
                logging.debug("WORKER Received error message worker, forwardind to client")
                #TODO: Error forwarding to client
                self.close()

    def on_close(self):
        # If the worker dies mid-job, fail the client request it was serving.
        if self.client_handler != None:
            self.client_handler.set_status(503, "Worker failed to translate file")
            self.client_handler.finish()
        logging.debug("WORKER WebSocket closed")
        self.application.available_workers.discard(self)
        self.application.connected_worker -= 1
        self.application.display_server_status()
def main():
    """Configure logging, ensure the temp directory exists and run the server."""
    logging.basicConfig(level=LOGGING_LEVEL, format="%(levelname)8s %(asctime)s %(message)s ")
    # The upload handler writes incoming wav files here; create it on first run.
    if not os.path.isdir(TEMP_FILE_PATH):
        os.mkdir(TEMP_FILE_PATH)
    print('#' * 50)
    application = Application()
    application.listen(int(SERVER_PORT))
    logging.info('Starting up server listening on port %s' % SERVER_PORT)
    try:
        tornado.ioloop.IOLoop.instance().start()
    except KeyboardInterrupt:
        logging.info("Server close by user.")
\ No newline at end of file
ws4py
configparser
tornado
\ No newline at end of file
[server_params]
listening_port : 8888
keep_temp_files : false
max_waiting_time : 10
debug : true
[machine_params]
temp_file_location : ./temp_files/
\ No newline at end of file
FROM debian:8
# MAINTAINER is deprecated (hadolint DL4000); use a label instead.
LABEL maintainer="Abdel HEBA <aheba@linagora.com>"

# Install all our dependencies and set some required build changes
RUN apt-get update && apt-get install -y \
    autoconf \
    automake \
    bzip2 \
    default-jre \
    g++ \
    git \
    gzip \
    libatlas3-base \
    libtool-bin \
    make \
    python2.7 \
    python3 \
    python-pip \
    sox \
    subversion \
    wget \
    zlib1g-dev && \
    apt-get clean autoclean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/* && \
    ln -s /usr/bin/python2.7 /usr/bin/python ; ln -s -f bash /bin/sh

# Speaker diarization
RUN cd /opt && wget http://www-lium.univ-lemans.fr/diarization/lib/exe/fetch.php/lium_spkdiarization-8.4.1.jar.gz && \
    gzip -d lium_spkdiarization-8.4.1.jar.gz

# Build kaldi
RUN git clone https://ci.linagora.com/aheba/kaldi_2015 /opt/kaldi && \
    cd /opt/kaldi/tools && \
    make && \
    cd /opt/kaldi/src && ./configure --shared && make depend && make

ENV BASE_DIR /opt/speech-to-text
RUN mkdir -p $BASE_DIR
WORKDIR $BASE_DIR

# Install tornado and the other worker dependencies first so this layer is
# cached when only the application source changes.
COPY requirements.txt .
RUN pip2 install --no-cache-dir -r requirements.txt

# Deploy our offline server
COPY . .
RUN ./deploy-offline-decoding.sh /opt/kaldi /opt/lium_spkdiarization-8.4.1.jar /opt/models

# Set the default command (exec form so the process is PID 1 and receives SIGTERM)
CMD ["./worker_offline.py"]
......@@ -27,4 +27,4 @@ ln -s $PATH_STT_Models $PWD/systems/
##### Create wavs & trans directory #####
mkdir wavs
mkdir trans
echo "Sucess..."
echo "Success..."
ws4py
configparser
tenacity
pydub
\ No newline at end of file
......@@ -80,5 +80,5 @@ sysRootName=$(echo $(basename $sysdir)|cut -f1 -d"=")
### Rescoring with LM
### Get CTM and STM files
echo "End...."
#echo "End...."
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 18 17:32:23 2018
@author: rbaraglia
"""
import sys
import os
import glob
from pydub import AudioSegment
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=100):
    """Return the duration (in ms) of leading audio below silence_threshold.

    sound is a pydub.AudioSegment, silence_threshold is in dB and chunk_size
    in ms. Fixed-size chunks are scanned from the start until one is at least
    as loud as the threshold, or the end of the sound is reached.
    """
    assert chunk_size > 0  # a non-positive step would loop forever
    offset = 0  # position in ms
    while offset < len(sound) and sound[offset:offset + chunk_size].dBFS < silence_threshold:
        offset += chunk_size
    return offset
def average_power_level(sound, chunk_size=100):
    """Return the mean dBFS over fixed-size chunks of sound.

    sound is a pydub.AudioSegment, chunk_size in ms. Fully silent chunks
    (dBFS == -inf) are excluded from the average; if every chunk is silent
    the function returns 0.0 rather than dividing by zero.
    """
    assert chunk_size > 0
    total = 0.0
    nb_chunk = 0
    offset = 0  # position in ms
    while offset < len(sound):
        # BUG FIX: sample the chunk *before* advancing. The original
        # incremented first, which skipped chunk [0, chunk_size) entirely and
        # read one chunk past the end of the sound.
        level = sound[offset:offset + chunk_size].dBFS
        if level != -float('Inf'):
            total += level
            nb_chunk += 1
        offset += chunk_size
    return total / (nb_chunk if nb_chunk > 0 else 1)
'''
trim_silence_segments remove silence (or background noise) from an audio wav file.
It works by trimming signal at the beginning and the end that is below the overall power level
input_file is a .wav file path
output_file is a .wav file path
chunk_size in ms
threshold_factor ]0,1]
side_effect_accomodation is a number of chunk that will be kept at the beginning and end despite being below the threshold
return the silence segment
TODO: Calculate standard deviation and put a threshold on it to acknowledge whether the trimming is truly needed (for files without silence, for example)
'''
def trim_silence_segments(input_file,output_file, chunk_size=100, threshold_factor=0.85, side_effect_accomodation=1):
    """Remove leading/trailing silence (or background noise) from a wav file.

    input_file / output_file are .wav file paths, chunk_size is in ms,
    threshold_factor is in ]0, 1] and scales the file's average power level to
    obtain the silence threshold, side_effect_accomodation is the number of
    chunks kept on each side despite being below the threshold.
    Writes the trimmed audio to output_file and returns the pair
    (leading_silence, trailing_silence) that was cut off.
    """
    #sound = AudioSegment.from_file("/home/rbaraglia/data/SG/audio-18_01_18/rec---2018-01-18_081957.wav", format="wav")
    sound = AudioSegment.from_file(input_file, format="wav")
    # The silence threshold is relative to this file's own average power.
    avg_power = average_power_level(sound)
    start_trim = detect_leading_silence(sound,silence_threshold= threshold_factor * avg_power)
    # Trailing silence is measured by scanning the reversed signal from its start.
    end_trim = detect_leading_silence(sound.reverse(), silence_threshold= threshold_factor * avg_power)
    duration = len(sound)
    # Keep side_effect_accomodation extra chunks on each side of the detected
    # boundaries, clamped to the sound's extent.
    # NOTE(review): dBFS values are negative, so with threshold_factor < 1 the
    # threshold is *higher* (less negative) than the average — confirm this is
    # the intended direction.
    trimmed_sound = sound[start_trim if start_trim - chunk_size*side_effect_accomodation < 0 else start_trim - chunk_size*side_effect_accomodation : duration-end_trim if end_trim + chunk_size*side_effect_accomodation > duration else duration-end_trim + chunk_size*side_effect_accomodation]
    trimmed_sound.export(output_file, format="wav")
    # NOTE(review): the trailing slice ends at -1, which drops the final
    # millisecond of the trailing silence — verify whether that is deliberate.
    return (sound[0 : start_trim], sound[len(sound) - end_trim : -1])
if __name__ == '__main__':