Python Speech

Using Python with human speech capabilities

Speech to text

Libraries to install

  • SpeechRecognition
  • PyAudio
  • google-api-python-client

File: main.py

import audiowrapper
import speech_recognition as sr

def speech_to_text(filename):
    """Transcribe an audio file with Google's speech recognition service.

    Args:
        filename: Path of the audio file to load through ``sr.AudioFile``.

    Returns:
        The recognised text. An empty string is returned (after printing an
        error message) when the audio could not be understood or the
        recognition request failed.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Error: Could not understand audio")
    except sr.RequestError as e:
        print("Error: Request error; {}".format(e))
    # Only the two expected failures above reach this point. Any other
    # exception (e.g. KeyboardInterrupt) now propagates instead of being
    # silently discarded by a ``return`` inside ``finally``.
    return ""

if __name__ == "__main__":
    # Interactive demo: record between two ENTER presses, save the result
    # as a WAV file, then print what the recogniser heard.
    wav_path = "test.wav"
    recorder = audiowrapper.Audio()

    input("Press ENTER to START recording")
    recorder.record()

    input("Press ENTER to STOP recording")
    recorder.stop(wav_path)

    transcript = speech_to_text(wav_path)
    print("I think you said: " + transcript)

File: audiowrapper.py

from threading import Timer
import pyaudio
import wave

class Audio:
    """Record from the default input device and save the result as a WAV file.

    Recording uses PyAudio's callback mode: ``record()`` opens the stream and
    returns immediately, captured buffers are appended to ``self.frames`` by
    the callback, and ``stop()`` closes the stream and writes the file.
    An instance can be reused for several record/stop cycles.
    """

    CHUNK = 1024              # frames per buffer passed to the callback
    FORMAT = pyaudio.paInt16  # sample format used for capture and for the WAV
    CHANNELS = 2
    RATE = 44100              # sampling rate in Hz

    def __init__(self):
        self.frames = []
        self.pa = pyaudio.PyAudio()
        self.stream = None

    def _ensure_pa(self):
        """Return a usable PyAudio instance, re-creating it after ``stop()``."""
        if self.pa is None:
            self.pa = pyaudio.PyAudio()
        return self.pa

    def __buffer(self, in_data, frame_count, time_info, status):
        """Stream callback: keep the captured buffer and ask for more."""
        self.frames.append(in_data)
        return in_data, pyaudio.paContinue

    def record(self):
        """Discard any previous frames and start capturing audio."""
        self.frames = []
        self.stream = self._ensure_pa().open(format=self.FORMAT,
                                             channels=self.CHANNELS,
                                             rate=self.RATE,
                                             input=True,
                                             frames_per_buffer=self.CHUNK,
                                             stream_callback=self.__buffer)
        self.stream.start_stream()

    def stop(self, filename):
        """Stop the current recording and write it to ``filename`` as WAV.

        Args:
            filename: Path of the WAV file to create (overwritten if present).

        Raises:
            RuntimeError: If no recording is in progress.
        """
        if self.stream is None:
            raise RuntimeError("stop() called without an active recording")

        # Ask for the sample width while PortAudio is still initialised.
        sample_width = self.pa.get_sample_size(self.FORMAT)
        try:
            self.stream.stop_stream()
            self.stream.close()
        finally:
            # Release PortAudio even if closing the stream failed; a later
            # record() or print_devices() re-creates it on demand.
            self.stream = None
            self.pa.terminate()
            self.pa = None

        # The context manager closes the file even if a write fails.
        with wave.open(filename, "wb") as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(sample_width)
            wf.setframerate(self.RATE)
            wf.writeframes(b''.join(self.frames))

    def record_and_stop(self, seconds, filename):
        """Start recording and stop automatically after ``seconds``.

        Args:
            seconds: Delay before ``stop(filename)`` is called.
            filename: Path of the WAV file to create.

        Returns:
            The started ``threading.Timer``; ``join()`` it to wait until the
            file has been written, or ``cancel()`` it to keep recording.
        """
        self.record()
        # Pass the filename explicitly as the positional argument of stop().
        timer = Timer(seconds, self.stop, args=(filename,))
        timer.start()
        return timer

    def print_devices(self):
        """Print the index and name of every audio device PyAudio reports."""
        pa = self._ensure_pa()
        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            print("Device {} = {}".format(info["index"], info["name"]))

Important note
This is currently using an unregistered version of Google's speech recognition system. It only works for a maximum of 50 requests per day, and Google can cancel access for "overuse" (even at fewer than 50 requests). It's only suitable for testing. Google does offer a free one-year trial of their full product, but we'd have to sign you up for that. I'll help you through the process of setting that up (it can get a little tricky). There is also an "offline" library we could use.

Links

Text to speech

Libraries to install

  • gtts (google text to speech)
  • playsound (to play the mp3)
from gtts import gTTS
import os
from playsound import playsound

MP3_FILE = "hello.mp3"

phrase = input("What do you want me to say?")

# Text to speech: synthesise the phrase and save it as an mp3 file
gTTS(text=phrase, lang='en').save(MP3_FILE)

# Play the saved mp3 file back
playsound(MP3_FILE)

Links