Mypal/dom/media/webspeech/recognition/PocketSphinxSpeechRecogniti...

358 lines
11 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsThreadUtils.h"
#include "nsXPCOMCIDInternal.h"
#include "PocketSphinxSpeechRecognitionService.h"
#include "nsIFile.h"
#include "SpeechGrammar.h"
#include "SpeechRecognition.h"
#include "SpeechRecognitionAlternative.h"
#include "SpeechRecognitionResult.h"
#include "SpeechRecognitionResultList.h"
#include "nsIObserverService.h"
#include "MediaPrefs.h"
#include "mozilla/Services.h"
#include "nsDirectoryServiceDefs.h"
#include "nsDirectoryServiceUtils.h"
#include "nsMemory.h"
extern "C" {
#include "pocketsphinx/pocketsphinx.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/sphinx_config.h"
#include "sphinxbase/jsgf.h"
}
namespace mozilla {
using namespace dom;
/**
 * Runnable that hands a finished decode back to the SpeechRecognition object
 * and then shuts down the worker thread that produced it.
 *
 * It must be constructed on the worker thread (the constructor captures the
 * current thread so that it can be shut down later) and it must run on the
 * main thread.
 */
class DecodeResultTask : public Runnable
{
public:
  /**
   * @param hypstring   Recognised text (empty when nothing was recognised).
   * @param confidence  Confidence value reported for the hypothesis.
   * @param recognition Weak reference to the object that receives the result.
   */
  DecodeResultTask(const nsString& hypstring,
                   float64 confidence,
                   WeakPtr<dom::SpeechRecognition> recognition)
    : mResult(hypstring),
      mConfidence(confidence),
      mRecognition(recognition),
      mWorkerThread(do_GetCurrentThread())
  {
    MOZ_ASSERT(
      !NS_IsMainThread()); // This should be running on the worker thread
  }

  NS_IMETHOD
  Run() override
  {
    MOZ_ASSERT(NS_IsMainThread()); // This method is supposed to run on the main
                                   // thread!

    // mRecognition is a weak reference: the SpeechRecognition may have been
    // destroyed while the decode was in flight. In that case there is nobody
    // to notify, but the worker thread still has to be shut down below.
    if (mRecognition) {
      // Build the final-result event. The DOM objects are held in RefPtrs
      // from the moment they are created so they can never be left dangling
      // with a zero reference count.
      RefPtr<SpeechEvent> event = new SpeechEvent(
        mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
      RefPtr<SpeechRecognitionResultList> resultList =
        new SpeechRecognitionResultList(mRecognition);
      RefPtr<SpeechRecognitionResult> result =
        new SpeechRecognitionResult(mRecognition);

      if (0 < mRecognition->MaxAlternatives()) {
        RefPtr<SpeechRecognitionAlternative> alternative =
          new SpeechRecognitionAlternative(mRecognition);
        alternative->mTranscript = mResult;
        alternative->mConfidence = mConfidence;
        result->mItems.AppendElement(alternative);
      }
      resultList->mItems.AppendElement(result);

      event->mRecognitionResultList = resultList;
      NS_DispatchToMainThread(event);
    }

    // If we don't destroy the thread when we're done with it, it will hang
    // around forever... bad!
    // But thread->Shutdown must be called from the main thread, not from the
    // thread itself.
    if (!mWorkerThread) {
      return NS_OK;
    }
    return mWorkerThread->Shutdown();
  }

private:
  nsString mResult;                             // recognised text
  float64 mConfidence;                          // confidence for mResult
  WeakPtr<dom::SpeechRecognition> mRecognition; // may be null by Run() time
  nsCOMPtr<nsIThread> mWorkerThread;            // thread to shut down in Run()
};
class DecodeTask : public Runnable
{
public:
DecodeTask(WeakPtr<dom::SpeechRecognition> recogntion,
const nsTArray<int16_t>& audiovector, ps_decoder_t* ps)
: mRecognition(recogntion), mAudiovector(audiovector), mPs(ps)
{
}
NS_IMETHOD
Run() override
{
char const* hyp;
int rv;
int32 final;
int32 logprob;
float64 confidence;
nsAutoCString hypoValue;
rv = ps_start_utt(mPs);
rv = ps_process_raw(mPs, &mAudiovector[0], mAudiovector.Length(), FALSE,
FALSE);
rv = ps_end_utt(mPs);
confidence = 0;
if (rv >= 0) {
hyp = ps_get_hyp_final(mPs, &final);
if (hyp && final) {
logprob = ps_get_prob(mPs);
confidence = logmath_exp(ps_get_logmath(mPs), logprob);
hypoValue.Assign(hyp);
}
}
nsCOMPtr<nsIRunnable> resultrunnable =
new DecodeResultTask(NS_ConvertUTF8toUTF16(hypoValue), confidence, mRecognition);
return NS_DispatchToMainThread(resultrunnable);
}
private:
WeakPtr<dom::SpeechRecognition> mRecognition;
nsTArray<int16_t> mAudiovector;
ps_decoder_t* mPs;
};
// XPCOM boilerplate: generates the nsISupports implementation for the service
// and lists the interfaces it can be queried for.
NS_IMPL_ISUPPORTS(PocketSphinxSpeechRecognitionService,
nsISpeechRecognitionService, nsIObserver)
/**
 * Locates the acoustic model and dictionary under <GRE dir>/models and tries
 * to create the PocketSphinx configuration and decoder from them.
 *
 * On any failure the object is still constructed, with ISDecoderCreated left
 * false and the sphinx pointers left null. No grammar is compiled here, so
 * ISGrammarCompiled always starts out false.
 */
PocketSphinxSpeechRecognitionService::PocketSphinxSpeechRecognitionService()
{
  // Put every member into a known state first so that the early returns below
  // (and the destructor) never see uninitialised pointers.
  mSpeexState = nullptr;
  mPSConfig = nullptr;
  mPSHandle = nullptr;
  ISDecoderCreated = false;
  ISGrammarCompiled = false;

  // Resolves <GRE dir>/models into aDir. Returns false when the directory
  // service cannot provide the GRE directory.
  auto getModelsDir = [](nsCOMPtr<nsIFile>& aDir) -> bool {
    nsresult rv = NS_GetSpecialDirectory(NS_GRE_DIR, getter_AddRefs(aDir));
    if (NS_FAILED(rv) || !aDir) {
      return false;
    }
#if defined(XP_WIN) // for some reason, on windows NS_GRE_DIR is not bin root,
                    // but bin/browser
    aDir->AppendRelativePath(NS_LITERAL_STRING(".."));
#endif
    aDir->AppendRelativePath(NS_LITERAL_STRING("models"));
    return true;
  };

  nsCOMPtr<nsIFile> amDir;    // acoustic model folder
  nsCOMPtr<nsIFile> dictFile; // dictionary file
  if (!getModelsDir(amDir) || !getModelsDir(dictFile)) {
    return;
  }

  nsAutoString aStringAMPath;
  amDir->AppendRelativePath(NS_LITERAL_STRING("en-US"));
  amDir->GetPath(aStringAMPath);

  nsAutoString aStringDictPath;
  dictFile->AppendRelativePath(NS_LITERAL_STRING("dict"));
  dictFile->AppendRelativePath(NS_LITERAL_STRING("en-US.dic"));
  dictFile->GetPath(aStringDictPath);

  // FOR B2G PATHS HARDCODED (APPEND /DATA ON THE BEGINING, FOR DESKTOP, ONLY
  // MODELS/ RELATIVE TO ROOT
  //
  // cmd_ln_init duplicates the strings it is given, so stack-owned UTF-8
  // conversions are sufficient here and nothing is leaked.
  const NS_ConvertUTF16toUTF8 amPath(aStringAMPath);
  const NS_ConvertUTF16toUTF8 dictPath(aStringDictPath);
  mPSConfig = cmd_ln_init(nullptr, ps_args(), TRUE, "-bestpath", "yes", "-hmm",
                          amPath.get(), // acoustic model
                          "-dict", dictPath.get(), nullptr);
  if (!mPSConfig) {
    return;
  }

  mPSHandle = ps_init(mPSConfig);
  ISDecoderCreated = (mPSHandle != nullptr);
}
/**
 * Releases the decoder and its configuration through the sphinx APIs.
 *
 * Both objects are reference-counted structures that own further
 * allocations, so they must go through ps_free() / cmd_ln_free_r() rather
 * than plain free(). The decoder is released first because it holds its own
 * reference to the configuration.
 */
PocketSphinxSpeechRecognitionService::~PocketSphinxSpeechRecognitionService()
{
  if (mPSHandle) {
    ps_free(mPSHandle);
    mPSHandle = nullptr;
  }
  if (mPSConfig) {
    cmd_ln_free_r(mPSConfig);
    mPSConfig = nullptr;
  }
  mSpeexState = nullptr;
}
// Reached when script calls start() on the recognition object.
//
// Resets the buffered audio and the resampler state, remembers the
// recognition object and registers for the test-event topics. Fails with
// NS_ERROR_NOT_INITIALIZED unless both the decoder and a grammar are ready.
NS_IMETHODIMP
PocketSphinxSpeechRecognitionService::Initialize(
  WeakPtr<SpeechRecognition> aSpeechRecognition)
{
  const bool readyToRecognize = ISDecoderCreated && ISGrammarCompiled;
  if (!readyToRecognize) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  // Start from a clean slate for the new session.
  mAudioVector.Clear();
  if (mSpeexState) {
    mSpeexState = nullptr;
  }
  mRecognition = aSpeechRecognition;

  nsCOMPtr<nsIObserverService> observerService =
    services::GetObserverService();
  observerService->AddObserver(this,
                               SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC,
                               false);
  observerService->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  return NS_OK;
}
/**
 * Resamples the incoming segment to 16000 Hz and appends the samples of the
 * first channel of every chunk to mAudioVector.
 *
 * The resampler is created lazily on the first call.
 *
 * @param aAudioSegment Segment to consume; it is resampled in place.
 * @param aSampleRate   Sample rate of aAudioSegment.
 * @return NS_OK, or NS_ERROR_FAILURE if the resampler could not be created.
 */
NS_IMETHODIMP
PocketSphinxSpeechRecognitionService::ProcessAudioSegment(
  AudioSegment* aAudioSegment, int32_t aSampleRate)
{
  const uint32_t kDecoderSampleRate = 16000;

  if (!mSpeexState) {
    mSpeexState = speex_resampler_init(1, aSampleRate, kDecoderSampleRate,
                                       SPEEX_RESAMPLER_QUALITY_MAX, nullptr);
    if (!mSpeexState) {
      // Without a resampler the segment cannot be converted; bail out rather
      // than handing a null state to ResampleChunks.
      return NS_ERROR_FAILURE;
    }
  }
  aAudioSegment->ResampleChunks(mSpeexState, aSampleRate, kDecoderSampleRate);

  for (AudioSegment::ChunkIterator iterator(*aAudioSegment);
       !iterator.IsEnded(); iterator.Next()) {
    mozilla::AudioChunk& chunk = *iterator;
    MOZ_ASSERT(chunk.mBuffer);
    // The channel data is read as 16-bit samples, as the original code did.
    const int16_t* buf = static_cast<const int16_t*>(chunk.mChannelData[0]);
    if (chunk.mDuration > 0) {
      // One bulk append per chunk instead of one append per sample.
      mAudioVector.AppendElements(buf, static_cast<size_t>(chunk.mDuration));
    }
  }
  return NS_OK;
}
/**
 * Called when audio capture ends: tears down the resampler and starts a new
 * thread that decodes the buffered audio with a DecodeTask.
 *
 * @return NS_OK when the decode task was dispatched; otherwise the failure
 *         code from creating the thread or dispatching the task.
 */
NS_IMETHODIMP
PocketSphinxSpeechRecognitionService::SoundEnd()
{
  // The resampler only exists if ProcessAudioSegment ran at least once.
  if (mSpeexState) {
    speex_resampler_destroy(mSpeexState);
    mSpeexState = nullptr;
  }

  // To create a new thread, get the thread manager
  nsCOMPtr<nsIThreadManager> tm = do_GetService(NS_THREADMANAGER_CONTRACTID);
  if (!tm) {
    return NS_ERROR_FAILURE;
  }

  nsCOMPtr<nsIThread> decodethread;
  nsresult rv = tm->NewThread(0, 0, getter_AddRefs(decodethread));
  if (NS_FAILED(rv)) {
    // No decode will happen; report the failure instead of claiming success.
    return rv;
  }

  nsCOMPtr<nsIRunnable> r =
    new DecodeTask(mRecognition, mAudioVector, mPSHandle);
  rv = decodethread->Dispatch(r, nsIEventTarget::DISPATCH_NORMAL);
  if (NS_FAILED(rv)) {
    // Nothing will ever run on the thread, so nobody else would shut it down.
    decodethread->Shutdown();
    return rv;
  }
  return NS_OK;
}
/**
 * Compiles the JSGF source of aSpeechGrammar into the decoder and activates
 * it as the current search.
 *
 * ISGrammarCompiled is set to true only when both steps succeed.
 *
 * @param aSpeechGrammar Grammar whose source is compiled; may be null.
 * @param aCallback      Not used by this implementation.
 * @return NS_OK on success, NS_ERROR_NOT_INITIALIZED otherwise.
 */
NS_IMETHODIMP
PocketSphinxSpeechRecognitionService::ValidateAndSetGrammarList(
  SpeechGrammar* aSpeechGrammar,
  nsISpeechGrammarCompilationCallback* aCallback)
{
  // Name under which the grammar is registered with the decoder.
  static const char kSearchName[] = "name";

  // Assume failure until every step has succeeded.
  ISGrammarCompiled = false;

  if (!ISDecoderCreated || !aSpeechGrammar) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  nsAutoString grammar;
  ErrorResult rv;
  aSpeechGrammar->GetSrc(grammar, rv);
  if (rv.Failed()) {
    // The error is folded into the single failure code this method reports.
    rv.SuppressException();
    return NS_ERROR_NOT_INITIALIZED;
  }

  if (ps_set_jsgf_string(mPSHandle, kSearchName,
                         NS_ConvertUTF16toUTF8(grammar).get()) != 0) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  // The grammar only counts as usable once it is the active search.
  if (ps_set_search(mPSHandle, kSearchName) != 0) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  ISGrammarCompiled = true;
  return NS_OK;
}
// Abort request from the recognition object. This implementation performs no
// work and always reports success.
NS_IMETHODIMP
PocketSphinxSpeechRecognitionService::Abort()
{
return NS_OK;
}
/**
 * Observer callback for the speech-recognition test topics.
 *
 * On SPEECH_RECOGNITION_TEST_END_TOPIC the service unregisters itself from
 * both test topics. Any other notification is treated as an event request
 * whose name is carried in aData: it either dispatches a mock error or a mock
 * final result to the recognition object.
 *
 * @param aSubject Unused.
 * @param aTopic   Topic of the notification.
 * @param aData    Name of the requested event (for event-request
 *                 notifications).
 * @return Always NS_OK.
 */
NS_IMETHODIMP
PocketSphinxSpeechRecognitionService::Observe(nsISupports* aSubject,
                                              const char* aTopic,
                                              const char16_t* aData)
{
  MOZ_ASSERT(MediaPrefs::WebSpeechFakeRecognitionService(),
             "Got request to fake recognition service event, "
             "but " TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is not set");

  if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    if (obs) {
      obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
      obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
    }
    return NS_OK;
  }

  // mRecognition is a weak reference and aData is supplied by the notifier;
  // without both there is nothing that can be dispatched.
  if (!mRecognition || !aData) {
    return NS_OK;
  }

  const nsDependentString eventName = nsDependentString(aData);

  if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_ERROR")) {
    mRecognition->DispatchError(
      SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
      SpeechRecognitionErrorCode::Network, // TODO different codes?
      NS_LITERAL_STRING("RECOGNITIONSERVICE_ERROR test event"));
  } else if (eventName.EqualsLiteral("EVENT_RECOGNITIONSERVICE_FINAL_RESULT")) {
    RefPtr<SpeechEvent> event = new SpeechEvent(
      mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
    event->mRecognitionResultList = BuildMockResultList();
    NS_DispatchToMainThread(event);
  }
  return NS_OK;
}
// Builds the canned result list used for mock final-result events: one
// result which, when the recognition object allows at least one alternative,
// carries a single alternative with a fixed transcript and zero confidence.
SpeechRecognitionResultList*
PocketSphinxSpeechRecognitionService::BuildMockResultList()
{
  SpeechRecognitionResultList* mockList =
    new SpeechRecognitionResultList(mRecognition);
  SpeechRecognitionResult* mockResult =
    new SpeechRecognitionResult(mRecognition);

  const bool wantsAlternative = mRecognition->MaxAlternatives() > 0;
  if (wantsAlternative) {
    SpeechRecognitionAlternative* mockAlternative =
      new SpeechRecognitionAlternative(mRecognition);
    mockAlternative->mTranscript = NS_LITERAL_STRING("Mock final result");
    mockAlternative->mConfidence = 0.0f;
    mockResult->mItems.AppendElement(mockAlternative);
  }

  mockList->mItems.AppendElement(mockResult);
  return mockList;
}
} // namespace mozilla