Mypal/dom/media/webspeech/recognition/SpeechRecognition.h

297 lines
8.5 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef mozilla_dom_SpeechRecognition_h
#define mozilla_dom_SpeechRecognition_h
#include "mozilla/Attributes.h"
#include "mozilla/DOMEventTargetHelper.h"
#include "nsCOMPtr.h"
#include "nsString.h"
#include "nsWrapperCache.h"
#include "nsTArray.h"
#include "js/TypeDecls.h"
#include "nsIDOMNavigatorUserMedia.h"
#include "nsITimer.h"
#include "MediaEngine.h"
#include "MediaStreamGraph.h"
#include "AudioSegment.h"
#include "mozilla/WeakPtr.h"
#include "SpeechGrammarList.h"
#include "SpeechRecognitionResultList.h"
#include "SpeechStreamListener.h"
#include "nsISpeechRecognitionService.h"
#include "endpointer.h"
#include "mozilla/dom/SpeechRecognitionError.h"
namespace mozilla {
namespace dom {
// Topic strings used by the speech-recognition test hooks.
// NOTE(review): presumably observer-service topics handled through this
// class's nsIObserver implementation (see ProcessTestEventRequest) — confirm
// against the .cpp.
#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent"
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"
// Forward declarations; the full definitions are not needed at this point in
// the header (SpeechEvent is defined further down in this file).
class GlobalObject;
class SpeechEvent;
// Returns the log module that SR_LOG writes to. Defined out of line.
LogModule* GetSpeechRecognitionLog();
// printf-style logging helper: forwards its arguments to MOZ_LOG at Debug
// level on the module returned by GetSpeechRecognitionLog().
#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
// DOM event target that exposes the speech-recognition attributes, methods
// and event handlers declared below. Internally it tracks an FSMState
// (mCurrentState) and processes SpeechEvent objects (ProcessEvent /
// Transition). It is also an nsIObserver and can be weakly referenced through
// SupportsWeakPtr.
//
// All member functions except the inline destructor and the nested callback
// constructors/destructors are defined out of line.
class SpeechRecognition final : public DOMEventTargetHelper,
public nsIObserver,
public SupportsWeakPtr<SpeechRecognition>
{
public:
MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
// aOwnerWindow: the inner window this object is created for.
explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);
NS_DECL_ISUPPORTS_INHERITED
NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_DECL_NSIOBSERVER
// Binding glue: parent object lookup and JS wrapper creation.
nsISupports* GetParentObject() const;
JSObject* WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) override;
// NOTE(review): presumably decides whether the interface is exposed on the
// given global — confirm against the .cpp / WebIDL.
static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);
// Static factory; failures are reported through aRv.
static already_AddRefed<SpeechRecognition>
Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);
// Attribute accessors. The ones taking an ErrorResult can report failure
// through aRv.
already_AddRefed<SpeechGrammarList> Grammars() const;
void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);
void GetLang(nsString& aRetVal) const;
void SetLang(const nsAString& aArg);
bool GetContinuous(ErrorResult& aRv) const;
void SetContinuous(bool aArg, ErrorResult& aRv);
bool InterimResults() const;
void SetInterimResults(bool aArg);
uint32_t MaxAlternatives() const;
void SetMaxAlternatives(uint32_t aArg);
void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);
// Session control. Start() optionally takes the DOMMediaStream to use as
// input and reports failure through aRv.
void Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv);
void Stop();
void Abort();
// One event-handler attribute per event name.
// NOTE(review): IMPL_EVENT_HANDLER is defined elsewhere; presumably it
// generates the on<name> getter/setter pair — confirm.
IMPL_EVENT_HANDLER(audiostart)
IMPL_EVENT_HANDLER(soundstart)
IMPL_EVENT_HANDLER(speechstart)
IMPL_EVENT_HANDLER(speechend)
IMPL_EVENT_HANDLER(soundend)
IMPL_EVENT_HANDLER(audioend)
IMPL_EVENT_HANDLER(result)
IMPL_EVENT_HANDLER(nomatch)
IMPL_EVENT_HANDLER(error)
IMPL_EVENT_HANDLER(start)
IMPL_EVENT_HANDLER(end)
// Kinds of internal event carried by a SpeechEvent (see SpeechEvent::mType).
// EVENT_COUNT is the last enumerator, so its value equals the number of
// event kinds; keep it last when adding new ones.
enum EventType {
EVENT_START,
EVENT_STOP,
EVENT_ABORT,
EVENT_AUDIO_DATA,
EVENT_AUDIO_ERROR,
EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
EVENT_RECOGNITIONSERVICE_ERROR,
EVENT_COUNT
};
// Public helpers for reporting errors and handling raw audio samples.
// NOTE(review): presumably called by the stream listener and the
// recognition service — confirm against callers.
void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage);
uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<RefPtr<SharedBuffer>>& aResult);
AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
// aSamples is passed as already_AddRefed, i.e. the caller hands over its
// reference. aProvider is the listener the data came from (see
// SpeechEvent::mProvider).
void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);
// SpeechEvent needs access to the private event-processing members below.
friend class SpeechEvent;
private:
// Private destructor: the object cannot be destroyed directly by outside
// code. NOTE(review): presumably released via reference counting — confirm.
virtual ~SpeechRecognition() {};
// States of the recognition state machine. STATE_COUNT is the last
// enumerator, so its value equals the number of states; keep it last.
enum FSMState {
STATE_IDLE,
STATE_STARTING,
STATE_ESTIMATING,
STATE_WAITING_FOR_SPEECH,
STATE_RECOGNIZING,
STATE_WAITING_FOR_RESULT,
STATE_COUNT
};
void SetState(FSMState state);
// NOTE(review): presumably tests mCurrentState against the [begin, end]
// range; whether the bounds are inclusive must be checked in the .cpp.
bool StateBetween(FSMState begin, FSMState end);
// Both report failure through aRv in addition to the bool result.
bool SetRecognitionService(ErrorResult& aRv);
bool ValidateAndSetGrammarList(ErrorResult& aRv);
// Success callback handed to getUserMedia; holds a strong reference to the
// SpeechRecognition it was created for.
class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
{
public:
NS_DECL_ISUPPORTS
NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK
explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
: mRecognition(aRecognition)
{}
private:
virtual ~GetUserMediaSuccessCallback() {}
RefPtr<SpeechRecognition> mRecognition;
};
// Error callback handed to getUserMedia; holds a strong reference to the
// SpeechRecognition it was created for.
class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
{
public:
NS_DECL_ISUPPORTS
NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK
explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
: mRecognition(aRecognition)
{}
private:
virtual ~GetUserMediaErrorCallback() {}
RefPtr<SpeechRecognition> mRecognition;
};
// Begin / end capturing audio; both return an nsresult (NS_IMETHOD).
NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
NS_IMETHOD StopRecording();
uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
void NotifyError(SpeechEvent* aEvent);
// Event processing entry points.
void ProcessEvent(SpeechEvent* aEvent);
void Transition(SpeechEvent* aEvent);
void Reset();
void ResetAndEnd();
// Actions taking the event being processed.
// NOTE(review): presumably selected by Transition() from the current state
// and the event type — confirm against the .cpp.
void WaitForAudioData(SpeechEvent* aEvent);
void StartedAudioCapture(SpeechEvent* aEvent);
void StopRecordingAndRecognize(SpeechEvent* aEvent);
void WaitForEstimation(SpeechEvent* aEvent);
void DetectSpeech(SpeechEvent* aEvent);
void WaitForSpeechEnd(SpeechEvent* aEvent);
void NotifyFinalResult(SpeechEvent* aEvent);
void DoNothing(SpeechEvent* aEvent);
void AbortSilently(SpeechEvent* aEvent);
void AbortError(SpeechEvent* aEvent);
// Data members. Declaration order is also initialisation order, so keep it
// in sync with the constructor's initialiser list.
RefPtr<DOMMediaStream> mDOMStream;
RefPtr<SpeechStreamListener> mSpeechListener;
nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
FSMState mCurrentState;
Endpointer mEndpointer;
uint32_t mEstimationSamples;
uint32_t mAudioSamplesPerChunk;
// buffer holds one chunk of mAudioSamplesPerChunk
// samples before feeding it to mEndpointer
RefPtr<SharedBuffer> mAudioSamplesBuffer;
uint32_t mBufferedSamples;
nsCOMPtr<nsITimer> mSpeechDetectionTimer;
bool mAborted;
nsString mLang;
RefPtr<SpeechGrammarList> mSpeechGrammarList;
// WebSpeechAPI (http://bit.ly/1gIl7DC) states:
//
// 1. Default value MUST be false
// 2. If true, interim results SHOULD be returned
// 3. If false, interim results MUST NOT be returned
//
// Pocketsphinx does not return interim results; so, defaulting
// mInterimResults to false, then ignoring its subsequent value
// is a conforming implementation.
bool mInterimResults;
// WebSpeechAPI (http://bit.ly/1JAiqeo) states:
//
// 1. Default value is 1
// 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives per result"
//
// Pocketsphinx can only return at maximum a single SpeechRecognitionAlternative
// per SpeechRecognitionResult. So defaulting mMaxAlternatives to 1, ignoring
// mMaxAlternatives for all non-zero values, and returning no
// SpeechRecognitionAlternative per result for a 0 value is a conforming
// implementation.
uint32_t mMaxAlternatives;
// Test hook taking the observer subject and an event name (see the
// SPEECH_RECOGNITION_TEST_* topics).
void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);
// Name lookups for a state and for an event.
// NOTE(review): presumably used for SR_LOG output — confirm.
const char* GetName(FSMState aId);
const char* GetName(SpeechEvent* aId);
};
// Runnable describing a single event of type SpeechRecognition::EventType
// addressed to one SpeechRecognition instance.
//
// The public payload members (mAudioSegment, mRecognitionResultList, mError)
// all start out null; the code that creates the event fills in whichever ones
// are relevant to its type. SpeechRecognition is a friend, so it can also read
// and write the private members directly.
class SpeechEvent : public Runnable
{
public:
  // aRecognition: the SpeechRecognition this event targets (stored as a raw,
  //               non-owning pointer).
  // aType:        the kind of event being delivered.
  SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType)
    : mAudioSegment(nullptr)  // pointer member: nullptr, like the initialisers below
    , mRecognitionResultList(nullptr)
    , mError(nullptr)
    , mRecognition(aRecognition)
    , mType(aType)
    , mTrackRate(0)
  {
  }

  // Defined out of line.
  // NOTE(review): mAudioSegment is a raw pointer; presumably it is released
  // here — confirm against the .cpp.
  ~SpeechEvent();

  // Runnable entry point; defined out of line.
  NS_IMETHOD Run() override;

  // Audio payload; null unless set by the event's creator.
  AudioSegment* mAudioSegment;
  RefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: make this a session being passed which also has index and stuff
  // Error payload; null unless set by the event's creator.
  RefPtr<SpeechRecognitionError> mError;

  friend class SpeechRecognition;

private:
  // Target of the event. Not reference counted here.
  // NOTE(review): for AUDIO_DATA events the comment on mProvider explains how
  // the target is kept alive; verify the lifetime guarantee for other types.
  SpeechRecognition* mRecognition;

  // for AUDIO_DATA events, keep a reference to the provider
  // of the data (i.e., the SpeechStreamListener) to ensure it
  // is kept alive (and keeps SpeechRecognition alive) until this
  // event gets processed.
  RefPtr<MediaStreamListener> mProvider;

  SpeechRecognition::EventType mType;
  TrackRate mTrackRate;
};
} // namespace dom
// Converts a SpeechRecognition pointer to nsISupports* by way of its
// DOMEventTargetHelper base, delegating to the ToSupports overload for that
// base type.
// NOTE(review): presumably needed because SpeechRecognition has more than one
// base (DOMEventTargetHelper and nsIObserver) through which nsISupports could
// be reached — confirm.
inline nsISupports*
ToSupports(dom::SpeechRecognition* aRec)
{
  // Implicit derived-to-base conversion; a null input stays null.
  DOMEventTargetHelper* const asEventTarget = aRec;
  return ToSupports(asEventTarget);
}
} // namespace mozilla
#endif