AndroidでCognitiveServiceBingSpeechAPIをつかって音声ファイルから音声認識

AndroidのSpeechRecognizer(音声認識)では同時に録音をすることができない、CognitiveServiceSpeechAPIの使い方がわからないなどの問題があったので、REST APIを叩いてみました。

以下のクラスを使うと録音した音声ファイルから音声認識を行うことができます。

RESTのパラメータについては、ここらへんを見ていただければと思います。

JSONデシリアライズにGSONを利用しています。

CognitiveServiceのBingSpeechAPIキーも取得してね

/**
 * Sends a recorded audio file to the Bing Speech REST endpoint and hands the
 * parsed recognition results to a callback.
 *
 * <p>Each call to {@link #recognizeAsync} first requests an access token with
 * the subscription key, then POSTs the audio file to the recognize endpoint on
 * a newly started background thread. The JSON response is deserialized with
 * Gson into {@link SpeechResult}.
 */
public class BingSpeechRecognizer {

    /** Chunk size used when copying the audio file into the request body. */
    private static final int COPY_BUFFER_SIZE = 8192;

    String apiKey;
    String urlAccessToken = "https://api.cognitive.microsoft.com/sts/v1.0/issueToken";
    String urlRecognize = "https://speech.platform.bing.com/recognize";
    Thread thread;
    File audioFile;

    /**
     * @param apiKey subscription key sent in the {@code Ocp-Apim-Subscription-Key}
     *               header when requesting an access token
     */
    public BingSpeechRecognizer(String apiKey) {
        this.apiKey = apiKey;
    }

    /**
     * Starts a background thread that recognizes the speech in {@code file}.
     *
     * <p>On success {@code onRecognize.accept(...)} is invoked on that background
     * thread with the value of {@link SpeechResult#getResults()}, which may be
     * {@code null} if the response JSON carried no results. On any failure
     * (token request rejected, non-200 response, I/O error, exception thrown by
     * the callback) the error is logged and the callback is not invoked.
     *
     * @param file        audio file to upload; sent with content type
     *                    {@code audio/wav; samplerate=16000}
     * @param onRecognize callback that receives the recognition results
     * @throws IOException never thrown by the current implementation; kept in the
     *                     signature so existing callers that catch it still compile
     */
    public void recognizeAsync(final File file, final ConsumerListener<List<Result>> onRecognize) throws IOException {
        // Kept for compatibility; the worker below uses its own captured `file`
        // so overlapping calls cannot swap each other's input.
        this.audioFile = file;
        thread = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    Log.i("tag","start");
                    String token = requestAccessToken();
                    onRecognize.accept(recognize(file, token));
                } catch (Exception e) {
                    // Thread boundary: nothing above us can handle this, so log it.
                    Log.e("tag", "speech recognition failed", e);
                }
            }
        });

        thread.start();
    }

    /**
     * Builds the recognize URL for a single request.
     *
     * <p>The query string is assembled into a new string on every call; the
     * {@code urlRecognize} field is left untouched so repeated requests do not
     * accumulate parameters.
     */
    private String buildRecognizeUrl() {
        return urlRecognize
                + "?Version=3.0"
                + "&requestid=" + UUID.randomUUID()
                + "&appID=D4D52672-91D7-4C74-8AD8-42B1D98141A5"
                + "&format=json"
                + "&locale=ja-JP"
                + "&device.os=Android"
                + "&scenarios=ulm"
                + "&instanceid=" + UUID.randomUUID();
    }

    /**
     * POSTs the audio file to the recognize endpoint and parses the response.
     *
     * @param file  audio file to upload
     * @param token access token obtained from {@link #requestAccessToken()}
     * @return the results list from the parsed response (may be {@code null})
     * @throws IOException if the transfer fails, the server does not answer with
     *                     HTTP 200, or the response body is empty
     */
    private List<Result> recognize(File file, String token) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(buildRecognizeUrl()).openConnection();
        try {
            connection.setRequestMethod("POST");
            // Required before getOutputStream() so the request may carry a body.
            connection.setDoOutput(true);
            connection.setRequestProperty("Authorization", "Bearer " + token);
            connection.setRequestProperty("Content-Type", "audio/wav; samplerate=16000");
            connection.connect();
            sendFile(file, connection);

            final int status = connection.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                throw new IOException("Speech recognition request failed with HTTP status " + status);
            }
            String json = readStringFromInputStream(connection.getInputStream());
            SpeechResult result = new Gson().fromJson(json, SpeechResult.class);
            if (result == null) {
                throw new IOException("Speech recognition response body was empty");
            }
            return result.getResults();
        } finally {
            connection.disconnect();
        }
    }

    /** Copies the whole file into the connection's request body and closes both streams. */
    private void sendFile(File file, HttpURLConnection connection) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            OutputStream out = connection.getOutputStream();
            try {
                byte[] buffer = new byte[COPY_BUFFER_SIZE];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
                out.flush();
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }

    /**
     * Requests an access token using the subscription key.
     *
     * @return the response body of the token endpoint
     * @throws IOException if the request fails or the server does not answer with
     *                     HTTP 200 (previously this case returned {@code null}, which
     *                     was then sent as the literal bearer token "null")
     */
    private String requestAccessToken() throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(urlAccessToken).openConnection();
        try {
            connection.setRequestMethod("POST");
            connection.setRequestProperty("Ocp-Apim-Subscription-Key", apiKey);
            connection.connect();
            final int status = connection.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                throw new IOException("Access token request failed with HTTP status " + status);
            }
            return readStringFromInputStream(connection.getInputStream());
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Reads the stream as UTF-8 text and closes it.
     *
     * <p>Lines are concatenated without their line terminators, as before.
     */
    private String readStringFromInputStream(InputStream stream) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

}
/**
 * Bean for the {@code header} member of {@link SpeechResult}.
 *
 * <p>Field names are kept as-is so that they keep lining up with the JSON keys
 * when the object is populated by Gson.
 */
public class Header {

    private String status;
    private String scenario;
    private String name;
    private String lexical;
    private Properties properties;

    /** @return the current {@code status} value */
    public String getStatus() {
        return status;
    }

    /** @param status new {@code status} value */
    public void setStatus(String status) {
        this.status = status;
    }

    /** @return the current {@code scenario} value */
    public String getScenario() {
        return scenario;
    }

    /** @param scenario new {@code scenario} value */
    public void setScenario(String scenario) {
        this.scenario = scenario;
    }

    /** @return the current {@code name} value */
    public String getName() {
        return name;
    }

    /** @param name new {@code name} value */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the current {@code lexical} value */
    public String getLexical() {
        return lexical;
    }

    /** @param lexical new {@code lexical} value */
    public void setLexical(String lexical) {
        this.lexical = lexical;
    }

    /** @return the current {@code properties} object */
    public Properties getProperties() {
        return properties;
    }

    /** @param properties new {@code properties} object */
    public void setProperties(Properties properties) {
        this.properties = properties;
    }
}

/**
 * Bean for the {@code properties} member of {@link Header}.
 *
 * <p>Field names are kept as-is so that they keep lining up with the JSON keys
 * when the object is populated by Gson.
 */
public class Properties {

    private String requestid;
    private String HIGHCONF;

    /** @return the current {@code requestid} value */
    public String getRequestid() {
        return requestid;
    }

    /** @param requestid new {@code requestid} value */
    public void setRequestid(String requestid) {
        this.requestid = requestid;
    }

    /** @return the current {@code HIGHCONF} value */
    public String getHIGHCONF() {
        return HIGHCONF;
    }

    /** @param HIGHCONF new {@code HIGHCONF} value */
    public void setHIGHCONF(String HIGHCONF) {
        this.HIGHCONF = HIGHCONF;
    }
}

/**
 * Bean for the {@code properties} member of {@link Result}.
 *
 * <p>The field name is kept as-is so that it keeps lining up with the JSON key
 * when the object is populated by Gson.
 */
public class Properties2 {

    private String HIGHCONF;

    /** @return the current {@code HIGHCONF} value */
    public String getHIGHCONF() {
        return HIGHCONF;
    }

    /** @param HIGHCONF new {@code HIGHCONF} value */
    public void setHIGHCONF(String HIGHCONF) {
        this.HIGHCONF = HIGHCONF;
    }
}

/**
 * Bean for one element of the {@code results} list of {@link SpeechResult}.
 *
 * <p>Field names are kept as-is so that they keep lining up with the JSON keys
 * when the object is populated by Gson.
 */
public class Result {

    private String scenario;
    private String name;
    private String lexical;
    private String confidence;
    private Properties2 properties;

    /** @return the current {@code scenario} value */
    public String getScenario() {
        return scenario;
    }

    /** @param scenario new {@code scenario} value */
    public void setScenario(String scenario) {
        this.scenario = scenario;
    }

    /** @return the current {@code name} value */
    public String getName() {
        return name;
    }

    /** @param name new {@code name} value */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the current {@code lexical} value */
    public String getLexical() {
        return lexical;
    }

    /** @param lexical new {@code lexical} value */
    public void setLexical(String lexical) {
        this.lexical = lexical;
    }

    /** @return the current {@code confidence} value, stored as a string */
    public String getConfidence() {
        return confidence;
    }

    /** @param confidence new {@code confidence} value */
    public void setConfidence(String confidence) {
        this.confidence = confidence;
    }

    /** @return the current {@code properties} object */
    public Properties2 getProperties() {
        return properties;
    }

    /** @param properties new {@code properties} object */
    public void setProperties(Properties2 properties) {
        this.properties = properties;
    }
}

/**
 * Root bean that the recognize response JSON is deserialized into.
 *
 * <p>Field names are kept as-is so that they keep lining up with the JSON keys
 * when the object is populated by Gson.
 */
public class SpeechResult {

    private String version;
    private Header header;
    private ArrayList<Result> results;

    /** @return the current {@code version} value */
    public String getVersion() {
        return version;
    }

    /** @param version new {@code version} value */
    public void setVersion(String version) {
        this.version = version;
    }

    /** @return the current {@code header} object */
    public Header getHeader() {
        return header;
    }

    /** @param header new {@code header} object */
    public void setHeader(Header header) {
        this.header = header;
    }

    /** @return the stored results list; {@code null} until one has been set */
    public ArrayList<Result> getResults() {
        return results;
    }

    /** @param results new results list */
    public void setResults(ArrayList<Result> results) {
        this.results = results;
    }
}

 

使い方はこんな感じ

// Audio file to recognize and the recognizer that will upload it.
File temp = new File("/sdcard/temp.wav");
BingSpeechRecognizer recognizer = new BingSpeechRecognizer(apiKey);

// Callback that receives the recognition results.
ConsumerListener<List<Result>> listener = new ConsumerListener<List<Result>>() {
    @Override
    public void accept(List<Result> val) {
    }
};

try {
    recognizer.recognizeAsync(temp, listener);
} catch (IOException e) {
    // FileNotFoundException is a subclass of IOException, so this single
    // handler covers both cases.
    e.printStackTrace();
}

 

コメントを残す

メールアドレスが公開されることはありません。 * が付いている欄は必須項目です

このサイトはスパムを低減するために Akismet を使っています。コメントデータの処理方法の詳細はこちらをご覧ください