关于语音合成和识别

Posted 2020-11-20 liguoyi

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了关于语音合成和识别相关的知识，希望对你有一定的参考价值。

最近研究了一下语音合成和语音识别，看了一些文章，也下载了 SDK 写了些代码测试了一下。

发现对于语音合成，就中文而言，百度语音和科大讯飞基本都差不多。

英文的话，百度合成出来的效果不佳，科大讯飞稍好点，但是总体都没有国外的语音合成好，比如 iSpeech、FreeTTS。可能是因为国外的母语是英语的缘故吧。

百度日调用额度比较多，据说有2万额度。讯飞每天就500，有点少。iSpeech 是要收费的。FreeTTS 可以离线使用。

百度识别和合成代码：

public class SoundAPI
{
	private static final Logger logger = LoggerFactory.getLogger(SoundAPI.class);
	/** Base folder for generated audio files, read from the "download.folder" configuration key. */
	final static String FILE_PATH = Config.getString("download.folder");
	// Baidu credentials: APP ID / API key / secret key (placeholders, replace with real values).
	private static final String APP_ID = "你的APP ID";
	private static final String API_KEY = "你的key";
	private static final String SECRET_KEY = "你的秘钥";
	// Shared AipSpeech client; created on demand by iniAPI().
	private static AipSpeech client = null;
	// Time (epoch milliseconds) at which the client was last created; 0 until the first creation.
	private static long iniTime = 0L;
	/**
	 * 30 days expressed in milliseconds.
	 * The first operand is a long literal so the whole product is computed in
	 * 64-bit arithmetic: with int operands 30 * 24 * 60 * 60 * 1000
	 * (2,592,000,000) exceeds Integer.MAX_VALUE and wraps to a negative number.
	 */
	private static final long MONTH_TIME = 30L * 24 * 60 * 60 * 1000;
	private static final Base64 base64 = new Base64();

	/**
	 * Makes sure the shared AipSpeech client is available.
	 * A new client is built when none exists yet, or when the time elapsed
	 * since the last creation is greater than MONTH_TIME milliseconds.
	 */
	private static void iniAPI()
	{
		final long now = System.currentTimeMillis();
		final boolean expired = now - iniTime > MONTH_TIME;
		if (client != null && !expired)
		{
			// Existing client is still considered fresh; nothing to do.
			return;
		}

		client = new AipSpeech(APP_ID, API_KEY, SECRET_KEY);
		// Connection timeout of 2 seconds.
		client.setConnectionTimeoutInMillis(2000);
		iniTime = System.currentTimeMillis();
	}

	/**
	 * Synthesizes {@code text} through the Baidu speech client and stores the
	 * returned audio as an MP3 file below FILE_PATH.
	 *
	 * The stored name is the Base64 encoding of {@code fileName} with '='
	 * removed, placed under an "en/" folder for English-word questions and a
	 * "zh/" folder otherwise. A file that already exists at the target path is
	 * left untouched.
	 *
	 * @param text         text to synthesize; a value rejected by
	 *                     StringUtils.isEmpty makes the method return ""
	 * @param fileName     logical name the stored file name is derived from
	 * @param questionType question type; ENGLISH_WORD selects "en", any other
	 *                     type keeps "zh"
	 * @return the path relative to FILE_PATH (for example "zh/xxx.mp3"), or ""
	 *         when the text is empty, the service returned no audio data, or an
	 *         exception occurred
	 */
	public static String getSoundMp3(String text, String fileName, QuestionTypeEnum questionType)
	{
		String rtnfileName = "";
		String type = "zh";
		if (StringUtils.isEmpty(text))
			return "";

		try
		{
			iniAPI();

			if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType()))
			{
				type = "en";
			}

			TtsResponse res = client.synthesis(text, type, 1, null);
			byte[] data = res.getData();
			if (data != null)
			{
				// NOTE(review): getBytes() uses the platform default charset, and
				// standard Base64 output may contain '/' and '+'. A '/' inside the
				// encoded name becomes a path separator below - confirm that
				// Util.writeBytesToFileSystem copes with that.
				String uuid = base64.encodeToString(fileName.getBytes());
				rtnfileName = type + "/" + uuid.replaceAll("=", "") + ".mp3";
				String path = FILE_PATH + rtnfileName;
				File file = new File(path);
				// Reuse a previously generated file instead of overwriting it.
				if (!file.exists())
				{
					Util.writeBytesToFileSystem(data, path);
				}

			} else
			{
				JSONObject jsonObj = res.getResult();
				// The "{}" placeholder is required: without it the logging call
				// silently drops the argument and the error details are lost.
				logger.info("invoke baidu synthesis API error: {}", jsonObj);
			}
		} catch (Exception e)
		{
			rtnfileName = "";
			logger.error("invoke baidu synthesis API error:", e);
		}

		return rtnfileName;
	}

	/**
	 * Sends the audio file at {@code filePath} to the Baidu recognition service
	 * and returns the text extracted from the response.
	 *
	 * The file is submitted as "pcm" at a 16000 rate. For English-word
	 * questions an options map with "dev_pid" = 1737 is passed; otherwise no
	 * options are passed.
	 *
	 * @param filePath     path of the audio file; a value rejected by
	 *                     StringUtils.isEmpty makes the method return ""
	 * @param questionType question type used to decide whether the English
	 *                     option is sent
	 * @return the recognized text, or "" when the path is empty or any
	 *         exception occurs (the exception is logged)
	 */
	public static String recognizeSound(String filePath, QuestionTypeEnum questionType)
	{
		if (StringUtils.isEmpty(filePath))
		{
			return "";
		}

		String recognized = "";
		try
		{
			iniAPI();

			HashMap<String, Object> asrOptions = null;
			boolean englishWord = QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType());
			if (englishWord)
			{
				asrOptions = new HashMap<>();
				asrOptions.put("dev_pid", 1737);
			}

			JSONObject response = client.asr(filePath, "pcm", 16000, asrOptions);
			recognized = getResult(response);
		} catch (Exception e)
		{
			logger.error("invoke baidu asr API error:", e);
		}

		return recognized;
	}

	/**
	 * Extracts the recognized text from a Baidu recognition response.
	 *
	 * When "err_no" is 0, the entries of the "result" array are joined with
	 * ';' (the first entry is always taken, later entries only when not empty)
	 * and every full-width comma is removed. For any other "err_no" the
	 * response is logged and "" is returned.
	 *
	 * @param asrRes response object returned by the recognition call
	 * @return the joined text, or "" when the response reports an error
	 */
	private static String getResult(JSONObject asrRes)
	{
		if (asrRes.getInt("err_no") != 0)
		{
			// The "{}" placeholder is required: without it the logging call
			// silently drops the argument and the response is never printed.
			logger.error("invoke baidu asr API error: {}", asrRes);
			return "";
		}

		JSONArray arrayResult = asrRes.getJSONArray("result");
		StringBuilder sbResult = new StringBuilder();
		for (int i = 0; i < arrayResult.length(); i++)
		{
			String item = arrayResult.get(i).toString();
			if (i == 0)
			{
				sbResult.append(item);
			} else if (!StringUtils.isEmpty(item))
			{
				sbResult.append(';').append(item);
			}
		}

		// Strip full-width commas from the recognized text.
		return sbResult.toString().replace("，", "");
	}

　　科大讯飞的语音识别及合成

public class IatAPI
{
	private static final Logger logger = LoggerFactory.getLogger(IatAPI.class);
	/**
	 * iFlytek application ID, sent as the "X-Appid" request header.
	 * Reference implementation for iFlytek speech recognition:
	 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java
	 */
	final static String APPID = "你的APPID";
	/** API key that is mixed into the "X-CheckSum" request header. */
	final static String APPKEY_IAT = "你的秘钥";
	/** Endpoint the recognition request is posted to. */
	final static String URL_IAT = "http://api.xfyun.cn/v1/service/v1/iat";
	/** Value sent as the "X-Real-Ip" request header. */
	final static String IP = "服务器IP地址";
	/**
	 * Sends an audio file to the iFlytek recognition endpoint and returns the
	 * raw response.
	 *
	 * @param filePath path of the audio file to read and upload
	 * @return whatever HttpUtil.doPost returns for the request, unmodified
	 * @throws Exception propagated from header construction, file reading,
	 *                   encoding, or the HTTP call
	 */
	public static String process(String filePath) throws Exception
	{
		// Headers are built first, for "raw" audio and the "sms16k" engine.
		final Map<String, String> requestHeader = getHeader("raw", "sms16k");

		// Load the audio as bytes and Base64-encode it for the form body.
		final byte[] rawAudio = FileUtil.read2ByteArray(filePath);
		final byte[] encodedAudio = Base64.encodeBase64(rawAudio);
		// NOTE(review): the Base64 text is not URL-encoded here although the
		// content type is form-urlencoded; confirm whether HttpUtil.doPost
		// encodes the body or whether '+', '/' and '=' need escaping first.
		final String formBody = "audio=" + new String(encodedAudio, "UTF-8");

		return HttpUtil.doPost(URL_IAT, requestHeader, formBody);
	}

	/**
	 * 组装http请求头
	 * 
	 * @param aue
	 * @param resultLevel
	 * @param language
	 * @param category
	 * @return
	 * @throws UnsupportedEncodingException
	 */
	private static Map<String, String> getHeader(String aue, String engineType) throws UnsupportedEncodingException
	{
		// 系统当前时间戳
		String X_CurTime = System.currentTimeMillis() / 1000L + "";
		// 业务参数
		String param = "{"aue":"" + aue + """ + ","engine_type":"" + engineType + ""}";
		String X_Param = new String(Base64.encodeBase64(param.getBytes("UTF-8")));
		// 接口密钥
		String apiKey = APPKEY_IAT;
		// 讯飞开放平台应用ID
		String X_Appid = APPID;
		// 生成令牌
		String X_CheckSum = DigestUtils.md5Hex(apiKey + X_CurTime + X_Param);

		// 组装请求头
		Map<String, String> header = new HashMap<String, String>();
		header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8");
		header.put("X-Param", X_Param);
		header.put("X-CurTime", X_CurTime);
		header.put("X-CheckSum", X_CheckSum);
		header.put("X-Appid", X_Appid);
		header.put("X-Real-Ip", IP);
		return header;

	}

public class TtsAPI
{
	private static final Logger logger = LoggerFactory.getLogger(TtsAPI.class);
	/**
	 * iFlytek application ID, sent as the "X-Appid" request header.
	 * Reference implementation (iFlytek speech recognition):
	 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java
	 * NOTE(review): the link points to the recognition demo (Iat.java) although
	 * this class performs synthesis - presumably copied over; confirm.
	 */
	final static String APPID = "你的APP id";
	/** API key that is mixed into the "X-CheckSum" request header. */
	final static String APPKEY_TTS = "你的秘钥";
	/** Endpoint the synthesis request is posted to. */
	final static String URL_TTS = "http://api.xfyun.cn/v1/service/v1/tts";
	/** Value sent as the "X-Real-Ip" request header. */
	final static String IP = "服务器地址";
	/** Folder the synthesized audio is saved to, read from the "download.folder" configuration key. */
	final static String FILE_PATH = Config.getString("download.folder");

	/**
	 * Sends text to the iFlytek synthesis endpoint and saves the returned
	 * audio as an MP3 file under FILE_PATH.
	 *
	 * @param text text to synthesize
	 * @return the saved file name ("sid" of the response plus ".mp3"), or
	 *         {@code null} when the response content type is not "audio/mpeg"
	 *         or an exception occurred (both cases are logged)
	 * @throws Exception declared for callers; exceptions raised while calling
	 *                   the service or saving the file are caught and logged
	 */
	public static String process(String text) throws Exception
	{
		final long begin = System.currentTimeMillis();
		String savedName = null;
		try
		{
			Map<String, String> requestHeader = getHeader("audio/L16;rate=16000", "lame", "xiaoyan", "50", "50", "",
					"text", "50");
			// NOTE(review): the text is placed in the form body without URL
			// encoding; confirm whether HttpUtil.doMultiPost encodes it.
			Map<String, Object> response = HttpUtil.doMultiPost(URL_TTS, requestHeader, "text=" + text);

			boolean isAudio = "audio/mpeg".equals(response.get("Content-Type"));
			if (!isAudio)
			{
				// Synthesis failed: log the body of the response.
				logger.error(response.get("body").toString());
			} else
			{
				// Synthesis succeeded: store the audio bytes under "<sid>.mp3".
				String mp3Name = response.get("sid") + ".mp3";
				FileUtil.save(FILE_PATH, mp3Name, (byte[]) response.get("body"));
				savedName = mp3Name;
			}
		} catch (Exception e)
		{
			logger.error("there is error:", e);
		}

		final long elapsed = System.currentTimeMillis() - begin;
		logger.info("finish get voice:" + elapsed);

		return savedName;
	}

	/**
	 * 组装http请求头
	 * 
	 * @param aue
	 * @param resultLevel
	 * @param language
	 * @param category
	 * @return
	 * @throws UnsupportedEncodingException
	 */
	private static Map<String, String> getHeader(String auf, String aue, String voiceName, String speed, String volume,
			String engineType, String textType, String pitch) throws UnsupportedEncodingException
	{
		String curTime = System.currentTimeMillis() / 1000L + "";
		StringBuilder param = new StringBuilder("{"auf":"" + auf + """);
		if (!StringUtil.isNullOrEmpty(aue))
		{
			param.append(","aue":"" + aue + """);
		}
		if (!StringUtil.isNullOrEmpty(voiceName))
		{
			param.append(","voice_name":"" + voiceName + """);
		}
		if (!StringUtil.isNullOrEmpty(speed))
		{
			param.append(","speed":"" + speed + """);
		}
		if (!StringUtil.isNullOrEmpty(volume))
		{
			param.append(","volume":"" + volume + """);
		}
		if (!StringUtil.isNullOrEmpty(pitch))
		{
			param.append(","pitch":"" + pitch + """);
		}
		if (!StringUtil.isNullOrEmpty(engineType))
		{
			param.append(","engine_type":"" + engineType + """);
		}
		if (!StringUtil.isNullOrEmpty(textType))
		{
			param.append(","text_type":"" + textType + """);
		}
		param.append("}");

		String paramBase64 = new String(Base64.encodeBase64(param.toString().getBytes("UTF-8")));
		String checkSum = DigestUtils.md5Hex(APPKEY_TTS + curTime + paramBase64);
		Map<String, String> header = new HashMap<String, String>();
		header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8");
		header.put("X-Param", paramBase64);
		header.put("X-CurTime", curTime);
		header.put("X-CheckSum", checkSum);
		header.put("X-Real-Ip", IP);
		header.put("X-Appid", APPID);
		// logger.info(JSON.toJSONString(header));
		return header;
	}

以上是关于关于语音合成和识别的主要内容，如果未能解决你的问题，请参考以下文章

重磅！一行代码轻松搞定中英文语音识别与语音合成|代码开源！

VC++基于微软语音引擎开发语音识别总结

首次开源一行代码中英文语音识别合成翻译核心功能

golang 使用科大讯飞进行语音合成与识别

20160209.CCPP体系详解(0019天)

首次开源！一行代码轻松搞定中英文语音识别合成翻译核心功能！