遍历文件，查找文件下的汉字，并将汉字生成csv文件

Posted 2020-12-18 lwh-12345

tags:

篇首语：本文由小常识网（cha138.com）小编整理，主要介绍“遍历文件，查找文件下的汉字，并将汉字生成 CSV 文件”的相关知识，希望对你有一定的参考价值。

package com.shine.eiuop.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.itextpdf.text.pdf.PdfStructTreeController.returnType;
import com.shine.framework.commutil.typewrap.EDto;

/**
* title: 清除注释
*
* @author
* @时间
*/
public class FileCopyChineseUtils {

/** 根目录 */
public static String rootDir = "C:\Users\14423\Desktop\亚强\msp中文翻译\msp2\msp\WebRoot";

public static void main(String args[]) throws Exception {
dofind(rootDir);
}

public static void dofind(String rootDir) throws Exception {
String alltmSr = deepDir(rootDir);

System.out.println(alltmSr);
String[] stringArrStrings = alltmSr.toString().split("\r\n");
String file_path = "D:\SHINE_ROOT\mspChinese.csv";
String file_name = "mspChinese.csv";
writeDataToCsvFile1(file_path,file_name,stringArrStrings);
}

public static String deepDir(String rootDir) throws Exception {
String string = "";
File folder = new File(rootDir);
StringBuilder alltmSr = new StringBuilder();
if (folder.isDirectory()) {
String[] files = folder.list();

for (int i = 0; i < files.length; i++) {
File file = new File(folder, files[i]);
if (file.isDirectory() && file.isHidden() == false) {
alltmSr.append(deepDir(file.getPath()));
} else if (file.isFile()) {
alltmSr.append(writeComment(file.getPath()));
}
}
} else if (folder.isFile()) {
alltmSr.append(writeComment(folder.getPath()));
}
return alltmSr.toString();
}

/**
* @param currentDir
* 当前目录
* @param currentFileName
* 当前文件名
* @throws FileNotFoundException
* @throws UnsupportedEncodingException
*/
/**
* @param filePathAndName
* @throws FileNotFoundException
* @throws UnsupportedEncodingException
*/
public static String writeComment(String filePathAndName)
throws FileNotFoundException, UnsupportedEncodingException {
StringBuffer buffer = new StringBuffer();
String line = null; // 用来保存每行读取的内容
InputStream is = new FileInputStream(filePathAndName);
BufferedReader reader = new BufferedReader(new InputStreamReader(is,"UTF-8"));
try {
line = reader.readLine();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} // 读取第一行
while (line != null) { // 如果 line 为空说明读完了
buffer.append(line); // 将读到的内容添加到 buffer 中
buffer.append(" "); // 添加换行符
try {
line = reader.readLine();
} catch (IOException e) {
e.printStackTrace();
} // 读取下一行
}
buffer.append(" "); // 添加换行符
String filecontent = buffer.toString();

String regex = "[u4e00-u9fa5]";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(filecontent);
StringBuilder tmSr = new StringBuilder();
int tmp = -1;

while (matcher.find()) {
MatchResult result = matcher.toMatchResult();
int start = result.start();
int end = result.end();
if(tmp == start || tmp == -1) {
// 判断连续
tmSr.append(filecontent.substring(start, end));
}else {
// 不连续
tmSr.append(" ");
tmSr.append(filecontent.substring(start, end));
}
tmp = end;
}
tmSr.append(" "); // 添加换行符
return tmSr.toString();

}

/**
*
* @Description 写csv文件，
* @param filePath
* @param fields
* @param dtos void
* @param
* @throws @author
* @date 2019年11月18日上午9:45:31
* @see
*/
public static void writeDataToCsvFile1(String filePath, String fileName,String[] datas) throws Exception {

File csvFile = null;
BufferedWriter csvFileOutputStream = null;
FileOutputStream fos = null;
String uuidFilePath = "D:\SHINE_ROOT\mspChinese.csv";
try {
FileUtils.createNewFile(filePath);
FileUtils.createNewFile(uuidFilePath);
csvFile = new File(filePath);
try {
// 如果文件不存在，则创建新的文件
if (!csvFile.exists()) {
csvFile.createNewFile();
}
} catch (Exception e) {
e.printStackTrace();
}
// 写入bom头
byte[] uft8bom = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf };
fos = new FileOutputStream(csvFile);
//fos.write(uft8bom);

// UTF-8使正确读取分隔符","
// 如果生产文件乱码，windows下用gbk，linux用UTF-8
//csvFileOutputStream = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"), 1024);

//csvFileOutputStream.newLine();
for (String dto : datas) {
if ("".equals(dto)!=true) {
fos.write((dto+" ").getBytes());
}
}
fos.flush();
fos.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}

以上是关于“遍历文件，查找文件下的汉字，并将汉字生成 CSV 文件”的主要内容；如果未能解决你的问题，请参考以下文章：