遍历文件,查找文件下的汉字,并将汉字生成csv文件
Posted lwh-12345
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了遍历文件,查找文件下的汉字,并将汉字生成csv文件相关的知识,希望对你有一定的参考价值。
package com.shine.eiuop.utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.itextpdf.text.pdf.PdfStructTreeController.returnType;
import com.shine.framework.commutil.typewrap.EDto;
/**
* title: 清除注释
*
* @author
* @时间
*/
public class FileCopyChineseUtils {
/** 根目录 */
public static String rootDir = "C:\Users\14423\Desktop\亚强\msp中文翻译\msp2\msp\WebRoot";
public static void main(String args[]) throws Exception {
dofind(rootDir);
}
public static void dofind(String rootDir) throws Exception {
String alltmSr = deepDir(rootDir);
System.out.println(alltmSr);
String[] stringArrStrings = alltmSr.toString().split("\r\n");
String file_path = "D:\SHINE_ROOT\mspChinese.csv";
String file_name = "mspChinese.csv";
writeDataToCsvFile1(file_path,file_name,stringArrStrings);
}
public static String deepDir(String rootDir) throws Exception {
String string = "";
File folder = new File(rootDir);
StringBuilder alltmSr = new StringBuilder();
if (folder.isDirectory()) {
String[] files = folder.list();
for (int i = 0; i < files.length; i++) {
File file = new File(folder, files[i]);
if (file.isDirectory() && file.isHidden() == false) {
alltmSr.append(deepDir(file.getPath()));
} else if (file.isFile()) {
alltmSr.append(writeComment(file.getPath()));
}
}
} else if (folder.isFile()) {
alltmSr.append(writeComment(folder.getPath()));
}
return alltmSr.toString();
}
/**
* @param currentDir
* 当前目录
* @param currentFileName
* 当前文件名
* @throws FileNotFoundException
* @throws UnsupportedEncodingException
*/
/**
* @param filePathAndName
* @throws FileNotFoundException
* @throws UnsupportedEncodingException
*/
public static String writeComment(String filePathAndName)
throws FileNotFoundException, UnsupportedEncodingException {
StringBuffer buffer = new StringBuffer();
String line = null; // 用来保存每行读取的内容
InputStream is = new FileInputStream(filePathAndName);
BufferedReader reader = new BufferedReader(new InputStreamReader(is,"UTF-8"));
try {
line = reader.readLine();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} // 读取第一行
while (line != null) { // 如果 line 为空说明读完了
buffer.append(line); // 将读到的内容添加到 buffer 中
buffer.append("
"); // 添加换行符
try {
line = reader.readLine();
} catch (IOException e) {
e.printStackTrace();
} // 读取下一行
}
buffer.append("
"); // 添加换行符
String filecontent = buffer.toString();
String regex = "[u4e00-u9fa5]";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(filecontent);
StringBuilder tmSr = new StringBuilder();
int tmp = -1;
while (matcher.find()) {
MatchResult result = matcher.toMatchResult();
int start = result.start();
int end = result.end();
if(tmp == start || tmp == -1) {
// 判断连续
tmSr.append(filecontent.substring(start, end));
}else {
// 不连续
tmSr.append("
");
tmSr.append(filecontent.substring(start, end));
}
tmp = end;
}
tmSr.append("
"); // 添加换行符
return tmSr.toString();
}
/**
*
* @Description 写csv文件,
* @param filePath
* @param fields
* @param dtos void
* @param
* @throws @author
* @date 2019年11月18日 上午9:45:31
* @see
*/
public static void writeDataToCsvFile1(String filePath, String fileName,String[] datas) throws Exception {
File csvFile = null;
BufferedWriter csvFileOutputStream = null;
FileOutputStream fos = null;
String uuidFilePath = "D:\SHINE_ROOT\mspChinese.csv";
try {
FileUtils.createNewFile(filePath);
FileUtils.createNewFile(uuidFilePath);
csvFile = new File(filePath);
try {
// 如果文件不存在,则创建新的文件
if (!csvFile.exists()) {
csvFile.createNewFile();
}
} catch (Exception e) {
e.printStackTrace();
}
// 写入bom头
byte[] uft8bom = { (byte) 0xef, (byte) 0xbb, (byte) 0xbf };
fos = new FileOutputStream(csvFile);
//fos.write(uft8bom);
// UTF-8使正确读取分隔符","
// 如果生产文件乱码,windows下用gbk,linux用UTF-8
//csvFileOutputStream = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"), 1024);
//csvFileOutputStream.newLine();
for (String dto : datas) {
if ("".equals(dto)!=true) {
fos.write((dto+"
").getBytes());
}
}
fos.flush();
fos.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
以上是关于遍历文件,查找文件下的汉字,并将汉字生成csv文件的主要内容,如果未能解决你的问题,请参考以下文章
python读csv文件中文汉字出现UnicodeDecodeError
如何用vbs程序批量改某个文件夹下的文件名?即原文件名前加001_,002_,......或者某些汉字:百度_,百度_...