54一个大文件夹下所有文件的读取和检索!支持英文
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了54一个大文件夹下所有文件的读取和检索!支持英文相关的知识,希望对你有一定的参考价值。
package lld;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
* @author xinghl
*
*/
public class IndexManager{
// Shared instance handed out by getManager(); stays null until that method is first called.
private static IndexManager indexManager;
// Text extracted from the file that createIndex is currently processing.
private static String content="";
// Directory in which the Lucene index is stored.
private static String INDEX_DIR = "E:\\uploadFolders";
// Root directory whose files main() passes to createIndex.
private static String DATA_DIR = "E:\\uploadFolder";
// Lucene objects; assigned inside createIndex/searchIndex rather than at class load.
private static Analyzer analyzer = null;
private static Directory directory = null;
private static IndexWriter indexWriter = null;
/**
 * Returns the shared {@code IndexManager}, creating it on the first call.
 * <p>
 * The instance is kept in a static field, but this accessor itself is an
 * instance method, so the caller needs an {@code IndexManager} to invoke it.
 *
 * @return the shared index manager object
 */
public IndexManager getManager() {
    if (indexManager != null) {
        return indexManager;
    }
    indexManager = new IndexManager();
    return indexManager;
}
/**
 * Reads the content of a text file.
 * <p>
 * Each line read is appended as {@code "\n" + line}, so the result of a
 * non-empty file starts with a newline character. The bytes are decoded with
 * whatever charset {@link FileReader} uses by default. Failures are printed to
 * stderr and whatever was read up to that point is returned.
 *
 * @param file the file to read
 * @return the file content, or an empty string when nothing could be read
 */
public static String txt2String(File file) {
    // StringBuilder avoids re-copying the whole text for every line read.
    StringBuilder text = new StringBuilder();
    // try-with-resources closes the reader even if readLine() fails midway.
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            text.append('\n').append(line);
        }
    } catch (Exception e) {
        // Best effort: report the problem and return what has been read so far.
        e.printStackTrace();
    }
    return text.toString();
}
/**
 * Reads the text of a Word {@code .doc} file.
 * <p>
 * The file is opened as an {@code HWPFDocument} and the text of its overall
 * range is returned. Failures are printed to stderr and an empty string is
 * returned.
 *
 * @param file the file to read
 * @return the document text, or an empty string when it could not be read
 */
public static String doc2String(File file) {
    String result = "";
    // try-with-resources closes the stream even if the document cannot be parsed.
    try (FileInputStream fis = new FileInputStream(file)) {
        HWPFDocument doc = new HWPFDocument(fis);
        Range range = doc.getRange();
        String text = range.text();
        if (text != null) {
            result = text;
        }
    } catch (Exception e) {
        // Best effort: report the problem and fall back to the empty string.
        e.printStackTrace();
    }
    return result;
}
/**
 * Reads the cell contents of an {@code .xls} file.
 * <p>
 * Only the first sheet (index 0) is read. Cells are visited row by row and
 * each cell's content is appended followed by a single space. Failures are
 * printed to stderr and whatever was collected so far is returned.
 *
 * @param file the file to read
 * @return the concatenated cell contents, or an empty string when nothing could be read
 */
public static String xls2String(File file) {
    // StringBuilder avoids re-copying the whole text for every cell.
    StringBuilder text = new StringBuilder();
    jxl.Workbook readwb = null;
    // try-with-resources closes the stream even if the workbook cannot be parsed.
    try (InputStream instream = new FileInputStream(file)) {
        readwb = Workbook.getWorkbook(instream);
        // Sheet indexes start at 0; take the first sheet.
        Sheet readsheet = readwb.getSheet(0);
        int rsColumns = readsheet.getColumns();
        int rsRows = readsheet.getRows();
        for (int i = 0; i < rsRows; i++) {
            for (int j = 0; j < rsColumns; j++) {
                // getCell takes (column, row).
                Cell cell = readsheet.getCell(j, i);
                text.append(cell.getContents()).append(" ");
            }
        }
    } catch (Exception e) {
        // Best effort: report the problem and return what has been collected so far.
        e.printStackTrace();
    } finally {
        // Release the memory held by the parsed workbook.
        if (readwb != null) {
            readwb.close();
        }
    }
    return text.toString();
}
/**
 * Placeholder for {@code .xlsx} extraction: the argument is not read and an
 * empty string is always returned.
 *
 * @param file the file that would be read (currently ignored)
 * @return always the empty string
 */
public static String xlsx2String(File file) {
    return "";
}
/**
 * Converts a cell to its string form.
 * <p>
 * Boolean and numeric cells are rendered with {@code String.valueOf}; every
 * other cell type is read through {@code getStringCellValue()}.
 *
 * @param hssfCell the cell to convert
 * @return the cell value as a string
 */
private static String getValue(HSSFCell hssfCell) {
    switch (hssfCell.getCellType()) {
        case HSSFCell.CELL_TYPE_BOOLEAN:
            return String.valueOf(hssfCell.getBooleanCellValue());
        case HSSFCell.CELL_TYPE_NUMERIC:
            return String.valueOf(hssfCell.getNumericCellValue());
        default:
            return String.valueOf(hssfCell.getStringCellValue());
    }
}
/*public static String pdf2String(File file){
String result = null;
FileInputStream is = null;
PDDocument document = null;
try {
is = new FileInputStream(file);
PDFParser parser = new PDFParser(is);
parser.parse();
document = parser.getPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = stripper.getText(document);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (document != null) {
try {
document.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result;
} */
/**
 * Reads the text of a Word {@code .docx} file.
 * <p>
 * The file is opened as an {@code XWPFDocument} and its text is obtained
 * through an {@code XWPFWordExtractor}. Failures are printed to stderr and an
 * empty string is returned.
 *
 * @param file the file to read
 * @return the extracted text, or an empty string when the file could not be read
 */
public static String docx2String(File file) {
    String result = "";
    // try-with-resources closes the stream even if the document cannot be parsed.
    try (FileInputStream fis = new FileInputStream(file)) {
        XWPFDocument xdoc = new XWPFDocument(fis);
        XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
        result = extractor.getText();
    } catch (Exception e) {
        // Best effort: report the problem and fall back to the empty string.
        e.printStackTrace();
    }
    return result;
}
/**
 * Searches the index stored under {@code INDEX_DIR} and prints every hit.
 * <p>
 * The query text is parsed against the {@code content} field and at most 1000
 * hits are requested. For each hit the stored {@code filename}, {@code content}
 * and {@code path} fields are written to standard output, followed by the
 * elapsed time of the whole call. Nothing is returned; failures are printed to
 * stderr.
 *
 * @param text the query string to look for
 */
public static void searchIndex(String text) {
    long startMillis = System.currentTimeMillis();
    Directory openedDirectory = null;
    DirectoryReader ireader = null;
    try {
        openedDirectory = FSDirectory.open(new File(INDEX_DIR));
        directory = openedDirectory;
        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        ireader = DirectoryReader.open(openedDirectory);
        IndexSearcher isearcher = new IndexSearcher(ireader);
        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content", analyzer);
        Query query = parser.parse(text);
        ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
        for (ScoreDoc hit : hits) {
            Document hitDoc = isearcher.doc(hit.doc);
            System.out.println("_____________查看索引_______________");
            System.out.println(hitDoc.get("filename"));
            System.out.println(hitDoc.get("content"));
            System.out.println(hitDoc.get("path"));
            System.out.println("_____________查看索引________________");
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the reader and the directory even when parsing or searching failed.
        try {
            if (ireader != null) {
                ireader.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (openedDirectory != null) {
                openedDirectory.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println("查看索引-----耗时:" + elapsedMillis + "ms\n");
}
/**
 * Builds the index for every file found below a directory.
 * <p>
 * Each file's text is extracted according to its extension (txt, doc, xls,
 * xlsx, docx; anything else is indexed with empty content) and stored together
 * with its name and path. One index writer is opened for the whole run and
 * committed once at the end, instead of once per file. A file that cannot be
 * added is reported on stderr and the remaining files are still processed.
 *
 * @param path the directory whose files are indexed
 * @return {@code true} when every file was indexed and the index was committed
 *         and closed without error, {@code false} otherwise
 */
public static boolean createIndex(String path) {
    long startMillis = System.currentTimeMillis();
    List<File> fileList = getFileList(path);
    // With nothing to index, no writer is opened and no index is created.
    boolean allIndexed = fileList.isEmpty() || indexFiles(fileList);
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println("创建索引-----耗时:" + elapsedMillis + "ms\n");
    return allIndexed;
}

// Opens one writer on INDEX_DIR, adds a document per file, commits once; returns false on any failure.
private static boolean indexFiles(List<File> fileList) {
    boolean allIndexed = true;
    Directory openedDirectory = null;
    IndexWriter writer = null;
    try {
        File indexFile = new File(INDEX_DIR);
        if (!indexFile.exists()) {
            indexFile.mkdirs();
        }
        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        openedDirectory = FSDirectory.open(indexFile);
        directory = openedDirectory;
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
        writer = new IndexWriter(openedDirectory, config);
        indexWriter = writer;
        for (File file : fileList) {
            content = extractContent(file);
            System.out.println("name :" + file.getName());
            System.out.println("_____________索引建立完毕_______________");
            try {
                Document document = new Document();
                document.add(new TextField("filename", file.getName(), Store.YES));
                document.add(new TextField("content", content, Store.YES));
                document.add(new TextField("path", file.getPath(), Store.YES));
                writer.addDocument(document);
            } catch (Exception e) {
                // One bad file must not stop the rest of the run.
                allIndexed = false;
                e.printStackTrace();
            }
            content = "";
        }
        writer.commit();
    } catch (Exception e) {
        allIndexed = false;
        e.printStackTrace();
    } finally {
        // Close the writer first (it holds the directory's write lock), then the directory.
        try {
            if (writer != null) {
                closeWriter();
            }
        } catch (Exception e) {
            allIndexed = false;
            e.printStackTrace();
        }
        try {
            if (openedDirectory != null) {
                openedDirectory.close();
            }
        } catch (Exception e) {
            allIndexed = false;
            e.printStackTrace();
        }
    }
    return allIndexed;
}

// Picks the reader matching the file extension; unknown extensions yield an empty string.
private static String extractContent(File file) {
    String name = file.getName();
    // Everything after the last dot; the whole name when there is no dot.
    String type = name.substring(name.lastIndexOf(".") + 1);
    if ("txt".equalsIgnoreCase(type)) {
        return txt2String(file);
    } else if ("doc".equalsIgnoreCase(type)) {
        return doc2String(file);
    } else if ("xls".equalsIgnoreCase(type)) {
        return xls2String(file);
    } else if ("xlsx".equalsIgnoreCase(type)) {
        return xlsx2String(file);
    } else if ("docx".equalsIgnoreCase(type)) {
        return docx2String(file);
    }
    return "";
}
/**
 * Demo entry point: empties and recreates {@code INDEX_DIR}, indexes
 * {@code DATA_DIR}, then searches the index for the word "love".
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    File fileIndex = new File(INDEX_DIR);
    // Whatever deleteDir reports, the next step is the same: recreate the directory.
    deleteDir(fileIndex);
    fileIndex.mkdir();
    createIndex(DATA_DIR);
    searchIndex("love");
}
/**
 * Result of the most recent {@link #getFileList(String)} call.
 */
static List<File> FileList = new ArrayList<File>();

/**
 * Recursively collects every non-directory entry below a directory.
 * <p>
 * Each call starts from an empty list, so calling the method repeatedly does
 * not accumulate the results of earlier calls. The returned list is also
 * stored in {@code FileList} and printed once to standard output. A path that
 * is not a readable directory yields an empty list.
 *
 * @param dirPath the directory to scan
 * @return the files found in the directory and all of its subdirectories
 */
public static List<File> getFileList(String dirPath) {
    List<File> found = new ArrayList<File>();
    collectFiles(new File(dirPath), found);
    FileList = found;
    System.out.println(found);
    return found;
}

// Adds every non-directory entry below dir to out, descending into subdirectories.
private static void collectFiles(File dir, List<File> out) {
    File[] entries = dir.listFiles();
    if (entries == null) {
        // Not a directory, or it could not be listed.
        return;
    }
    for (File entry : entries) {
        if (entry.isDirectory()) {
            // Descend using the absolute path of the subdirectory.
            collectFiles(entry.getAbsoluteFile(), out);
        } else {
            out.add(entry);
        }
    }
}
//return fileList;
/*for (File file : files) {
if (isTxtFile(file.getName())) {
fileList.add(file);
}
}*/
//System.out.println("111111111111111111111111111111111111111111111111111111111111");
//System.out.println(fileList);
//System.out.println("\n");
//return fileList;
//}
/**
 * Tells whether a file name has one of the supported extensions:
 * txt, xls, xlsx, doc, docx or pdf.
 * <p>
 * Only the text after the last dot is compared, ignoring case, so
 * {@code "my.document"} or {@code "notes.txt.bak"} are not accepted. A name
 * without a dot, or one that starts with its only dot (such as {@code ".txt"}),
 * is rejected, as is {@code null}.
 *
 * @param fileName the file name to check
 * @return {@code true} if the extension is supported, {@code false} otherwise
 */
public static boolean isTxtFile(String fileName) {
    if (fileName == null) {
        return false;
    }
    int dot = fileName.lastIndexOf('.');
    if (dot <= 0) {
        return false;
    }
    String extension = fileName.substring(dot + 1);
    String[] supported = {"txt", "xls", "xlsx", "doc", "docx", "pdf"};
    for (String candidate : supported) {
        if (candidate.equalsIgnoreCase(extension)) {
            return true;
        }
    }
    return false;
}
/**
 * Closes the shared index writer if one has been assigned; does nothing when
 * the field is still {@code null}.
 *
 * @throws Exception if closing the writer fails
 */
public static void closeWriter() throws Exception {
    if (indexWriter == null) {
        return;
    }
    indexWriter.close();
}
/**
 * Deletes a file, or a directory together with everything below it.
 * <p>
 * Deletion continues past entries that cannot be removed, so as much as
 * possible is cleaned up. A path that does not exist counts as already
 * deleted.
 *
 * @param file the file or directory to delete
 * @return {@code true} if nothing is left at {@code file} afterwards,
 *         {@code false} if the entry or any of its descendants could not be deleted
 */
public static boolean deleteDir(File file) {
    boolean childrenDeleted = true;
    if (file.isDirectory()) {
        File[] children = file.listFiles();
        // listFiles() returns null when the directory cannot be listed.
        if (children != null) {
            for (File child : children) {
                // Non-short-circuit on purpose: keep deleting after a failure.
                childrenDeleted &= deleteDir(child);
            }
        }
    }
    boolean removed = file.delete() || !file.exists();
    return childrenDeleted && removed;
}
/**
 * Reads all sheets of an {@code .xls} workbook into a table of strings.
 * <p>
 * On every sheet, reading starts at row index {@code ignoreRows}. A row is
 * dropped when it is missing or when its first column is blank. String cells
 * are taken as-is, date-formatted numeric cells are rendered as
 * {@code yyyy-MM-dd}, other numeric cells with the pattern {@code "0"},
 * boolean cells as {@code "Y"}/{@code "N"}; blank and error cells become
 * empty strings. Trailing spaces are removed from every value. Rows may have
 * different lengths: each row is as wide as the widest row seen up to that
 * point.
 *
 * @param file       the workbook file to read
 * @param ignoreRows index of the first row to read on each sheet (rows before it are skipped)
 * @return one {@code String[]} per accepted row, in sheet and row order
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException           if reading the workbook fails
 */
public static String[][] getData(File file, int ignoreRows)
        throws FileNotFoundException, IOException {
    List<String[]> result = new ArrayList<String[]>();
    int rowSize = 0;
    // try-with-resources closes the stream even when parsing or a cell read throws.
    try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
        // Open the HSSFWorkbook.
        POIFSFileSystem fs = new POIFSFileSystem(in);
        HSSFWorkbook wb = new HSSFWorkbook(fs);
        for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet st = wb.getSheetAt(sheetIndex);
            // Rows before ignoreRows (for example a title row) are not read.
            for (int rowIndex = ignoreRows; rowIndex <= st.getLastRowNum(); rowIndex++) {
                HSSFRow row = st.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // NOTE(review): POI appears to document getLastCellNum() as "last index plus
                // one"; if so, the "+ 1" here and the "<=" below visit one extra, empty
                // column. Confirm before changing, since callers may rely on the row width.
                int tempRowSize = row.getLastCellNum() + 1;
                if (tempRowSize > rowSize) {
                    rowSize = tempRowSize;
                }
                String[] values = new String[rowSize];
                Arrays.fill(values, "");
                boolean hasValue = false;
                for (short columnIndex = 0; columnIndex <= row.getLastCellNum(); columnIndex++) {
                    String value = "";
                    HSSFCell cell = row.getCell(columnIndex);
                    if (cell != null) {
                        switch (cell.getCellType()) {
                        case HSSFCell.CELL_TYPE_STRING:
                            value = cell.getStringCellValue();
                            break;
                        case HSSFCell.CELL_TYPE_NUMERIC:
                            if (HSSFDateUtil.isCellDateFormatted(cell)) {
                                Date date = cell.getDateCellValue();
                                if (date != null) {
                                    value = new SimpleDateFormat("yyyy-MM-dd").format(date);
                                } else {
                                    value = "";
                                }
                            } else {
                                value = new DecimalFormat("0").format(cell.getNumericCellValue());
                            }
                            break;
                        case HSSFCell.CELL_TYPE_FORMULA:
                            // Formula cells: prefer the string result, else the numeric one.
                            // NOTE(review): getStringCellValue() may throw for a numeric
                            // formula on some POI versions — verify against the version in use.
                            if (!cell.getStringCellValue().equals("")) {
                                value = cell.getStringCellValue();
                            } else {
                                value = cell.getNumericCellValue() + "";
                            }
                            break;
                        case HSSFCell.CELL_TYPE_BLANK:
                            break;
                        case HSSFCell.CELL_TYPE_ERROR:
                            value = "";
                            break;
                        case HSSFCell.CELL_TYPE_BOOLEAN:
                            value = cell.getBooleanCellValue() ? "Y" : "N";
                            break;
                        default:
                            value = "";
                        }
                    }
                    // A blank first column ends the row before anything is stored, so the
                    // whole row is dropped.
                    if (columnIndex == 0 && value.trim().equals("")) {
                        break;
                    }
                    values[columnIndex] = rightTrim(value);
                    hasValue = true;
                }
                if (hasValue) {
                    result.add(values);
                }
            }
        }
    }
    return result.toArray(new String[result.size()][]);
}
/**
 * Removes the space characters (U+0020) at the end of a string.
 * <p>
 * Only plain spaces are stripped; tabs and other whitespace are kept.
 *
 * @param str the string to process, may be {@code null}
 * @return the string without trailing spaces, or an empty string for {@code null}
 */
public static String rightTrim(String str) {
    if (str == null) {
        return "";
    }
    int end = str.length();
    while (end > 0 && str.charAt(end - 1) == ' ') {
        end--;
    }
    return str.substring(0, end);
}
}
以上是关于54一个大文件夹下所有文件的读取和检索!支持英文的主要内容,如果未能解决你的问题,请参考以下文章