hadoop读取Excel文件转换成txt文件
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了hadoop读取Excel文件转换成txt文件相关的知识,希望对你有一定的参考价值。
功能说明:从 HDFS 读取 Excel 文件,使用 Apache POI 解析后转换成 txt 文本,并将结果写回 HDFS。
一、引入jar包
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.14</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.14</version> </dependency>
二、代码实现
package operator.excel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
/**
 * Command-line entry point that reads an Excel workbook from the Hadoop
 * file system, converts its first sheet to comma-separated text via
 * {@link ExcelParser}, and writes the result back as a text file.
 *
 * @author mashiwei (2017/6/30)
 */
public class ExcelInputFormat {

    /** Input workbook path used when no first argument is given. */
    private static final String DEFAULT_INPUT = "/kettle/excel/test.xls";

    /** Output text path used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "/kettle/excel/test.txt";

    /**
     * Converts one Excel file to a text file, one line per parsed row.
     *
     * @param args optional; {@code args[0]} overrides the input path and
     *             {@code args[1]} overrides the output path
     * @throws IOException if the input cannot be opened or the output
     *                     cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration config = new Configuration();
        FileSystem fileSystem = FileSystem.get(config);

        // Parse first, so a missing or unreadable input fails before the
        // output file has been created.
        String[] strArrayofLines;
        InputStream inputStream = fileSystem.open(new Path(input));
        try {
            // The File is only used by the parser to inspect the file name
            // extension; the data itself comes from the stream.
            strArrayofLines = ExcelParser.parseExcelData(inputStream, new File(input));
        } finally {
            inputStream.close();
        }

        FSDataOutputStream out = fileSystem.create(new Path(output));
        try {
            for (String str : strArrayofLines) {
                System.out.println("------" + str);
                // Write explicit UTF-8 bytes. writeBytes() keeps only the low
                // byte of each char and writeUTF() prepends a two-byte length,
                // either of which corrupts a plain text file.
                out.write((str + "\n").getBytes("UTF-8"));
            }
        } finally {
            out.close();
        }
    }
}
package operator.excel; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; /** * @ClassName: ExcelParser * @Description: * @Author: mashiwei * @Date: 2017/6/30 */ public class ExcelParser { private static final Log logger = LogFactory.getLog(ExcelParser.class); public static void checkFile(File file) throws IOException{ //判断文件是否存在 if(null == file){ logger.error("文件不存在!"); throw new FileNotFoundException("文件不存在!"); } //获得文件名 String fileName = file.getAbsolutePath(); //判断文件是否是excel文件 if(!fileName.endsWith("xls") && !fileName.endsWith("xlsx")){ logger.error(fileName + "不是excel文件"); throw new IOException(fileName + "不是excel文件"); } } /** * 解析is * * @param is 数据源 * @return String[] */ public static String[] parseExcelData(InputStream is,File file) { //获得文件名 String fileName = file.getAbsolutePath(); try { checkFile(file); } catch (IOException e) { e.printStackTrace(); } // 结果集 List<String> resultList = new ArrayList<String>(); Workbook workbook = null; try { // 获取Workbook if(fileName.endsWith("xls")){ //2003 workbook = new HSSFWorkbook(is); }else if(fileName.endsWith("xlsx")) { //2007 workbook = new XSSFWorkbook(is); } // 获取sheet Sheet sheet = workbook.getSheetAt(0); Iterator<Row> rowIterator = sheet.iterator(); while (rowIterator.hasNext()) { // 行 Row row = rowIterator.next(); // 字符串 StringBuilder rowString = new StringBuilder(); Iterator<Cell> colIterator = row.cellIterator(); while (colIterator.hasNext()) { Cell cell = colIterator.next(); switch (cell.getCellType()) { case 
Cell.CELL_TYPE_BOOLEAN: rowString.append(cell.getBooleanCellValue() + ","); break; case Cell.CELL_TYPE_NUMERIC: rowString.append(cell.getNumericCellValue() + ","); break; case Cell.CELL_TYPE_STRING: rowString.append(cell.getStringCellValue() + ","); break; } } String str = rowString.delete(rowString.lastIndexOf(","),rowString.lastIndexOf(",")+1).toString(); resultList.add(str); // resultList.add(rowString.toString()); } } catch (IOException e) { logger.error("IO Exception : File not found " + e); } return resultList.toArray(new String[0]); } }
以上是关于 Hadoop 读取 Excel 文件并转换成 txt 文件的主要内容。如果未能解决你的问题,请参考以下文章: