hadoop读取Excel文件转换成txt文件

Posted 2020-09-26

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了hadoop读取Excel文件转换成txt文件相关的知识，希望对你有一定的参考价值。

功能说明：从 HDFS 读取 Excel 文件（.xls 或 .xlsx），通过 Apache POI 解析第一个工作表，将每一行转换成以逗号分隔的文本行，并把结果作为 txt 文件写回 HDFS。

一、引入jar包

<!-- Core POI artifact: supplies org.apache.poi.hssf.usermodel.HSSFWorkbook, which the parser below uses for .xls files. -->
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->  
<dependency>  
    <groupId>org.apache.poi</groupId>  
    <artifactId>poi</artifactId>  
    <version>3.14</version>  
</dependency>  
<!-- OOXML artifact: supplies org.apache.poi.xssf.usermodel.XSSFWorkbook, which the parser below uses for .xlsx files. -->
<!-- Keep its version identical to the core poi artifact above. -->
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->  
<dependency>  
    <groupId>org.apache.poi</groupId>  
    <artifactId>poi-ooxml</artifactId>  
    <version>3.14</version>  
</dependency>

二、代码实现

package operator.excel;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

/**
 * Command-line entry point that reads an Excel workbook from the configured Hadoop
 * file system, converts it to text with {@link ExcelParser}, and writes the result
 * back to the same file system as a plain text file, one line per spreadsheet row.
 *
 * <p>Author: mashiwei. Originally written 2017/6/30.
 */
public class ExcelInputFormat {

    /**
     * Converts {@code /kettle/excel/test.xls} into {@code /kettle/excel/test.txt}.
     *
     * <p>Each parsed row is echoed to standard output (prefixed with {@code ------})
     * and written to the output file as UTF-8 text terminated by a single
     * {@code '\n'} byte.
     *
     * @param args ignored
     * @throws IOException if the input cannot be opened or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String output = "/kettle/excel/test.txt";
        String input = "/kettle/excel/test.xls";

        Configuration config = new Configuration();
        FileSystem fileSystem = FileSystem.get(config);

        // Open the input first so that a missing source file does not leave an
        // empty output file behind.
        InputStream inputStream = fileSystem.open(new Path(input));
        try {
            // The File is only used by the parser to inspect the file name's
            // extension; the actual bytes come from the HDFS stream.
            String[] strArrayofLines = ExcelParser.parseExcelData(inputStream, new File(input));

            FSDataOutputStream out = fileSystem.create(new Path(output));
            try {
                for (String str : strArrayofLines) {
                    System.out.println("------" + str);
                    // writeBytes() would drop the high byte of every char and
                    // writeUTF() would prepend a 2-byte length to the newline;
                    // write raw UTF-8 bytes instead so the result is real text.
                    out.write(str.getBytes("UTF-8"));
                    out.write('\n');
                }
            } finally {
                out.close();
            }
        } finally {
            inputStream.close();
        }
    }
}

package operator.excel;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Converts the first sheet of an Excel workbook into comma-separated text lines.
 *
 * <p>Workbooks whose file name ends in {@code xlsx} are read with {@link XSSFWorkbook};
 * those ending in {@code xls} are read with {@link HSSFWorkbook}. The extension
 * comparison ignores case.
 *
 * <p>Author: mashiwei. Originally written 2017/6/30.
 */
public class ExcelParser {
    private static final Log logger = LogFactory.getLog(ExcelParser.class);

    /** File name suffix of the legacy binary workbook format. */
    private static final String XLS = "xls";
    /** File name suffix of the OOXML workbook format. */
    private static final String XLSX = "xlsx";

    /**
     * Verifies that {@code file} is non-null and that its name looks like an Excel file.
     *
     * <p>Only the file name is inspected; the file is not required to exist on the
     * local file system.
     *
     * @param file the file whose name should be validated
     * @throws FileNotFoundException if {@code file} is {@code null}
     * @throws IOException if the file name ends in neither {@code xls} nor {@code xlsx}
     */
    public static void checkFile(File file) throws IOException {
        // A null reference is reported as a missing file.
        if (null == file) {
            logger.error("文件不存在！");
            throw new FileNotFoundException("文件不存在！");
        }
        String fileName = file.getAbsolutePath();
        // Reject anything that does not carry an Excel extension.
        if (!hasSuffix(fileName, XLS) && !hasSuffix(fileName, XLSX)) {
            logger.error(fileName + "不是excel文件");
            throw new IOException(fileName + "不是excel文件");
        }
    }

    /**
     * Parses the first sheet of the workbook supplied by {@code is}.
     *
     * <p>Every row becomes one string in which the values of its boolean, numeric
     * and string cells are joined with {@code ","}. Cells of any other type are
     * skipped. A row that yields no values produces an empty string.
     *
     * <p>If the workbook cannot be read because of an {@link IOException}, the error
     * is logged and an empty array is returned.
     *
     * @param is   stream positioned at the start of the workbook data
     * @param file used only for its name, which selects the workbook format
     * @return one comma-separated line per row of the first sheet
     * @throws IllegalArgumentException if {@code file} is {@code null} or does not
     *         have an Excel extension
     */
    public static String[] parseExcelData(InputStream is, File file) {
        try {
            checkFile(file);
        } catch (IOException e) {
            // Fail fast with the real reason instead of continuing with no
            // workbook and hitting a NullPointerException further down.
            throw new IllegalArgumentException("Cannot parse as Excel: " + file, e);
        }
        String fileName = file.getAbsolutePath();

        List<String> resultList = new ArrayList<String>();
        Workbook workbook = null;
        try {
            if (hasSuffix(fileName, XLSX)) {
                // Excel 2007+ (OOXML)
                workbook = new XSSFWorkbook(is);
            } else {
                // Excel 97-2003 (binary)
                workbook = new HSSFWorkbook(is);
            }

            Sheet sheet = workbook.getSheetAt(0);
            for (Row row : sheet) {
                resultList.add(toLine(row));
            }
        } catch (IOException e) {
            logger.error("IO Exception : failed to read workbook " + fileName, e);
        } finally {
            if (workbook != null) {
                try {
                    workbook.close();
                } catch (IOException e) {
                    logger.warn("Failed to close workbook " + fileName, e);
                }
            }
        }

        return resultList.toArray(new String[0]);
    }

    /** Joins the boolean, numeric and string cell values of {@code row} with commas. */
    private static String toLine(Row row) {
        StringBuilder line = new StringBuilder();
        for (Cell cell : row) {
            switch (cell.getCellType()) {
                case Cell.CELL_TYPE_BOOLEAN:
                    line.append(cell.getBooleanCellValue()).append(',');
                    break;
                case Cell.CELL_TYPE_NUMERIC:
                    line.append(cell.getNumericCellValue()).append(',');
                    break;
                case Cell.CELL_TYPE_STRING:
                    line.append(cell.getStringCellValue()).append(',');
                    break;
                default:
                    // Other cell types contribute nothing to the line.
                    break;
            }
        }
        // Drop the trailing separator; guard against rows that produced no
        // values at all, for which there is nothing to remove.
        if (line.length() > 0) {
            line.setLength(line.length() - 1);
        }
        return line.toString();
    }

    /** Returns whether {@code name} ends with {@code suffix}, ignoring case. */
    private static boolean hasSuffix(String name, String suffix) {
        int start = name.length() - suffix.length();
        return start >= 0 && name.regionMatches(true, start, suffix, 0, suffix.length());
    }
}

以上是关于hadoop读取Excel文件转换成txt文件的主要内容，如果未能解决你的问题，请参考以下文章。