hadoop 将HDFS上多个小文件合并到SequenceFile里

Posted 2021-02-09 我是属车的

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了hadoop 将HDFS上多个小文件合并到SequenceFile里相关的知识，希望对你有一定的参考价值。

背景：hdfs上的文件大小最好是hdfs块大小的N倍。如果文件太小，会浪费namenode的元数据存储空间以及内存；如果文件分块不合理，也会影响mapreduce中map的效率。

本例中将小文件的文件名作为key，其内容作为value生成SequenceFile

1、生成文件

 //将目标目录的所有文件以文件名为key，内容为value放入SequenceFile中
    //第一个参数是需要打包的目录，第二个参数生成的文件路径和名称
    /**
     * Packs every file found under a source directory into a single SequenceFile.
     *
     * <p>Each record uses the file's path string (as returned by {@code getFiles}) as a
     * {@link Text} key and the file's raw content as a {@link BytesWritable} value.
     * Records are written with {@code RECORD} compression. An existing destination is
     * deleted before writing.
     *
     * @param args {@code args[0]} is the directory to pack, {@code args[1]} is the path of
     *             the SequenceFile to create
     * @throws IOException if listing, reading or writing fails, or if a source file is too
     *                     large to be held in a single byte array
     */
    private static void combineToSequenceFile(String[] args) throws IOException {
        String sourceDir = args[0];
        String destFile = args[1];

        // List the inputs first: getFiles closes the FileSystem it obtains, so the handle
        // used below is acquired only afterwards.
        List<String> files = getFiles(sourceDir);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        try {
            Path destPath = new Path(destFile);
            if (fs.exists(destPath)) {
                fs.delete(destPath, true);
            }

            // try-with-resources so that a failed close (i.e. a failed final flush of the
            // output) is reported instead of being silently swallowed.
            try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(destPath),
                    SequenceFile.Writer.keyClass(Text.class),
                    SequenceFile.Writer.valueClass(BytesWritable.class),
                    SequenceFile.Writer.compression(SequenceFile.CompressionType.RECORD))) {
                for (String file : files) {
                    Path path = new Path(file);
                    System.out.println("读取文件：" + path.toString());

                    Text key = new Text(file);
                    BytesWritable value = new BytesWritable(readFileBytes(fs, path));

                    System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value.getLength());
                    writer.append(key, value);
                }
            }
        } finally {
            IOUtils.closeStream(fs);
        }
    }

    /**
     * Reads the content of one file into memory and logs the expected and actual byte counts.
     *
     * <p>Only small files are supported: the whole content must fit in one byte array.
     *
     * @param fs   file system used to stat and open the file
     * @param path file to read
     * @return an array sized to the file's reported length, filled with the bytes read
     * @throws IOException if the file cannot be read or is larger than {@code Integer.MAX_VALUE} bytes
     */
    private static byte[] readFileBytes(FileSystem fs, Path path) throws IOException {
        long fileLength = fs.getFileStatus(path).getLen();
        if (fileLength > Integer.MAX_VALUE) {
            // A plain (int) cast would silently wrap around for such files.
            throw new IOException("File too large to store as a single record: "
                    + path + " (" + fileLength + " bytes)");
        }

        int length = (int) fileLength;
        byte[] bytes = new byte[length];
        int offset = 0;
        // The stream is closed per file; previously only the last opened stream was closed.
        try (FSDataInputStream in = fs.open(path)) {
            // Read straight into the destination and never past its end, even if the
            // file has grown since its length was queried.
            while (offset < length) {
                int readLength = in.read(bytes, offset, length - offset);
                if (readLength <= 0) {
                    break;
                }
                offset += readLength;
            }
        }
        System.out.println("file length:" + length + ",read length:" + offset);
        return bytes;
    }

查找文件：

    /**
     * Recursively lists all files under a path.
     *
     * <p>Directories are descended into depth-first in the order returned by
     * {@code FileSystem.listStatus}; only non-directory entries are reported.
     *
     * @param dir a single file or a directory to scan
     * @return the path string ({@code FileStatus.getPath().toString()}) of every file found
     * @throws IOException if a listing fails
     */
    private static List<String> getFiles(String dir) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = null;
        List<String> filelist = new ArrayList<>();
        try {
            // One handle serves the whole traversal instead of being re-acquired and
            // closed again for every sub-directory.
            fs = FileSystem.get(conf);
            collectFiles(fs, new Path(dir), filelist);
            return filelist;
        } finally {
            IOUtils.closeStream(fs);
        }
    }

    /** Appends the path string of every file below {@code path} to {@code filelist}, recursing into sub-directories. */
    private static void collectFiles(FileSystem fs, Path path, List<String> filelist) throws IOException {
        // listStatus covers both cases: a single file, or all entries of a directory.
        for (FileStatus fileStatus : fs.listStatus(path)) {
            if (fileStatus.isDirectory()) {
                collectFiles(fs, fileStatus.getPath(), filelist);
            } else {
                filelist.add(fileStatus.getPath().toString());
            }
        }
    }

2、还原压缩的SequenceFile文件

    //将combineToSequenceFile生成的文件分解成原文件。
    /**
     * Splits a SequenceFile produced by {@code combineToSequenceFile} back into individual files.
     *
     * <p>For every record, the key's string form is used as the output path (an existing
     * file is overwritten) and the value's bytes become the file content. Key and value
     * instances are created reflectively from the classes recorded in the file; the value
     * must be a {@link BytesWritable}.
     *
     * @param args {@code args[0]} is the path of the SequenceFile to unpack
     * @throws IOException if the SequenceFile cannot be read or an output file cannot be written
     * @throws ClassCastException if a record's value is not a {@link BytesWritable}
     */
    private static void extractCombineSequenceFile(String[] args) throws IOException {
        String sourceFile = args[0];
        Configuration conf = new Configuration();
        Path sourcePath = new Path(sourceFile);

        SequenceFile.Reader reader = null;
        FileSystem fs = FileSystem.get(conf);
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sourcePath));
            // Instantiate from the classes stored in the file header; when the types are
            // known up front, Text and BytesWritable could be used directly.
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            long position = reader.getPosition();
            while (reader.next(key, value)) {
                // Cast before creating the output so a wrong value type leaves no empty file behind.
                BytesWritable content = (BytesWritable) value;
                // try-with-resources closes the output even if the write fails.
                try (FSDataOutputStream out = fs.create(new Path(key.toString()), true)) {
                    // Write only the payload (the first getLength() bytes of the buffer).
                    // value.write(out) is not used because it would prepend a 4-byte length
                    // header that the original file did not contain.
                    out.write(content.getBytes(), 0, content.getLength());
                    System.out.printf("[%s]\t%s\t%s\n", position, key, out.getPos());
                }
                position = reader.getPosition();
            }
        } finally {
            IOUtils.closeStream(reader);
            IOUtils.closeStream(fs);
        }
    }

以上是关于hadoop 将HDFS上多个小文件合并到SequenceFile里的主要内容，如果未能解决你的问题，请参考以下文章