FusionInsight MRS: Reading and Writing Hudi with the Flink DataStream API

Posted by 华为云开发者联盟 (Huawei Cloud Developer Alliance)

Abstract: At present Hudi only exposes Flink SQL for reading and writing data, but in real projects some customers need to read and write Hudi with the Flink DataStream API.

This article is shared from the Huawei Cloud community post "FusionInsight MRS Flink DataStream API读写Hudi实践", by yangxiao_mrs.

The practice consists of three parts:

1) HoodiePipeline.java: wraps the Hudi core read/write interfaces and exposes them as a Hudi DataStream API.

2) WriteIntoHudi.java: writes data into Hudi using the DataStream API.

3) ReadFromHudi.java: reads Hudi data using the DataStream API.

1. HoodiePipeline.java wraps the Hudi core read/write interfaces and exposes them as a Hudi DataStream API. Key implementation logic:

Step 1: Take the column definitions, primary key, and partition key set on the builder and concatenate them into a CREATE TABLE statement with a StringBuilder.

Step 2: Register the resulting Hudi streaming table in the catalog.

Step 3: Convert the DynamicTable into a DataStream provider (DataStreamSinkProvider / DataStreamScanProvider) and then produce or consume the data.
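
For intuition, given the example table from the class Javadoc below (columns f0/f1/f2, primary key f0,f1, partition field f2) and a single illustrative 'path' option with its value elided, the DDL assembled in step 1 (and printed by getCreateHoodieTableDDL) looks roughly like this:

create table myTable(
  f0 int,
  f1 varchar(10),
  f2 varchar(20),
  PRIMARY KEY(f0,f1) NOT ENFORCED
)
PARTITIONED BY (`f2`)
with ('connector' = 'hudi',
  'path' = '...'
)

The full HoodiePipeline implementation follows.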

import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ReadableConfig;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.internal.TableEnvironmentImpl;
import org.apache.flink.table.catalog.Catalog;
import org.apache.flink.table.catalog.CatalogTable;
import org.apache.flink.table.catalog.ObjectIdentifier;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.table.connector.sink.DataStreamSinkProvider;
import org.apache.flink.table.connector.source.DataStreamScanProvider;
import org.apache.flink.table.connector.source.ScanTableSource;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.factories.DynamicTableFactory;
import org.apache.flink.table.runtime.connector.sink.SinkRuntimeProviderContext;
import org.apache.flink.table.runtime.connector.source.ScanRuntimeProviderContext;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.table.HoodieTableFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 *  A tool class to construct hoodie flink pipeline.
 *
 *  <p>How to use?</p>
 *  The method {@link #builder(String)} returns a pipeline builder. The builder
 *  can then define the hudi table columns, primary keys and partitions.
 *
 *  <p>An example:</p>
 *  <pre>
 *    HoodiePipeline.Builder builder = HoodiePipeline.builder("myTable");
 *    DataStreamSink<?> sinkStream = builder
 *        .column("f0 int")
 *        .column("f1 varchar(10)")
 *        .column("f2 varchar(20)")
 *        .pk("f0,f1")
 *        .partition("f2")
 *        .sink(input, false);
 *  </pre>
 */
public class HoodiePipeline {

  /**
   * Returns the builder for hoodie pipeline construction.
   */
  public static Builder builder(String tableName) {
    return new Builder(tableName);
  }

  /**
   * Builder for hudi source/sink pipeline construction.
   */
  public static class Builder {
    private final String tableName;
    private final List<String> columns;
    private final Map<String, String> options;

    private String pk;
    private List<String> partitions;

    private Builder(String tableName) {
      this.tableName = tableName;
      this.columns = new ArrayList<>();
      this.options = new HashMap<>();
      this.partitions = new ArrayList<>();
    }

    /**
     * Add a table column definition.
     *
     * @param column the column format should be in the form like 'f0 int'
     */
    public Builder column(String column) {
      this.columns.add(column);
      return this;
    }

    /**
     * Add primary keys.
     */
    public Builder pk(String... pks) {
      this.pk = String.join(",", pks);
      return this;
    }

    /**
     * Add partition fields.
     */
    public Builder partition(String... partitions) {
      this.partitions = new ArrayList<>(Arrays.asList(partitions));
      return this;
    }

    /**
     * Add a config option.
     */
    public Builder option(ConfigOption<?> option, Object val) {
      this.options.put(option.key(), val.toString());
      return this;
    }

    public Builder option(String key, Object val) {
      this.options.put(key, val.toString());
      return this;
    }

    public Builder options(Map<String, String> options) {
      this.options.putAll(options);
      return this;
    }

    public DataStreamSink<?> sink(DataStream<RowData> input, boolean bounded) {
      TableDescriptor tableDescriptor = getTableDescriptor();
      return HoodiePipeline.sink(input, tableDescriptor.getTableId(), tableDescriptor.getCatalogTable(), bounded);
    }

    public TableDescriptor getTableDescriptor() {
      EnvironmentSettings environmentSettings = EnvironmentSettings
          .newInstance()
          .build();
      TableEnvironmentImpl tableEnv = TableEnvironmentImpl.create(environmentSettings);
      String sql = getCreateHoodieTableDDL(this.tableName, this.columns, this.options, this.pk, this.partitions);
      tableEnv.executeSql(sql);
      String currentCatalog = tableEnv.getCurrentCatalog();
      CatalogTable catalogTable = null;
      String defaultDatabase = null;
      try {
        Catalog catalog = tableEnv.getCatalog(currentCatalog).get();
        defaultDatabase = catalog.getDefaultDatabase();
        catalogTable = (CatalogTable) catalog.getTable(new ObjectPath(defaultDatabase, this.tableName));
      } catch (TableNotExistException e) {
        throw new HoodieException("Create table " + this.tableName + " exception", e);
      }
      ObjectIdentifier tableId = ObjectIdentifier.of(currentCatalog, defaultDatabase, this.tableName);
      return new TableDescriptor(tableId, catalogTable);
    }

    public DataStream<RowData> source(StreamExecutionEnvironment execEnv) {
      TableDescriptor tableDescriptor = getTableDescriptor();
      return HoodiePipeline.source(execEnv, tableDescriptor.tableId, tableDescriptor.getCatalogTable());
    }
  }

  private static String getCreateHoodieTableDDL(
      String tableName,
      List<String> fields,
      Map<String, String> options,
      String pkField,
      List<String> partitionField) {
    StringBuilder builder = new StringBuilder();
    builder.append("create table ")
        .append(tableName)
        .append("(\n");
    for (String field : fields) {
      builder.append("  ")
          .append(field)
          .append(",\n");
    }
    builder.append("  PRIMARY KEY(")
        .append(pkField)
        .append(") NOT ENFORCED\n")
        .append(")\n");
    if (!partitionField.isEmpty()) {
      String partitons = partitionField
          .stream()
          .map(partitionName -> "`" + partitionName + "`")
          .collect(Collectors.joining(","));
      builder.append("PARTITIONED BY (")
          .append(partitons)
          .append(")\n");
    }
    builder.append("with ('connector' = 'hudi'");
    options.forEach((k, v) -> builder
        .append(",\n")
        .append("  '")
        .append(k)
        .append("' = '")
        .append(v)
        .append("'"));
    builder.append("\n)");

    System.out.println(builder.toString());
    return builder.toString();
  }

  /**
   * Returns the data stream sink with given catalog table.
   *
   * @param input        The input datastream
   * @param tablePath    The table path to the hoodie table in the catalog
   * @param catalogTable The hoodie catalog table
   * @param isBounded    A flag indicating whether the input data stream is bounded
   */
  private static DataStreamSink<?> sink(DataStream<RowData> input, ObjectIdentifier tablePath, CatalogTable catalogTable, boolean isBounded) {
    DefaultDynamicTableContext context = new DefaultDynamicTableContext(tablePath, catalogTable,
        Configuration.fromMap(catalogTable.getOptions()), Thread.currentThread().getContextClassLoader(), false);
    HoodieTableFactory hoodieTableFactory = new HoodieTableFactory();
    return ((DataStreamSinkProvider) hoodieTableFactory.createDynamicTableSink(context)
        .getSinkRuntimeProvider(new SinkRuntimeProviderContext(isBounded)))
        .consumeDataStream(input);
  }

  /**
   * Returns the data stream source with given catalog table.
   *
   * @param execEnv      The execution environment
   * @param tablePath    The table path to the hoodie table in the catalog
   * @param catalogTable The hoodie catalog table
   */
  private static DataStream<RowData> source(StreamExecutionEnvironment execEnv, ObjectIdentifier tablePath, CatalogTable catalogTable) {
    DefaultDynamicTableContext context = new DefaultDynamicTableContext(tablePath, catalogTable,
        Configuration.fromMap(catalogTable.getOptions()), Thread.currentThread().getContextClassLoader(), false);
    HoodieTableFactory hoodieTableFactory = new HoodieTableFactory();
    DataStreamScanProvider dataStreamScanProvider = (DataStreamScanProvider) ((ScanTableSource) hoodieTableFactory
        .createDynamicTableSource(context))
        .getScanRuntimeProvider(new ScanRuntimeProviderContext());
    return dataStreamScanProvider.produceDataStream(execEnv);
  }

  /**
   * A POJO that contains tableId and resolvedCatalogTable.
   */
  public static class TableDescriptor {
    private ObjectIdentifier tableId;
    private CatalogTable catalogTable;

    public TableDescriptor(ObjectIdentifier tableId, CatalogTable catalogTable) {
      this.tableId = tableId;
      this.catalogTable = catalogTable;
    }

    public ObjectIdentifier getTableId() {
      return tableId;
    }

    public CatalogTable getCatalogTable() {
      return catalogTable;
    }
  }

  private static class DefaultDynamicTableContext implements DynamicTableFactory.Context {

    private final ObjectIdentifier objectIdentifier;
    private final CatalogTable catalogTable;
    private final ReadableConfig configuration;
    private final ClassLoader classLoader;
    private final boolean isTemporary;

    DefaultDynamicTableContext(
        ObjectIdentifier objectIdentifier,
        CatalogTable catalogTable,
        ReadableConfig configuration,
        ClassLoader classLoader,
        boolean isTemporary) {
      this.objectIdentifier = objectIdentifier;
      this.catalogTable = catalogTable;
      this.configuration = configuration;
      this.classLoader = classLoader;
      this.isTemporary = isTemporary;
    }

    @Override
    public ObjectIdentifier getObjectIdentifier() {
      return objectIdentifier;
    }

    @Override
    public CatalogTable getCatalogTable() {
      return catalogTable;
    }

    @Override
    public ReadableConfig getConfiguration() {
      return configuration;
    }

    @Override
    public ClassLoader getClassLoader() {
      return classLoader;
    }

    @Override
    public boolean isTemporary() {
      return isTemporary;
    }
  }
}

2. WriteIntoHudi.java writes data into Hudi using the DataStream API. Key implementation logic:

Step 1: In the demo, the source data comes from a datagen connector table.

Step 2: Convert the Table into a DataStream with toAppendStream.

Step 3: Build the Hudi sink stream and write the data into Hudi.

In real projects you can also write into Hudi directly from a DataStream source; a minimal sketch of that variant follows the listing below.

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class WriteIntoHudi {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        env.getCheckpointConfig().setCheckpointInterval(10000);

        tableEnv.executeSql("CREATE TABLE datagen (\n"
            + "  uuid varchar(20),\n"
            + "  name varchar(10),\n"
            + "  age int,\n"
            + "  ts timestamp(3),\n"
            + "  p varchar(20)\n"
            + ") WITH (\n"
            + "  'connector' = 'datagen',\n"
            + "  'rows-per-second' = '5'\n"
            + ")");

        Table table = tableEnv.sqlQuery("SELECT * FROM datagen");

        DataStream<RowData> dataStream = tableEnv.toAppendStream(table, RowData.class);
        String targetTable = "hudiSinkTable";

        String basePath = "hdfs://hacluster/tmp/flinkHudi/hudiTable";

        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), basePath);
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.PRECOMBINE_FIELD.key(), "ts");
        options.put(FlinkOptions.INDEX_BOOTSTRAP_ENABLED.key(), "true");

        HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
            .column("uuid VARCHAR(20)")
            .column("name VARCHAR(10)")
            .column("age INT")
            .column("ts TIMESTAMP(3)")
            .column("p VARCHAR(20)")
            .pk("uuid")
            .partition("p")
            .options(options);

        builder.sink(dataStream, false); // the second parameter indicates whether the input data stream is bounded
        env.execute("Api_Sink");
    }
}
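
As noted above, in a real project the input does not have to come from a datagen table: any DataStream<RowData> can be handed to the builder's sink(). Below is a minimal, illustrative sketch that builds the RowData stream by hand; the class name WriteDataStreamIntoHudi, the sample records, and the comma-split parsing are assumptions for demonstration only, and a Kafka or other real source would typically replace fromElements.

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class WriteDataStreamIntoHudi {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.getCheckpointConfig().setCheckpointInterval(10000); // Hudi commits on checkpoints

        // Stand-in for a real DataStream source such as a Kafka consumer:
        // each record is converted to RowData in the positional order of the columns below.
        DataStream<RowData> dataStream = env
            .fromElements("id1,Danny,23,par1", "id2,Stephen,33,par2")
            .map(line -> {
                String[] f = line.split(",");
                GenericRowData row = new GenericRowData(5);
                row.setField(0, StringData.fromString(f[0]));                                // uuid
                row.setField(1, StringData.fromString(f[1]));                                // name
                row.setField(2, Integer.parseInt(f[2]));                                     // age
                row.setField(3, TimestampData.fromEpochMillis(System.currentTimeMillis()));  // ts
                row.setField(4, StringData.fromString(f[3]));                                // p
                return (RowData) row;
            })
            .returns(TypeInformation.of(RowData.class));

        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), "hdfs://hacluster/tmp/flinkHudi/hudiTable");
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.PRECOMBINE_FIELD.key(), "ts");

        HoodiePipeline.builder("hudiSinkTable")
            .column("uuid VARCHAR(20)")
            .column("name VARCHAR(10)")
            .column("age INT")
            .column("ts TIMESTAMP(3)")
            .column("p VARCHAR(20)")
            .pk("uuid")
            .partition("p")
            .options(options)
            .sink(dataStream, false);

        env.execute("DataStream_Api_Sink");
    }
}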

3. ReadFromHudi.java reads Hudi data using the DataStream API. Key implementation logic:

Step 1: Build the Hudi source stream and read the Hudi data.

Step 2: Convert the stream into a Table with fromDataStream.

Step 3: Print the Hudi table data using the print connector.

In real projects you can also read the Hudi data and feed it straight into a downstream DataStream sink; see the sketch after the listing below.

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class ReadFromHudi {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String targetTable = "hudiSourceTable";
        String basePath = "hdfs://hacluster/tmp/flinkHudi/hudiTable";

        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), basePath);
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.READ_AS_STREAMING.key(), "true"); // this option enables streaming read
        options.put("read.streaming.start-commit", "20210316134557"); // specifies the start commit instant time

        HoodiePipeline.Builder builder = HoodiePipeline.builder(targetTable)
            .column("uuid VARCHAR(20)")
            .column("name VARCHAR(10)")
            .column("age INT")
            .column("ts TIMESTAMP(3)")
            .column("p VARCHAR(20)")
            .pk("uuid")
            .partition("p")
            .options(options);

        DataStream<RowData> rowDataDataStream = builder.source(env);

        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        Table table = tableEnv.fromDataStream(rowDataDataStream, "uuid, name, age, ts, p");

        tableEnv.registerTable("hudiSourceTable", table);

        tableEnv.executeSql("CREATE TABLE print("
            + "   uuid varchar(20),\n"
            + "   name varchar(10),\n"
            + "   age int,\n"
            + "   ts timestamp(3),\n"
            + "   p varchar(20)\n"
            + ") WITH (\n"
            + " 'connector' = 'print'\n"
            + ")");

        tableEnv.executeSql("insert into print select * from hudiSourceTable");
        env.execute("Api_Source");
    }
}
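
As noted above, the stream returned by builder.source() can also be consumed directly with DataStream operators instead of being registered as a table. A minimal, illustrative sketch follows; the class name ReadHudiIntoDataStreamSink and the string formatting are assumptions, and any DataStream sink can replace print().

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;

import java.util.HashMap;
import java.util.Map;

public class ReadHudiIntoDataStreamSink {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Map<String, String> options = new HashMap<>();
        options.put(FlinkOptions.PATH.key(), "hdfs://hacluster/tmp/flinkHudi/hudiTable");
        options.put(FlinkOptions.TABLE_TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
        options.put(FlinkOptions.READ_AS_STREAMING.key(), "true");

        // Build the Hudi source stream with the same five-column schema as above.
        DataStream<RowData> rowDataDataStream = HoodiePipeline.builder("hudiSourceTable")
            .column("uuid VARCHAR(20)")
            .column("name VARCHAR(10)")
            .column("age INT")
            .column("ts TIMESTAMP(3)")
            .column("p VARCHAR(20)")
            .pk("uuid")
            .partition("p")
            .options(options)
            .source(env);

        // Consume the Hudi records directly with DataStream operators;
        // any DataStream sink could be attached here instead of print().
        rowDataDataStream
            .map(row -> String.format("uuid=%s, name=%s, age=%d, ts=%s, p=%s",
                row.getString(0), row.getString(1), row.getInt(2),
                row.getTimestamp(3, 3), row.getString(4)))
            .returns(Types.STRING)
            .print();

        env.execute("Api_Source_DataStream");
    }
}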

4. If a project needs to parse complex JSON from Kafka, there are two options:

1) Use Flink SQL: https://bbs.huaweicloud.com/forum/thread-153494-1-1.html

2) Implement the parsing with a Flink DataStream MapFunction, as sketched below.
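
For option 2, below is a minimal, illustrative sketch of such a MapFunction. It assumes Jackson is available on the classpath and a nested payload of the form {"id": ..., "user": {"name": ..., "age": ...}, "ts": <epoch millis>, "p": ...}; the class name, the JSON field paths, and the target five-column schema mirror the examples above and are assumptions, not a fixed API.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;

/**
 * Parses one Kafka JSON message into a RowData matching the five-column Hudi schema above.
 * The JSON layout is assumed for illustration; adapt the paths to the real payload.
 */
public class JsonToRowDataMapFunction implements MapFunction<String, RowData> {

    private transient ObjectMapper mapper;

    @Override
    public RowData map(String json) throws Exception {
        if (mapper == null) {
            mapper = new ObjectMapper(); // lazily created: ObjectMapper is not serializable
        }
        JsonNode root = mapper.readTree(json);
        GenericRowData row = new GenericRowData(5);
        row.setField(0, StringData.fromString(root.path("id").asText()));            // uuid
        row.setField(1, StringData.fromString(root.path("user").path("name").asText())); // name
        row.setField(2, root.path("user").path("age").asInt());                      // age
        row.setField(3, TimestampData.fromEpochMillis(root.path("ts").asLong()));    // ts
        row.setField(4, StringData.fromString(root.path("p").asText()));             // p
        return row;
    }
}

The resulting stream, for example kafkaStream.map(new JsonToRowDataMapFunction()), can then be written into Hudi through HoodiePipeline.Builder#sink exactly as in WriteIntoHudi.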
