-- Apache log analysis with Hadoop, Hive, and HBase.
--
-- (Originally published as a blog post; the scraped header has been
-- translated and converted to comments so the file parses as SQL.)
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
-- Facebook, because it allows them to query enormous Hadoop data
-- stores using a language much like SQL.
-- Our logs are stored on the Hadoop Distributed File System, in the
-- directory /logs/randomhacks.net/access. They're ordinary Apache
-- logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,
-- and use a regular expression to split lines into database columns.
--
-- Each column below corresponds, in order, to one capture group of
-- "input.regex"; every field is kept as STRING here and cleaned up later.
-- NOTE(review): the contrib RegexSerDe is the legacy class; newer Hive
-- releases ship org.apache.hadoop.hive.serde2.RegexSerDe (which needs no
-- "output.format.string") — confirm which one is on the cluster classpath.
CREATE EXTERNAL TABLE access(
host STRING,     -- group 1: client host/IP
identity STRING, -- group 2: identd field (usually "-")
user STRING,     -- group 3: HTTP auth user (usually "-")
time STRING,     -- group 4: bracketed timestamp, e.g. "[dd/MMM/yyyy:HH:mm:ss Z]", or "-"
request STRING,  -- group 5: quoted request line, e.g. "\"GET /x HTTP/1.1\""
status STRING,   -- group 6: HTTP status code, or "-"
size STRING,     -- group 7: response size in bytes, or "-"
referer STRING,  -- group 8: quoted referrer (optional trailing field)
agent STRING)    -- group 9: quoted user agent (optional trailing field)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE
LOCATION '/logs/randomhacks.net/access';
-- We want to store our logs in HBase, which is designed to hold tables
-- with billions of rows and millions of columns. HBase stores rows
-- sorted by primary key, so you can efficiently read all records within
-- a given range of keys.
--
-- Here, our key is a Unix time stamp, and we assume that it always has
-- the same number of digits (hey, this was a dodgy late night hack).
-- So we could easily grab all the records from a specific time period.
--
-- We store our data in 3 column families: "m" for metadata, "r" for
-- referrer data, and "a" for user-agent data. This allows us to only
-- load a subset of columns for a given query.
--
-- IMPORTANT: "hbase.columns.mapping" is purely positional — its entries
-- must stay in one-to-one order with the columns declared below, and
-- ":key" must map the first column. Reordering either list silently
-- writes data into the wrong HBase cells.
CREATE EXTERNAL TABLE access_hbase(
key STRING, -- Unix time + ":" + unique identifier (maps to the HBase row key).
host STRING, -- The IP address of the host making the request.
identity STRING, -- identd field, unquoted raw log data (usually "-").
user STRING, -- HTTP auth user, unquoted raw log data (usually "-").
time BIGINT, -- Unix time, UTC.
method STRING, -- "GET", etc.
path STRING, -- "/logo.png", etc.
protocol STRING, -- "HTTP/1.1", etc.
status SMALLINT, -- 200, 404, etc.
size BIGINT, -- Response size, in bytes.
referer_host STRING, -- "www.google.com", etc.
referer STRING, -- Full referrer string.
agent STRING) -- Full agent string.
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
"hbase.columns.mapping" = ":key,m:host,m:identity,m:user,m:time,m:method,m:path,m:protocol,m:status,m:size,r:referer_host,r:referer,a:agent"
)
TBLPROPERTIES ("hbase.table.name" = "randomhacks_access");
-- Copy our data from raw Apache log files to HBase, cleaning it up as we go. This is basically
-- a pseudo-SQL query which calls a few Java helpers (guid, unquote_apache, re_extract UDFs).
--
-- Note the "TABLESAMPLE" clause, which says to pick one of every 20 records at random.
--
-- The inner query computes the parsed Unix timestamp ("ts") and the unquoted
-- request line ("req") exactly once per row; the original repeated
-- unix_timestamp() in the key, the time column, AND the WHERE clause (and
-- unquote_apache(request) three times), so the three uses could drift apart
-- and each row paid for the work several times over.
INSERT OVERWRITE TABLE access_hbase
SELECT concat(cast(ts AS STRING), ":", guid()) AS key, -- timestamp-prefixed key keeps HBase rows time-ordered
       host,
       unquote_apache(identity),
       unquote_apache(user),
       ts, -- Unix time, UTC
       re_extract(req, "([^ ]*) ([^ ]*) ([^\"]*)", 1) AS method,
       re_extract(req, "([^ ]*) ([^ ]*) ([^\"]*)", 2) AS path,
       re_extract(req, "([^ ]*) ([^ ]*) ([^\"]*)", 3) AS protocol,
       cast(status AS SMALLINT) AS status,
       cast(size AS BIGINT) AS size,
       re_extract(unquote_apache(referer), "[^:]+:?/+([^/]*).*", 1) AS referer_host,
       unquote_apache(referer) AS referer,
       unquote_apache(agent)
FROM (SELECT host, identity, user, status, size, referer, agent,
             unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") AS ts,
             unquote_apache(request) AS req
      FROM access TABLESAMPLE(BUCKET 1 OUT OF 20 ON rand())) sampled
WHERE ts IS NOT NULL; -- drop rows whose timestamp failed to parse
-- Report the 50 most-requested paths on the site, busiest first.
SELECT
    path,
    count(*) AS cnt
FROM access_hbase
GROUP BY path
ORDER BY cnt DESC
LIMIT 50;
-- Categorize our articles by publication year (taken from /articles/YYYY/...
-- paths) and count how many hits each year received.
--
-- Fixes over the original: the derived table was aliased "access", shadowing
-- the base table of the same name (confusing and bug-prone); the count column
-- was unnamed; and the output order was nondeterministic, so ORDER BY is
-- added for stable, readable results.
SELECT
    pubyear,
    count(*) AS hits
FROM (
    -- NULL for any path that is not an article page, filtered out below.
    SELECT re_extract(path, "/articles/([0-9]+)/.*", 1) AS pubyear
    FROM access_hbase
) article_hits
WHERE pubyear IS NOT NULL
GROUP BY pubyear
ORDER BY pubyear;
-- End of script. (Scraped blog footer translated and converted to a comment.)