-- Extract the application date (two digits / two digits / four digits) from
-- every message line that begins with the "Date of application: " label.
WITH raw_messages AS (
  -- Keep only the lines that start with the label.
  SELECT lines
  FROM `my_table`
  WHERE REGEXP_CONTAINS(lines, r'^Date of application: ')
)
SELECT
  -- {n} is the repetition quantifier: without the braces the pattern would
  -- look for one digit followed by a literal "2" or "4" and never match a date.
  -- The capture group makes REGEXP_EXTRACT return only the date rather than
  -- the whole match including the label.
  REGEXP_EXTRACT(
    lines,
    r'^Date of application: ([0-9]{2}/[0-9]{2}/[0-9]{4})'
  ) AS date
FROM raw_messages
#standardSQL
-- Split each "Label: value" message line into one output column per label,
-- without relying on the order in which the labels appear inside the line.
WITH raw_messages AS (
  -- Only lines that start with the "Date of application: " label are parsed.
  SELECT lines
  FROM `project.dataset.my_table`
  WHERE REGEXP_CONTAINS(lines, '^Date of application: ')
),
fields AS (
  -- Every label that may occur in a line, paired with its output column name.
  -- An empty column name marks a label that is used only as a delimiter.
  SELECT 'Date of application' AS field, 'date' AS column UNION ALL
  SELECT 'Request', 'request' UNION ALL
  SELECT 'Contact', 'contact' UNION ALL
  SELECT 'email', 'email' UNION ALL
  SELECT 'Tel', 'phone' UNION ALL
  SELECT 'Order ID', 'id' UNION ALL
  SELECT 'BL', 'bl' UNION ALL
  SELECT 'Product', 'product' UNION ALL
  SELECT 'Ordered inquiry', '' UNION ALL
  SELECT 'Boundary of string', ''
),
patterns AS (
  -- One case-insensitive regex per ordered pair of labels: it captures the
  -- text that follows label f1 up to label f2.
  SELECT
    f1.field,
    f1.column,
    CONCAT(r'(?i) ', f1.field, ': (.*)', f2.field, ': ') AS pattern
  FROM fields AS f1
  CROSS JOIN fields AS f2
),
splits AS (
  -- For every line, build an array of (column, value) structs.
  SELECT
    ARRAY(
      SELECT AS STRUCT
        column,
        -- Among all candidate captures for a column, keep the shortest one.
        ARRAY_AGG(value ORDER BY LENGTH(value) LIMIT 1)[OFFSET(0)] AS value
      FROM (
        SELECT
          column,
          -- The artificial boundary label is added on both sides of the line
          -- so that the first and last real fields also have a delimiter.
          REGEXP_EXTRACT(
            CONCAT(' Boundary of string: ', lines, ' Boundary of string: '),
            pattern
          ) AS value
        FROM patterns
      )
      WHERE value IS NOT NULL
        AND column != ''
      GROUP BY column
    ) AS arr
  FROM raw_messages
)
SELECT
  (SELECT value FROM UNNEST(arr) WHERE column = 'date') AS DATE,
  (SELECT value FROM UNNEST(arr) WHERE column = 'request') AS request,
  (SELECT value FROM UNNEST(arr) WHERE column = 'contact') AS contact,
  (SELECT value FROM UNNEST(arr) WHERE column = 'email') AS email,
  (SELECT value FROM UNNEST(arr) WHERE column = 'phone') AS phone,
  (SELECT value FROM UNNEST(arr) WHERE column = 'id') AS id,
  (SELECT value FROM UNNEST(arr) WHERE column = 'bl') AS bl,
  (SELECT value FROM UNNEST(arr) WHERE column = 'product') AS product
FROM splits
您可以使用与我的另一个答案相同的虚拟数据进行上述测试,显然结果应该是相同的
注意:如您所见,您需要在 fields AS (...) CTE 中显式列出字符串中的所有字段及其对应的列名(顺序任意)。重要的是,您还需要在其中额外添加一个条目:'Boundary of string'
【讨论】:
【参考方案2】:
以下是 BigQuery 标准 SQL
假设每行中各字段的顺序与您示例中的顺序一致
#standardSQL
-- Extract each labelled field from a message line, assuming the labels always
-- appear in the fixed order: Date of application, Request, Contact, email,
-- Tel, Ordered inquiry, Order ID, BL, Product.
WITH raw_messages AS (
  -- Only lines that start with the "Date of application: " label are parsed.
  SELECT lines
  FROM `my_table`
  WHERE REGEXP_CONTAINS(lines, '^Date of application: ')
)
SELECT
  -- {n} is the repetition quantifier; without the braces the pattern would
  -- require a literal "2"/"4" after a single digit and always yield NULL.
  REGEXP_EXTRACT(lines, r'(?i)^Date of application: ([0-9]{2}/[0-9]{2}/[0-9]{4})') AS DATE,
  -- Each remaining pattern lazily captures the text between one label and the
  -- next; (?i) makes label matching case-insensitive.
  REGEXP_EXTRACT(lines, r'(?i) Request: (.*?) Contact: ') AS request,
  REGEXP_EXTRACT(lines, r'(?i) Contact: (.*?) email: ') AS contact,
  REGEXP_EXTRACT(lines, r'(?i) email: (.*?) Tel: ') AS email,
  REGEXP_EXTRACT(lines, r'(?i) Tel: (.*?) Ordered inquiry: ') AS phone,
  REGEXP_EXTRACT(lines, r'(?i) Order ID: (.*?) BL: ') AS id,
  REGEXP_EXTRACT(lines, r'(?i) BL: (.*?) Product: ') AS bl,
  -- The last field runs to the end of the line.
  REGEXP_EXTRACT(lines, r'(?i) Product: (.*?)$') AS product
FROM raw_messages
您可以使用您问题中的虚拟数据进行测试,如下所示
#standardSQL
-- Self-contained demo of the fixed-order field extraction: the table is
-- replaced by an inline CTE holding two sample message lines.
WITH `project.dataset.my_table` AS (
  SELECT 'Date of application: 01/02/2018 Request: Buy books Contact: email: hi@gmail.com Tel: 0123456789 Ordered inquiry: Order ID: 12345678 BL: 87654321 Product: 123456 Books' AS lines UNION ALL
  SELECT 'Date of application: 01/04/2018 Request: Retour table Contact: Rodion Raskólnikov email: hello@outlook.com Tel: 9876543210 Ordered inquiry: Order Id: 87654321 BL: 12345678 Product: 654321 Tables'
),
raw_messages AS (
  -- Only lines that start with the "Date of application: " label are parsed.
  SELECT lines
  FROM `project.dataset.my_table`
  WHERE REGEXP_CONTAINS(lines, '^Date of application: ')
)
SELECT
  -- {n} is the repetition quantifier; without the braces the pattern would
  -- require a literal "2"/"4" after a single digit and always yield NULL.
  REGEXP_EXTRACT(lines, r'(?i)^Date of application: ([0-9]{2}/[0-9]{2}/[0-9]{4})') AS DATE,
  -- Each remaining pattern lazily captures the text between one label and the
  -- next; (?i) lets "Order ID" also match the "Order Id" spelling in row two.
  REGEXP_EXTRACT(lines, r'(?i) Request: (.*?) Contact: ') AS request,
  -- A field with nothing between its label and the next one (the Contact of
  -- the first sample row) yields NULL, because the pattern needs a space on
  -- both sides of the captured value.
  REGEXP_EXTRACT(lines, r'(?i) Contact: (.*?) email: ') AS contact,
  REGEXP_EXTRACT(lines, r'(?i) email: (.*?) Tel: ') AS email,
  REGEXP_EXTRACT(lines, r'(?i) Tel: (.*?) Ordered inquiry: ') AS phone,
  REGEXP_EXTRACT(lines, r'(?i) Order ID: (.*?) BL: ') AS id,
  REGEXP_EXTRACT(lines, r'(?i) BL: (.*?) Product: ') AS bl,
  -- The last field runs to the end of the line.
  REGEXP_EXTRACT(lines, r'(?i) Product: (.*?)$') AS product
FROM raw_messages