Filter Hive Compactions

From Beeline or a standard JDBC client connected to Hive, compactions can be seen with the standard SQL:

SHOW COMPACTIONS;

But this method has a couple of problems:

  1. No Filtering
  2. Timestamps are hard to interpret

Until additional functionality is available for this built in function, we can do the following.

Add links to the Metastore DB tables and create custom views to review compaction details. See The Power of Hive JDBC Federation

Build Out Metadata Elements

A word of caution here. All the ‘extras’ you create against the metastore DB can/may break with the next release. This isn’t a supported method of accessing metadata.

Don’t add new tables to the current sys.db. That’s bad form and could cause issues for the platform during the next upgrade cycle, especially if there are naming conflicts. Put it somewhere else!!

CREATE DATABASE IF NOT EXISTS custom_sys;

DROP TABLE IF EXISTS `custom_sys`.`completed_compactions`;
DROP TABLE IF EXISTS `custom_sys`.`compaction_queue`;

CREATE EXTERNAL TABLE `custom_sys`.`completed_compactions`(
	CC_ID bigint COMMENT 'from deserializer',
	CC_DATABASE string COMMENT 'from deserializer',
	CC_TABLE string COMMENT 'from deserializer',
	CC_PARTITION string COMMENT 'from deserializer',
	CC_STATE string COMMENT 'from deserializer',
	CC_TYPE string COMMENT 'from deserializer',
	CC_TBLPROPERTIES string COMMENT 'from deserializer',
	CC_WORKER_ID string COMMENT 'from deserializer',
	CC_START bigint COMMENT 'from deserializer',
	CC_END bigint COMMENT 'from deserializer',
	CC_RUN_AS string COMMENT 'from deserializer',
	CC_HIGHEST_WRITE_ID bigint COMMENT 'from deserializer',
	CC_META_INFO string COMMENT 'from deserializer',
	CC_HADOOP_JOB_ID string COMMENT 'from deserializer'
)
 ROW FORMAT SERDE                                   
   'org.apache.hive.storage.jdbc.JdbcSerDe'         
 STORED BY                                          
   'org.apache.hive.storage.jdbc.JdbcStorageHandler'  
 WITH SERDEPROPERTIES (                             
   'serialization.format'='1')                      
 TBLPROPERTIES (                                    
   'bucketing_version'='2',                         
   'hive.sql.database.type'='METASTORE',            
   'hive.sql.query'='SELECT CC_ID, CC_DATABASE, CC_TABLE, CC_PARTITION, CC_STATE, CC_TYPE, CC_TBLPROPERTIES, CC_WORKER_ID, CC_START, CC_END, CC_RUN_AS, CC_HIGHEST_WRITE_ID, CC_META_INFO, CC_HADOOP_JOB_ID FROM COMPLETED_COMPACTIONS');

CREATE EXTERNAL TABLE `custom_sys`.`compaction_queue`(
	CQ_ID bigint COMMENT 'from deserializer',
	CQ_DATABASE string COMMENT 'from deserializer',
	CQ_TABLE string COMMENT 'from deserializer',
	CQ_PARTITION string COMMENT 'from deserializer',
	CQ_STATE string COMMENT 'from deserializer',
	CQ_TYPE string COMMENT 'from deserializer',
	CQ_TBLPROPERTIES string COMMENT 'from deserializer',
	CQ_WORKER_ID string COMMENT 'from deserializer',
	CQ_START bigint COMMENT 'from deserializer',
	CQ_RUN_AS string COMMENT 'from deserializer',
	CQ_HIGHEST_WRITE_ID bigint COMMENT 'from deserializer',
	CQ_META_INFO string COMMENT 'from deserializer',
	CQ_HADOOP_JOB_ID string COMMENT 'from deserializer'
)
 ROW FORMAT SERDE                                   
   'org.apache.hive.storage.jdbc.JdbcSerDe'         
 STORED BY                                          
   'org.apache.hive.storage.jdbc.JdbcStorageHandler'  
 WITH SERDEPROPERTIES (                             
   'serialization.format'='1')                      
 TBLPROPERTIES (                                    
   'bucketing_version'='2',                         
   'hive.sql.database.type'='METASTORE',            
   'hive.sql.query'='SELECT CQ_ID, CQ_DATABASE, CQ_TABLE, CQ_PARTITION, CQ_STATE, CQ_TYPE, CQ_TBLPROPERTIES, CQ_WORKER_ID, CQ_START, CQ_RUN_AS, CQ_HIGHEST_WRITE_ID, CQ_META_INFO, CQ_HADOOP_JOB_ID FROM COMPACTION_QUEUE');

DROP VIEW IF EXISTS `custom_sys`.`compactions`;

-- TODO: Handle / Show Aborted Transactions (maybe) I think txn data may be transient...
CREATE VIEW IF NOT EXISTS `custom_sys`.`compactions` AS
SELECT CC_ID AS id,
       CC_DATABASE AS `database`,
       CC_TABLE AS `table`,
       CC_PARTITION AS `partition`,
       CASE CC_STATE
           WHEN 's' THEN 'SUCCEEDED'
           WHEN 'a' THEN 'ATTEMPTED'
           WHEN 'f' THEN 'FAILED'
       END AS STATE,
       CASE CC_TYPE
           WHEN 'a' THEN 'MAJOR'
           WHEN 'i' THEN 'MINOR'
       END AS TYPE,
       CC_TBLPROPERTIES AS tblproperties,
       CC_WORKER_ID AS worker_id,
       to_utc_timestamp(CC_START,'UTC') AS `start`,
       to_utc_timestamp(CC_END, 'UTC') AS `end`,
       CC_RUN_AS AS run_as,
       CC_HIGHEST_WRITE_ID AS highest_write_id,
       CC_META_INFO AS meta_info,
       CC_HADOOP_JOB_ID AS hadoop_job_id
FROM `custom_sys`.`completed_compactions`
UNION ALL
SELECT CQ_ID AS id,
       CQ_DATABASE AS `database`,
       CQ_TABLE AS `table`,
       CQ_PARTITION AS `partition`,
       CASE CQ_STATE
           WHEN 'i' THEN 'INITIATED'
           WHEN 'w' THEN 'WORKING'
           WHEN 'r' THEN 'READY_FOR_CLEANING'
       END AS `state`,
       CASE CQ_TYPE
           WHEN 'a' THEN 'MAJOR'
           WHEN 'i' THEN 'MINOR'
       END AS `type`,
       CQ_TBLPROPERTIES AS tblproperties,
       CQ_WORKER_ID AS worker_id,
       to_utc_timestamp(CQ_START, 'UTC') AS `start`,
       NULL AS `end`,
               CQ_RUN_AS AS run_as,
               CQ_HIGHEST_WRITE_ID AS highest_write_id,
               CQ_META_INFO AS meta_info,
               CQ_HADOOP_JOB_ID AS hadoop_job_id
FROM `custom_sys`.`compaction_queue`;

See the last 10 compaction events

select 
	`database`, `table`, `partition`, `state`, `type`, `start`, `end`, hadoop_job_id 
FROM
	`custom_sys`.`compactions` 
ORDER BY `start` DESC 
LIMIT 10;

Show the last 10 failed compaction event

-- `state` options are 'SUCCEEDED', 'ATTEMPTED', 'FAILED', 'INITIATED', 'WORKING', and 'READY_FOR_CLEANING'
-- `type` options are 'MAJOR' and 'MINOR'
select 
`database`, `table`, `partition`, `state`, `type`, `start`, `end` 
FROM
	`custom_sys`.`compactions` 
WHERE `state` = 'FAILED' 
ORDER BY `start` DESC 
LIMIT 10;
Avatar
David W. Streever
Vice President - Engineering

Platform Enablement - Tiger Team

Related