Thursday, October 1, 2015

Utility to create lots of Parquet files using Hive

Create the table in Hive:
-- Target table for the generated test data.
-- IF NOT EXISTS makes the statement safe to re-run.
-- Every partition column is a STRING; each distinct combination of the six
-- partition values becomes its own partition of the table.
CREATE TABLE IF NOT EXISTS partition_table_par (
    id INT,
    username STRING
)
PARTITIONED BY (
    year STRING,
    month STRING,
    day STRING,
    eventtype STRING,
    varfunction STRING,
    varname STRING
)
STORED AS PARQUET;

Bash script to pump data into the table, which stores it as Parquet files:
#!/bin/bash
#
# Insert 100 single-row partitions into partition_table_par, one `hive -e`
# invocation per partition. Every iteration uses a distinct combination of
# partition values, so each insert targets its own partition.
#
# Prerequisites: the `hive` CLI is on PATH, partition_table_par already
# exists, and test_table holds at least one row (the INSERT selects one
# constant row from it, so an empty test_table would insert nothing).
#
# Exit status: 0 when every insert succeeded, 1 when at least one failed.

failures=0

for i in {1..100}
do
    echo "$i"

    # Derive the partition values from the loop counter.
    year=$(( i + 1996 ))
    echo "'$year'"

    # Keep month in 1..12 and day in 1..30; a bare modulo would yield 0
    # whenever the counter is a multiple of 12 or 30.
    month=$(( (i - 1) % 12 + 1 ))
    day=$(( (i - 1) % 30 + 1 ))

    eventtype="eventtype$i"
    varfunction="varfunction$i"
    varname="varname$i"

    # One double-quoted string: the shell substitutes the values, and the
    # single quotes reach Hive as SQL string-literal delimiters.
    query="insert into table partition_table_par PARTITION(year='$year',month='$month',day='$day',eventtype='$eventtype',varfunction='$varfunction',varname='$varname') select 1,'$year' from test_table limit 1;"

    # Keep going after a failed insert, but record it so the run does not
    # report success when partitions are missing.
    if ! hive -e "$query"
    then
        echo "hive insert failed for iteration $i" >&2
        failures=$(( failures + 1 ))
    fi

    # Optional throttle between inserts; uncomment to enable.
    #sleep 1m
done

if [ "$failures" -ne 0 ]
then
    echo "$failures of 100 inserts failed" >&2
    exit 1
fi

No comments: