The following bash script can be used to import Oracle records into a Hive table, partitioned by date. It uses Sqoop. Both Hive and Sqoop are part of typical Hadoop distributions, like the Hortonworks Sandbox, for example.
#!/bin/sh
# upper STRING — print STRING converted to upper case.
# POSIX function syntax (the `function` keyword is a bashism under #!/bin/sh),
# quoted tr sets so they cannot glob against files in the current directory,
# and [:lower:]/[:upper:] classes so the conversion is locale-safe.
upper() {
  printf '%s\n' "$1" | tr '[:lower:]' '[:upper:]'
}
# Parse positional arguments. The usage text below promises that the date
# defaults to today when omitted, but the original check demanded all five
# arguments — honor the documented default by accepting four.
if [ $# -ge 4 ]; then
  schema=$(upper "$1")
  table=$(upper "$2")
  column_to_split_by=$(upper "$3")
  date_column=$(upper "$4")
  # Default to the current date (YYYY-MM-DD) when the 5th argument is absent.
  date_value="${5:-$(date +%Y-%m-%d)}"
else
  echo
  echo "Usage: $(basename "$0") schema table column-to-split-by date-column [YYYY-MM-DD]"
  echo
  echo "Imports all records where value of date-column is \$date_value from"
  echo "Oracle table \$schema.\$table as a Hive table partition."
  echo "Hadoop will split the import job based on the column-to-split-by."
  echo "* The table must have the columns specified as column-to-split-by and date-column."
  echo "* The column-to-split-by must be finer granularity than date-column, ideally unique."
  echo "* The date_value must be in YYYY-MM-DD format."
  echo "* If date_value is unspecified, the current date will be used."
  exit 1
fi
# Log the effective parameters so the run's inputs are captured in the output.
printf '%s\n' "schema = $schema"
printf '%s\n' "table = $table"
printf '%s\n' "column_to_split_by = $column_to_split_by"
printf '%s\n' "date_column = $date_column"
printf '%s\n' "date_value = $date_value"
# Drop any pre-existing partition for this date before importing: sqoop's
# --hive-overwrite does not seem to remove it, so stale rows would otherwise
# survive a re-import. "drop if exists" makes this a no-op on the first run.
hive -e "use $schema; alter table $table drop if exists partition($date_column='$date_value');"
# Build the comma-separated SELECT column list by querying Oracle's
# ALL_TAB_COLUMNS via `sqoop eval`, then stripping sqoop's ASCII-table
# decoration: tr deletes spaces and '|' borders, grep drops the dashed
# separator rows, the COLUMN_NAME header, and the partition date column
# (it becomes the Hive partition key, so it must not be selected), and the
# final tr/sed turn newlines into commas and trim the trailing comma.
# NOTE(review): grep -Ev "...|$date_column" also excludes any column whose
# name merely CONTAINS the date column's name as a substring — verify no
# such columns exist in the source table.
columns=$( \
sqoop eval \
--options-file /usr/local/etc/sqoop-options.txt \
--query "select column_name from all_tab_columns where table_name = '$table'" \
| tr -d " |" \
| grep -Ev "\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-|COLUMN_NAME|$date_column" \
| tr '\n' ',' \
| sed -e 's/\,$//'
)
# Assemble the free-form import query piece by piece. Only rows whose
# $date_column equals the requested day are selected; the literal
# $CONDITIONS token is left unexpanded for sqoop, which substitutes each
# mapper's split-range predicate there.
query="select $columns from $schema.$table"
query="$query where $date_column = to_date('$date_value', 'YYYY-MM-DD')"
query="$query and \$CONDITIONS"
echo "query = $query"
# Run the parallel import: sqoop executes $query once per mapper (splitting
# the work on --split-by ranges) and loads the results into the Hive table
# as the partition $date_column=$date_value. --outdir redirects the
# generated ORM .java files out of the working directory; it is now quoted
# so a $HOME containing spaces cannot break the command line.
sqoop import \
  --options-file "/usr/local/etc/sqoop-options.txt" \
  --query "$query" \
  --split-by "$column_to_split_by" \
  --target-dir "$schema.$table" \
  --hive-import \
  --hive-overwrite \
  --hive-table "$schema.$table" \
  --hive-partition-key "$date_column" \
  --hive-partition-value "$date_value" \
  --outdir "$HOME/java"
JDBC connection details
Put them into /usr/local/etc/sqoop-options.txt, in a format like this:
--connect
jdbc:oracle:thin:@hostname:port:sid
--username
oracle_username
--password
oracle_password