Cloudera Manager 4 and CDH 4 reached End of Maintenance (EOM) on August 9, 2015. Cloudera will not support or provide patches for any of the Cloudera Manager 4 or CDH 4 releases after that date.

Accessing Table Data with MapReduce

The following example MapReduce program reads from the groups table (consisting of data from /etc/group), extracts the first and third columns, and inserts them into the groupids table.

package com.cloudera.test;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hcatalog.common.*;
import org.apache.hcatalog.data.*;
import org.apache.hcatalog.data.schema.*;
import org.apache.hcatalog.mapreduce.*;

public class UseHCat extends Configured implements Tool {

    public static class Map extends Mapper<WritableComparable, HCatRecord, Text, IntWritable> {
        String groupname;

      protected void map( WritableComparable key,
                          HCatRecord value,
                          org.apache.hadoop.mapreduce.Mapper<WritableComparable, HCatRecord,
                          Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
            // The group table from /etc/group has name, 'x', id
            groupname = (String) value.get(0);
            int id = (Integer) value.get(2);
            // Just select and emit the name and ID
            context.write(new Text(groupname), new IntWritable(id));
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable,
                                       WritableComparable, HCatRecord> {

        protected void reduce( Text key,
                               java.lang.Iterable<IntWritable> values,
                               org.apache.hadoop.mapreduce.Reducer<Text, IntWritable,
                               WritableComparable, HCatRecord>.Context context)
            throws IOException, InterruptedException {
            // Only expecting one ID per group name
            Iterator<IntWritable> iter = values.iterator();
            IntWritable iw = iter.next();
            int id = iw.get();
            // Emit the group name and ID as a record
            HCatRecord record = new DefaultHCatRecord(2);
            record.set(0, key.toString());
            record.set(1, id);
            context.write(null, record);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Get the input and output table names as arguments
        String inputTableName = args[0];
        String outputTableName = args[1];
        // Assume the default database
        String dbName = null;

        Job job = new Job(conf, "UseHCat");
        HCatInputFormat.setInput(job, InputJobInfo.create(dbName,
                inputTableName, null));

        // Use the Map and Reduce classes defined above
        job.setJarByClass(UseHCat.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // An HCatalog record as input
        job.setInputFormatClass(HCatInputFormat.class);

        // Mapper emits a string as key and an integer as value
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Ignore the key for the reducer output; emitting an HCatalog record as value
        job.setOutputKeyClass(WritableComparable.class);
        job.setOutputValueClass(DefaultHCatRecord.class);

        job.setOutputFormatClass(HCatOutputFormat.class);

        HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName,
                   outputTableName, null));
        HCatSchema s = HCatOutputFormat.getTableSchema(job);
        System.err.println("INFO: output schema explicitly set for writing:" + s);
        HCatOutputFormat.setSchema(job, s);
        return (job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new UseHCat(), args);
        System.exit(exitCode);
    }
}

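Both the groups and groupids tables must exist in Hive before loading data and running the job. A minimal sketch of how they might be created, assuming colon-delimited /etc/group columns; the column names and types here are illustrative and are not part of the original example:

$ hive -e "create table groups(name string, passwd string, gid int, members string) row format delimited fields terminated by ':'"
$ hive -e "create table groupids(name string, gid int)"
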
Load data from the local file system into the groups table:

$ hive -e "load data local inpath '/etc/group' overwrite into table groups"
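The compile-and-package step itself is not shown here; assuming the program is built as a Maven project (the target/UseHCat-1.0.jar path used in the final command suggests one), packaging it might look like this:

$ mvn package
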
After compiling the program and creating a JAR file, set up the environment needed to copy the required JAR files to HDFS, and then run the job, for example:
  Note: You can find the current version numbers for CDH dependencies in CDH's root pom.xml file for the current release, for example cdh-root-4.4.0.pom.
$ export HCAT_HOME=/usr/lib/hcatalog
$ export HIVE_HOME=/usr/lib/hive
$ HCATJAR=$HCAT_HOME/share/hcatalog/hcatalog-core-0.5.0-cdh4.4.0.jar
$ HCATPIGJAR=$HCAT_HOME/share/hcatalog/hcatalog-pig-adapter-0.5.0-cdh4.4.0.jar
$ HIVE_VERSION=0.10.0-cdh4.4.0
$ export HADOOP_CLASSPATH=$HCATJAR:$HCATPIGJAR:$HIVE_HOME/lib/hive-exec-$HIVE_VERSION.jar:$HIVE_HOME/lib/hive-metastore-$HIVE_VERSION.jar:$HIVE_HOME/lib/jdo2-api-2.3-ec.jar:$HIVE_HOME/lib/libfb303-0.9.0.jar:$HIVE_HOME/lib/libthrift-0.9.0.jar:$HIVE_HOME/lib/slf4j-api-1.6.4.jar:$HIVE_HOME/conf:/etc/hadoop/conf
$ LIBJARS=`echo $HADOOP_CLASSPATH | sed -e 's/:/,/g'`
$ export LIBJARS=$LIBJARS,$HIVE_HOME/lib/antlr-runtime-3.4.jar

$ hadoop jar target/UseHCat-1.0.jar com.cloudera.test.UseHCat -files $HCATJAR -libjars $LIBJARS groups groupids
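
To confirm that the job wrote the expected records, you can query the output table; this quick check is not part of the original example:

$ hive -e "select * from groupids"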