HBase on Spark — using HiveContext

来源:互联网 发布:2016年淘宝女装排行榜 编辑:程序博客网 时间:2024/05/18 18:42

maven文件

<properties>
    <!-- Target stack: CDH-5.4.1 & Spark-1.3 (artifact versions below are the
         cdh5.4.2 builds). Keep spark/hadoop/hbase versions in lock-step with
         the cluster's CDH parcel. -->
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <!-- Scala 2.10.x matches the _2.10 artifact suffix on the Spark deps. -->
    <scala.version>2.10.4</scala.version>
    <spark.version>1.3.0-cdh5.4.2</spark.version>
    <hadoop.version>2.6.0-cdh5.4.2</hadoop.version>
    <hbase.version>1.0.0-cdh5.4.2</hbase.version>
    <!-- NOTE(review): guava 16.0 may conflict with the guava version bundled
         by CDH 5.4 Hadoop/HBase at runtime — confirm against the cluster. -->
    <guava.groupId>com.google.guava</guava.groupId>
    <guava.artifactId>guava</guava.artifactId>
    <guava.version>16.0</guava.version>
  </properties>


  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.2.0-cdh5.3.0</version>
      <exclusions>
        <exclusion>
          <artifactId>commons-lang3</artifactId>
          <groupId>org.apache.commons</groupId>
        </exclusion>
      </exclusions>
    </dependency>


    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>${spark.version}</version>
      <scope>provided</scope>
    </dependency>


    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>${spark.version}</version>
      <scope>provided</scope>
      <exclusions>
        <exclusion>
          <artifactId>commons-lang3</artifactId>
          <groupId>org.apache.commons</groupId>
        </exclusion>
      </exclusions>
    </dependency>


    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>${hbase.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>${hbase.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-common</artifactId>
      <version>${hbase.version}</version>
      <scope>provided</scope>
    </dependency>


    <dependency>
      <groupId>${guava.groupId}</groupId>
      <artifactId>${guava.artifactId}</artifactId>
      <version>${guava.version}</version>
    </dependency>


    <!-- Test-->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>
    <dependency>
      <groupId>org.scala-tools.testing</groupId>
      <artifactId>specs_2.10</artifactId>
      <version>1.6.9</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest</artifactId>
      <version>1.2</version>
      <scope>test</scope>
    </dependency>
  </dependencies>


shell command:



# Working directory that holds the job jar and the impala SQL template.
THIS_PATH='/home/bigdata/script/weibo-userid'
cd "$THIS_PATH" || exit 1

# Require the run date (yyyymmdd) as the first argument.
# Fixed: the original 'if' was missing its 'exit' and closing 'fi',
# which made the script a syntax error and let it continue without a date.
if [ -z "$1" ]; then
   echo "please input date,The data format is yyyymmdd."
   exit 1
fi

# Fixed: $DAY/$MONTH/$YEAR are used later (spark args, sed substitutions)
# but were never assigned anywhere in the original script.
# NOTE(review): assuming $$month in the template means yyyymm and $$year
# means yyyy, sliced from the yyyymmdd argument — confirm against
# ./template/loadweibousereventsdata.
DAY="$1"
YEAR="${DAY:0:4}"
MONTH="${DAY:0:6}"

# Comma-separated list of input tables passed to the Spark job.
TMP_INPUT_TABLE=ad.click,ad.impression,wd.pageview

# Target HBase table (namespace:CF notation) and HDFS staging directory.
WEIBO_USERDATA_HBASE_TABLE=WeiboUserData:CF
OUTPUT_TABLE_DATA_LOCATION=/user/bigdata/weibooutput/event

param_date=$(date +'%Y-%m-%d %H:%M:%S')
echo "$param_date weibo-userid begin"


# (original comment here was mojibake — presumably about classpath /
# compatibility setup)

HIVE_HOME=/opt/cloudera/parcels/CDH/lib/hive

# Run the Spark job.
# NOTE(review): the original script was truncated/garbled at this point —
# only stray option lines (--executor-cores, --master) and the argument list
# survived, with no spark-submit command at all. The invocation below is a
# reconstruction; confirm the main --class, executor memory, and any extra
# settings against the deployed job.
# NOTE(review): $OUTPUT_HBASE_TABLE is never assigned in this script —
# presumably exported by the calling environment; verify.
export SPARK_CLASSPATH="$CLASSPATH"
spark-submit \
  --master yarn-client \
  --executor-cores 10 \
  weibo-userid-storage-1.3.3-jar-with-dependencies.jar \
  "$TMP_INPUT_TABLE" \
  "$DAY" \
  "$OUTPUT_TABLE_DATA_LOCATION" \
  "$OUTPUT_HBASE_TABLE" \
  "$WEIBO_USERDATA_HBASE_TABLE"


# Abort if the Spark job failed.
# Fixed throughout: numeric exit codes are compared with -ne (quoted)
# instead of the unquoted string test '[ $err != 0 ]'.
err=$?
if [ "$err" -ne 0 ]; then
  echo "weibo-userid-storage job failed"
  exit  1
fi

# Load the staged data into Impala: instantiate the SQL template by
# substituting its $$day / $$month / $$year placeholders, then execute it.
# (Original section comment was mojibake — presumably "load into impala".)
cp ./template/loadweibousereventsdata ./loadweibousereventsdata.tmp
sed -i "s#\$\$day#${DAY}#g" ./loadweibousereventsdata.tmp
sed -i "s#\$\$month#${MONTH}#g" ./loadweibousereventsdata.tmp
sed -i "s#\$\$year#${YEAR}#g" ./loadweibousereventsdata.tmp
impala-shell -f ./loadweibousereventsdata.tmp
err=$?
if [ "$err" -ne 0 ]; then
   echo "loadweibousereventsdata failed"
   exit 1
fi

# Remove the per-day HDFS staging directory now that Impala has loaded it.
hadoop fs -rm -r "$OUTPUT_TABLE_DATA_LOCATION/$DAY"
err=$?
if [ "$err" -ne 0 ]; then
   echo "delete $OUTPUT_TABLE_DATA_LOCATION/$DAY  failed"
   exit 1
fi

param_date=$(date +'%Y-%m-%d %H:%M:%S')
echo "$param_date  weibo-userid finished"


0 0
原创粉丝点击