`
bupt04406
  • 浏览: 343654 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

Hive tag

    博客分类:
  • Hive
阅读更多
ExecReducer{
   private boolean isTagged = false;
  
   @Override
   public void configure(JobConf job) {
        MapredWork gWork = Utilities.getMapRedWork(job);
        isTagged = gWork.getNeedsTagging(); //初始化
   }
}

//只有在reducer是JoinOperator.class的时候才会设置needsTagging为true
org.apache.hadoop.hive.ql.plan.MapredWork
  public void setNeedsTagging(boolean needsTagging) {
    this.needsTagging = needsTagging;
  }
        if (reducer.getClass() == JoinOperator.class) {
          plan.setNeedsTagging(true);
        }

org.apache.hadoop.hive.ql.exec.ExecReducer.reduce(Object key, Iterator values, OutputCollector output,
      Reporter reporter){
      BytesWritable keyWritable = (BytesWritable) key;
      tag.set((byte) 0); //默认tag为0
      if (isTagged) { // tag上了
        // remove the tag
        int size = keyWritable.getSize() - 1; // keyWritable的长度
        tag.set(keyWritable.get()[size]); //最后一个byte是tag的值。
        keyWritable.setSize(size);
      }
}

join1.q.out:在一个MR中计算两个表的join,tag为0,1,在reduce阶段就可以区分一个row是来自哪个table。
EXPLAIN
FROM src src1 JOIN src src2 ON (src1.key = src2.key)
INSERT OVERWRITE TABLE dest_j1 SELECT src1.key, src2.value
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        src1
          TableScan
            alias: src1
            Reduce Output Operator
              key expressions:
                    expr: key
                    type: string
              sort order: +
              Map-reduce partition columns:
                    expr: key
                    type: string
              tag: 0
              value expressions:
                    expr: key
                    type: string
        src2
          TableScan
            alias: src2
            Reduce Output Operator
              key expressions:
                    expr: key
                    type: string
              sort order: +
              Map-reduce partition columns:
                    expr: key
                    type: string
              tag: 1
              value expressions:
                    expr: value
                    type: string
      Reduce Operator Tree:
        Join Operator
          condition map:
               Inner Join 0 to 1
          condition expressions:
            0 {VALUE._col0}
            1 {VALUE._col1}
          handleSkewJoin: false
          outputColumnNames: _col0, _col3
          Select Operator
            expressions:
                  expr: _col0
                  type: string
                  expr: _col3
                  type: string
            outputColumnNames: _col0, _col1
            Select Operator
              expressions:
                    expr: UDFToInteger(_col0)
                    type: int
                    expr: _col1
                    type: string
              outputColumnNames: _col0, _col1
              File Output Operator
                compressed: false
                GlobalTableId: 1
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                    name: dest_j1

  Stage: Stage-0
    Move Operator
      tables:
          replace: true
          table:
              input format: org.apache.hadoop.mapred.TextInputFormat
              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              name: dest_j1


join3.q.out:在一个MR中计算三个表的Join,tag为0,1,2,在reduce阶段就可以区分一个row是来自哪个table。
EXPLAIN
FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key = src3.key)
INSERT OVERWRITE TABLE dest1 SELECT src1.key, src3.value

STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        src1
          TableScan
            alias: src1
            Reduce Output Operator
              key expressions:
                    expr: key
                    type: string
              sort order: +
              Map-reduce partition columns:
                    expr: key
                    type: string
              tag: 0
              value expressions:
                    expr: key
                    type: string
        src2
          TableScan
            alias: src2
            Reduce Output Operator
              key expressions:
                    expr: key
                    type: string
              sort order: +
              Map-reduce partition columns:
                    expr: key
                    type: string
              tag: 1
        src3
          TableScan
            alias: src3
            Reduce Output Operator
              key expressions:
                    expr: key
                    type: string
              sort order: +
              Map-reduce partition columns:
                    expr: key
                    type: string
              tag: 2
              value expressions:
                    expr: value
                    type: string
      Reduce Operator Tree:
        Join Operator
          condition map:
               Inner Join 0 to 1
               Inner Join 0 to 2
          condition expressions:
            0 {VALUE._col0}
            1
            2 {VALUE._col1}
          handleSkewJoin: false
          outputColumnNames: _col0, _col5
          Select Operator
            expressions:
                  expr: _col0
                  type: string
                  expr: _col5
                  type: string
            outputColumnNames: _col0, _col1
            Select Operator
              expressions:
                    expr: UDFToInteger(_col0)
                    type: int
                    expr: _col1
                    type: string
              outputColumnNames: _col0, _col1
              File Output Operator
                compressed: false
                GlobalTableId: 1
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                    name: dest1

  Stage: Stage-0
    Move Operator
      tables:
          replace: true
          table:
              input format: org.apache.hadoop.mapred.TextInputFormat
              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
              name: dest1
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics