Supersonic关于一个Hash Join的测试代码分析

来源:互联网 发布:网络有什么危害 编辑:程序博客网 时间:2024/05/21 10:13

    这几天在看Supersonic的代码,下面是我根据其提供的hash join示例改写的一个测试示例,实现了两个表的Hash Join。

    编译:

g++ -I/home/kernel0017/supersonic/supersonic/ -I../glog/src/ -I../gflags/src/ -I../protobuf/src/ -I../re2 -I /home/kernel0017/supersonic/supersonic/gmock/gtest/include -L/usr/local/lib -lsupersonic -lgflags -lglog -lprotobuf -lre2 -lboost_timer -Wno-deprecated -g -o testhashjoin testhashjoin.cc
 
运行:./testhashjoin。

/home/kernel0017/supersonic/supersonic/是supersonic的头文件目录,../glog/src是glog的头文件目录,../gflags/src是gflags的头文件目录,../protobuf/src是protobuf的头文件目录,../re2是re2的头文件目录,/home/kernel0017/supersonic/supersonic/gmock/gtest/include是Supersonic源码中使用的gtest头文件目录,/usr/local/lib是所有用到的库文件的安装目录


#include <map>

using std::map;
using std::multimap;
#include <set>
using std::multiset;
using std::set;
#include <utility>
using std::make_pair;
using std::pair;


#include "gtest/gtest.h"


#include "supersonic/supersonic.h"
#include "supersonic/cursor/core/sort.h"
#include "supersonic/cursor/infrastructure/ordering.h"
#include "supersonic/utils/strings/stringpiece.h"


// Include some map utilities to use for result verification.
#include "supersonic/utils/map-util.h"


using supersonic::Attribute;
using supersonic::Block;
using supersonic::Cursor;
using supersonic::Operation;
using supersonic::FailureOr;
using supersonic::FailureOrOwned;
using supersonic::GetConstantExpressionValue;
using supersonic::TupleSchema;
using supersonic::Table;
using supersonic::TableRowWriter;
using supersonic::View;
using supersonic::ViewCopier;
using supersonic::HashJoinOperation;
using supersonic::HeapBufferAllocator;
using supersonic::JoinType;
using supersonic::ProjectNamedAttribute;
using supersonic::ProjectNamedAttributeAs;
using supersonic::rowid_t;
using supersonic::SingleSourceProjector;
using supersonic::MultiSourceProjector;
using supersonic::CompoundSingleSourceProjector;
using supersonic::CompoundMultiSourceProjector;
using supersonic::ResultView;
using supersonic::ScanView;
using supersonic::SucceedOrDie;


using supersonic::If;
using supersonic::IfNull;
using supersonic::Less;
using supersonic::CompoundExpression;
using supersonic::Expression;
using supersonic::Compute;
using supersonic::Generate;
using supersonic::ParseStringNulling;
using supersonic::ConstBool;
using supersonic::ConstString;
using supersonic::ConstInt32;
using supersonic::Null;


using supersonic::INNER;
using supersonic::UNIQUE;


using supersonic::INT32;
using supersonic::NOT_NULLABLE;
using supersonic::NULLABLE;
using supersonic::STRING;
using supersonic::DATE;
using supersonic::BOOL;


using supersonic::rowcount_t;


//本例实现一个Hash join。
//本测试也是用存在内存中的表,以便在表里插入行。我们也将用sinks来将计算数据传入tables里面。
class HashJoinTest {
 public:
   void SetUp() {
    //author表和book表做hash join.简单起见,一本书只有一个作者。
    author_schema.add_attribute(Attribute("author_id", INT32, NOT_NULLABLE));
    author_schema.add_attribute(Attribute("name", STRING, NOT_NULLABLE));
    author_schema.add_attribute(Attribute("nobel", BOOL, NOT_NULLABLE));



    //supersonic提供了两种时间数据类型:DATE和DATETIME。DATE以天为单位,DATETIME以ms为单位。
    //DATE和DATETIME对象分别是以32位和64位的整型存储的。
    //我们也会处理空值,一个列为空,用Attribute(),如下date_published,值可为空的意思。
    book_schema.add_attribute(Attribute("book_id", INT32, NOT_NULLABLE));
    book_schema.add_attribute(Attribute("author_id_ref", INT32, NULLABLE));
    book_schema.add_attribute(Attribute("title", STRING, NOT_NULLABLE));
    book_schema.add_attribute(Attribute("date_published",
                                        DATE,
                                        NULLABLE));


    //首先我们先创建tables;
    author_table.reset(new Table(author_schema,
                                 HeapBufferAllocator::Get()));
    book_table.reset(new Table(book_schema,
                               HeapBufferAllocator::Get()));
    //两种方法写入数据到tables中:
    //1、TableRowWriter  比较适合于简单的测试环境。
    //2、直接写入table。


    author_table_writer.reset(new TableRowWriter(author_table.get()));


    //输入计数器来产生ID。
    author_count = 0;
    book_count = 0;
  }


  void PrepareJoin() {
    //


//在开始实现join之前,我们必须先考虑left hand side(lhs) 和 right hand side(rhs)的问题。
 //在Supersonic right hand side本作为index,它应该是相对小的表。来自lhs cursor的数据
 //以流的形式和index进行匹配。在本例中,authors明显要比books少,所以book表作为lhs,author表作为rhs.
 //Supersonic可以为index开启一些特定的优化,我们不久就会涉及。


 //我们现在为两个表准备single source projectors(key selectors)。
 scoped_ptr<const SingleSourceProjector> book_selector(
        ProjectNamedAttribute("author_id_ref"));


    scoped_ptr<const SingleSourceProjector> author_selector(
        ProjectNamedAttribute("author_id"));


    //我们用一个mutisource projector来表示我们想要得到的结果,因此我们用CompoundMultiSourceProjector
    //Supersonic将把两个schema与其绑定。我们现在指定哪些列要投影到result里面,并且要消除重复,
    //我们可以对于重复的列想个新名字,也可以直接舍弃其中一些。
    //
    scoped_ptr<CompoundMultiSourceProjector> result_projector(
        new CompoundMultiSourceProjector());


    //add()函数用于multi source projector,不像single source需要两个参数,source index和single source projector
    //我们现在需要指定哪些属性要投影。我们可以走捷径用ProjectAllAtributes,但是它对于我们要做join的两列产生不好的影响。
    //
    scoped_ptr<CompoundSingleSourceProjector> result_book_projector(
        new CompoundSingleSourceProjector());
    result_book_projector->add(ProjectNamedAttribute("title"));
    result_book_projector->add(ProjectNamedAttribute("date_published"));
    result_book_projector->add(ProjectNamedAttribute("book_id"));


    scoped_ptr<CompoundSingleSourceProjector> result_author_projector(
        new CompoundSingleSourceProjector());

    result_author_projector->add(
        ProjectNamedAttributeAs("name", "author_name"));
    result_author_projector->add(ProjectNamedAttribute("nobel"));
    result_author_projector->add(ProjectNamedAttribute("author_id"));


    //将single source projector的内容写入result_projector中
    result_projector->add(0, result_book_projector.release());
    result_projector->add(1, result_author_projector.release());


    //首先,我们要决定我们要进行什么类型的Join,目前Supersonic只支持两种:内连接和左外连接。
    //为了将不知道作者的书的条目排除,此处用的是内连接


    //其次,supersonic也要求我们检查rhs schema的数据,所有的关键字是不是唯一的。如果我们事先知道这个信息,
    //我们就可以开启hash join优化。如果有重复值,或者我们不能确定是否有重复值,我们要用NOT_UNIQUE选项。
    //在这个案例中,对于rhs index我们可以启用优化。


    //现在我们来创建一个Operation
    scoped_ptr<Operation> hash_join(
        new HashJoinOperation(/* join type */ INNER,
                              /* select left */ book_selector.release(),
                              /* select right */ author_selector.release(),
                              /* project result */ result_projector.release(),
                              /* unique keys on the right ? */ UNIQUE,
                              /* left data */ ScanView(book_table->view()),
                              /* right data */ ScanView(author_table->view())));
    result_cursor.reset(SucceedOrDie(hash_join->CreateCursor()));
  }


  //添加作者的方法会创造一个是否获过诺贝尔奖的条目。返回author_id用以关联books和authors.
  int32 AddAuthor(const StringPiece& name, bool nobel) {
    int32 author_id = author_count++;
    // 在table中写数据时一定要注意字段的顺序。
    author_table_writer
        ->AddRow().Int32(author_id).String(name).Bool(nobel).CheckSuccess();
    return author_id;
  }


  //我们用直接的方法在book table里面写入数据。在这儿我们也增加了对于Null值的支持。
  int32 AddBook(const StringPiece& title,
                const StringPiece& date_published,
                int32 author_id) {
    int32 book_id = book_count++;


    CHECK_EQ(book_id, book_table->row_count());


    rowid_t row_id = book_table->AddRow();


    // setting Attribute("book_id", INT32, NOT_NULLABLE).
    book_table->Set<INT32>(0, row_id, book_id);


    // setting Attribute("author_id_ref", INT32, NULLABLE).
    if (author_id >= 0) {
      book_table->Set<INT32>(1, row_id, author_id);
    } else {
      book_table->SetNull(1, row_id);
    }
    // setting Attribute("title", STRING, NOT_NULLABLE).
    // This makes a deep copy of the StringPiece.
    book_table->Set<STRING>(2, row_id, title);


    // setting Attribute("date_published", DATE, NULLABLE).


    //DATEs 内部表示是32位整型,我们用32位整型表示这个值。另一个方法就是存string到table里面来表示
    //在我们调用之前用ParseStringNulling转换
    //ParseStringNulling可以将string expression转换为date object。空输入或者无效输入将创建一个null entry
    //DATETIME有一个DATE没有的捷径,即用ConstDateTime方法直接从StringPieces创建对象
    scoped_ptr<const Expression> date_or_null(
        ParseStringNulling(DATE, ConstString(date_published)));
    bool date_published_is_null = false;
    FailureOr<int32> data_published_as_int32 =
        GetConstantExpressionValue<DATE>(*date_or_null,
                                         &date_published_is_null);
    CHECK(data_published_as_int32.is_success())
        << data_published_as_int32.exception().ToString();


    if (!date_published_is_null) {
      book_table->Set<DATE>(3, row_id, data_published_as_int32.get());
    } else {
      book_table->SetNull(3, row_id);
    }
    return book_id;
  }




  //将author names和book titles用ids(authors)和author reference ids(books)映射起来
  typedef map<int32, StringPiece> author_name_map;
  typedef multimap<int32, StringPiece> book_title_map;


  // Utilities for storing pairs of (name, title).
  typedef pair<StringPiece, StringPiece> author_book_entry;
  typedef set<author_book_entry> author_book_set;


  void TestResults() {
     //检查结果是否满足需求,首先,我们必须把轮询rows,将它们放到一个内存块里。
    scoped_ptr<Block> result_space(new Block(result_cursor->schema(),
                                             HeapBufferAllocator::Get()));


    ViewCopier copier(result_cursor->schema(), /* deep copy */ true);
    rowcount_t offset = 0;
    scoped_ptr<ResultView> rv(new ResultView(result_cursor->Next(-1)));


    //!rv->is_done()的意思是游标既没有读完而且也没有发生错误的情况下,执行循环体。
    while (!rv->is_done()) {
      const View& view = rv->view();
      rowcount_t view_row_count = view.row_count();


      //为新值分配block,我们事先不知道需要多少个。
      result_space->Reallocate(offset + view_row_count);


      rowcount_t rows_copied = copier.Copy(view_row_count,
                                           view,
                                           offset,
                                           result_space.get());


      offset += rows_copied;
      rv.reset(new ResultView(result_cursor->Next(-1)));
    }


    const View& result_view(result_space->view());

//输出生成的结果表。

    for (int32 k=0;k<result_view.column_count();k++)
         {
           std::cout<<result_view.schema().attribute(k).name()<<"\t";
         }
    std::cout<<std::endl;
    for(int j=0; j<result_view.row_count();j++)
          {


             std::cout<<result_view.column(0).typed_data<STRING>()[j]<<"\t";
             std::cout<<result_view.column(1).typed_data<DATE>()[j]<<"\t";
             std::cout<<result_view.column(2).typed_data<INT32>()[j]<<"\t";
             std::cout<<result_view.column(3).typed_data<STRING>()[j]<<"\t";
             std::cout<<result_view.column(4).typed_data<BOOL>()[j]<<"\t";
             std::cout<<result_view.column(5).typed_data<INT32>()[j]<<"\t";
             std::cout<<std::endl;
          }




  }


  // Supersonic objects.
  scoped_ptr<Cursor> result_cursor;


  TupleSchema author_schema;
  TupleSchema book_schema;


  scoped_ptr<Table> author_table;
  scoped_ptr<TableRowWriter> author_table_writer;
  scoped_ptr<Table> book_table;


  // Sequence counters.
  int32 author_count;
  int32 book_count;
};


int main(void) {
  // DISCLAIMER: The values below should by no means be used as a reliable
  // information source, especially the publishing dates are not accurate,
  // although the years should match reality... :)
  HashJoinTest test;
  test.SetUp();
  int32 terry_id = test.AddAuthor("Terry Pratchett", false);
  int32 chuck_id = test.AddAuthor("Chuck Palahniuk", false);
  int32 ernest_id = test.AddAuthor("Ernest Hemingway", true);


  // Again, in a production environment one would use a simpler INT32 field
  // if they didn't care about full dates, but we are excused by demonstration
  // purposes.
  test.AddBook("The Reaper Man", "1991/01/01", terry_id);
  test.AddBook("Colour of Magic", "1983/01/01", terry_id);
  test.AddBook("Light Fantastic", "1986/01/01", terry_id);
  test.AddBook("Mort", NULL, terry_id);


  test.AddBook("Fight Club", "1996/01/01", chuck_id);
  test.AddBook("Survivor", NULL, chuck_id);
  test.AddBook("Choke", "2001/01/01", chuck_id);


  test.AddBook("The old man and the sea", NULL, ernest_id);
  test.AddBook("For whom the bell tolls", NULL, ernest_id);
  test.AddBook("A farewell to arms", "1929/01/01", ernest_id);


  test.AddBook("Carpet People", NULL, -1);
  test.AddBook("Producing open source software.", NULL, -1);
  test.AddBook("Quantum computation and quantum information.", NULL, -1);


  test.PrepareJoin();


  test.TestResults();
  return 0;
}


0 0
原创粉丝点击