mapreduce 报错信息

来源:互联网 发布:马东最新网络节目 编辑:程序博客网 时间:2024/05/16 14:54

# This example MapReduce job processes documents and looks for keywords in them.
# It takes two database tables as input:
#   - documents (doc_id integer, url text, data text)
#   - keywords  (keyword_id integer, keyword text)#
# The documents data is searched for occurrences of keywords and returns results of
# url, data and keyword (a keyword can be multiple words, such as "high performance
# computing")
%YAML 1.1
---
VERSION: 1.0.0.1
# Connect to Greenplum Database using this database and role
DATABASE: cqdmkt
USER: gpadmin
#HOST:
#PORT:
# Begin definition section: declares the INPUT/MAP/REDUCE/TASK objects
# that the EXECUTE section at the bottom of the file runs.
DEFINE:
  # Declare the inputs, which select all columns and rows from the
  # 'documents' and 'keywords' tables.
  - INPUT:
      NAME:  doc
      TABLE: documents
  - INPUT:
      NAME:  kw
      TABLE: keywords
          # Other supported INPUT sources and options (for reference only):
          #FILE:
          # - hostname:/path/to/file
          #GPFDIST:
          # - hostname:port:/file_pattern
          #TABLE: table_name
          #QUERY: select_statement
          #EXEC: command_string
          #COLUMNS:
          # - field_name data_type
          #FORMAT: TEXT|CSV (default TEXT)
          #DELIMITER: delimiter_character
          #ESCAPE: escape_character
          #NULL: null_string
          #QUOTE: csv_quote_character
          #ERROR_LIMIT: integer
          #ENCODING: database_encoding
# Define the map functions to extract terms from documents and keyword
# This example simply splits on white space, but it would be possible
# to make use of a python library like nltk (the natural language toolkit)
# to perform more complex tokenization and word stemming.
  - MAP:
      NAME:     doc_map
      LANGUAGE: python
      FUNCTION: |
        # Tokenize one document row (doc_id, data) and emit one output row
        # per distinct term: (doc_id, term, comma-delimited 1-based positions).
        i = 0            # the 1-based index of a word within the document
        terms = {}       # maps each term to a comma-delimited string of its positions
        # Lower-case and split the text string on whitespace
        for term in data.lower().split():
          i = i + 1      # increment i (the position of this word)
          # If the term already exists, append position i to its entry;
          # this records multiple occurrences of the same word.
          # If the term does not exist, add it with position i.
          # For example:
          #   data: "a computer is a machine that manipulates data"
          #   "a" [1, 4]
          #   "computer" [2]
          #   "machine" [3]
          #   …
          if term in terms:
            terms[term] += ','+str(i)
          else:
            terms[term] = str(i)        
        # Return multiple rows for each document. Each row consists of
        # the doc_id, a term and the positions in the data where the term appeared.
        # For example:
        #   (doc_id => 100, term => "a", positions => "1,4")
        #   (doc_id => 100, term => "computer", positions => "2")
        #   …
        for term in terms:
          yield([doc_id, term, terms[term]])
      OPTIMIZE: STRICT IMMUTABLE
      PARAMETERS:
        - doc_id integer
        - data text
      RETURNS:
        - doc_id integer
        - term text
        - positions text
# The map function for keywords is almost identical to the one for documents,
# but it also counts the number of terms in the keyword.
  - MAP:
      NAME: kw_map
      LANGUAGE: python
      FUNCTION: |
        i = 0
        terms = {}
       
        for term in keyword.lower().split():
          i = i + 1
          if term in terms:
            terms[term] += ','+str(i)
          else:
            terms[term] = str(i)
        # output 4 values including i (the total count for term in terms):
          yield([keyword_id, i, term, terms[term]])
      OPTIMIZE: STRICT IMMUTABLE
      PARAMETERS:
        - keyword_id integer
        - keyword text
      RETURNS:
        - keyword_id integer
        - nterms integer
        - term text
        - positions text
# A TASK is an object that defines an entire INPUT/MAP/REDUCE stage
# within a Greenplum MapReduce pipeline. It is like EXECUTION, but it is
# executed only when called as input to other processing stages.
# Identify a task called 'doc_prep' which takes in the 'doc' INPUT defined earlier
# and runs the 'doc_map' MAP function which returns doc_id, term, [term_position]
  - TASK:
      NAME: doc_prep
      SOURCE: doc
      MAP: doc_map
# Identify a task called 'kw_prep' which takes in the 'kw' INPUT defined earlier
# and runs the kw_map MAP function which returns keyword_id, nterms, term, positions
  - TASK:
      NAME: kw_prep
      SOURCE: kw
      MAP: kw_map
# One advantage of Greenplum MapReduce is that MapReduce tasks can be
# used as input to SQL operations and SQL can be used to process a MapReduce task.
# This INPUT defines a SQL query that joins the output of the 'doc_prep'
# TASK to that of the 'kw_prep' TASK. Matching terms are output to the 'candidate'
# list (any keyword that shares at least one term with the document).
  - INPUT:
      NAME: term_join
      # Join the output of the 'doc_prep' TASK to that of the 'kw_prep'
      # TASK on matching terms: any keyword that shares at least one term
      # with a document becomes a candidate pair for the reducer below.
      QUERY: |
        SELECT doc.doc_id, kw.keyword_id, kw.term, kw.nterms,
               doc.positions as doc_positions,
               kw.positions as kw_positions
          FROM doc_prep doc INNER JOIN kw_prep kw ON (doc.term = kw.term)
# In Greenplum MapReduce, a REDUCE function is comprised of one or more functions.
# A REDUCE has an initial 'state' variable defined for each grouping key.
# A TRANSITION function adjusts the state for every value in a key grouping.
# If present, an optional CONSOLIDATE function combines multiple
# 'state' variables.  This allows the TRANSITION function to be executed locally at
# the segment-level and only redistribute the accumulated 'state' over
# the network. If present, an optional FINALIZE function can be used to perform
# final computation on a state and emit one or more rows of output from the state.
#
# This REDUCE function is called 'term_reducer' with a TRANSITION function
# called 'term_transition' and a FINALIZE function called 'term_finalizer'
  # Wire the reducer together: per-row accumulation is done by
  # 'term_transition' and the final per-group output by 'term_finalizer'.
  - REDUCE:
      NAME: term_reducer
      TRANSITION: term_transition
      FINALIZE: term_finalizer
  - TRANSITION:
      NAME: term_transition
      LANGUAGE: python
      PARAMETERS:
        - state text
        - term text
        - nterms integer
        - doc_positions text
        - kw_positions text
      FUNCTION: |
        # Accumulate, per (doc, keyword) group, one slot per keyword term.
        # 'state' has an initial value of '' and is a colon-delimited set
        # of keyword positions; each slot is a comma-delimited set of
        # integers. For example, '1,3,2:4:'.
        # If there is an existing state, split it into the per-term slots;
        # otherwise construct 'nterms' empty slots.
        if state:
          kw_split = state.split(':')
        else:
          kw_split = []
          for i in range(0,nterms):
            kw_split.append('')
        # 'kw_positions' is a comma-delimited string of integers indicating
        # at which position(s) the current term occurs within the keyword.
        # Store this term's document positions into each such slot.
        # NOTE(review): assumes every kw_p lies in 1..nterms; an
        # out-of-range position would raise IndexError — confirm the
        # keywords data guarantees this.
        for kw_p in kw_positions.split(','):
          kw_split[int(kw_p)-1] = doc_positions
        # Re-join the slots with ':' to form the new state string.
        # For example: for the keyword "computer software computer hardware"
        # matched against "in the business of computer software software engineers",
        # kw_split would be ['5', '6,7', '5', ''] and the outstate '5:6,7:5:'.
        outstate = kw_split[0]
        for s in kw_split[1:]:
          outstate = outstate + ':' + s
        return outstate
  - FINALIZE:
      NAME: term_finalizer
      LANGUAGE: python
      RETURNS:
        - count integer
      MODE: MULTI
      FUNCTION: |
        if not state:
          return 0
        kw_split = state.split(':')
        # This function does the following:
        # 1) Splits 'kw_split' on ':'
        #    for example, 1,5,7:2,8 creates '1,5,7' and '2,8'
        # 2) For each group of positions in 'kw_split', splits the set on ','
        #    to create ['1','5','7'] from Set 0: 1,5,7 and
        #    eventually ['2', '8'] from Set 1: 2,8
        # 3) Checks for empty strings
        # 4) Adjusts the split sets by subtracting the position of the set
        #      in the 'kw_split' array
        #      ['1','5','7'] - 0 from each element = ['1','5','7']
        #      ['2', '8'] - 1 from each element = ['1', '7']
        # 5) Resulting arrays after subtracting the offset in step 4 are
        #    intersected and their overlaping values kept:
        #    ['1','5','7'].intersect['1', '7'] = [1,7]
        # 6) Determines the length of the intersection, which is the number of
        #    times that an entire keyword (with all its pieces) matches in the
        #    document data.
        previous = None
        for i in range(0,len(kw_split)):
          isplit = kw_split[i].split(',')
          if any(map(lambda(x): x == '', isplit)):
            return 0
          adjusted = set(map(lambda(x): int(x)-i, isplit))
          if (previous):
            previous = adjusted.intersection(previous)
          else:
            previous = adjusted
        # return the final count
        if previous:
          return len(previous)
        return 0
# Define the 'term_match' task which is then executed as part
# of the 'final_output' query. It takes the INPUT 'term_join' defined
# earlier and uses the REDUCE function 'term_reducer' defined earlier.
  - TASK:
      NAME: term_match
      SOURCE: term_join
      REDUCE: term_reducer
  - INPUT:
      NAME: final_output
      # Join the reducer's per-(doc, keyword) counts back to the source
      # tables and keep only pairs where the whole keyword matched.
      QUERY: |
        SELECT doc.*, kw.*, tm.count
        FROM documents doc, keywords kw, term_match tm
        WHERE doc.doc_id = tm.doc_id
          AND kw.keyword_id = tm.keyword_id
          AND tm.count > 0
# Execute this MapReduce job and send output to STDOUT
EXECUTE:
  # Run the full pipeline defined above and print the result rows.
  - RUN:
      SOURCE: final_output
      TARGET: STDOUT

--------------------

我在运行的时候报错

[gpadmin@gmaster dev]$ gpmapreduce  -f gpmap.yml
mapreduce_2854_run_1
ERROR:  returned object cannot be iterated  (seg0 slice4 gseg1:40000 pid=2419)
DETAIL:  PL/Python set-returning functions must return an iterable object.
Error: Execution Failure

---但是能够explain,这个是怎么回事呀!

[gpadmin@gmaster dev]$ gpmapreduce  -f gpmap.yml  --explain
mapreduce_2933_run_1
QUERY PLAN                                                                                                                                                    
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 4:1  (slice6; segments: 4)  (cost=8.70..8.71 rows=1 width=80)                                                                                   
  Merge Key: doc.doc_id, doc.url, doc.data, kw.keyword_id, kw.keyword, tm.count                                                                               
  ->  Sort  (cost=8.70..8.71 rows=1 width=80)                                                                                                                 
        Sort Key: doc.doc_id, doc.url, doc.data, kw.keyword_id, kw.keyword, tm.count                                                                          
        ->  Hash Join  (cost=5.48..8.67 rows=1 width=80)                                                                                                      
              Hash Cond: tm.doc_id = doc.doc_id                                                                                                               
              ->  Redistribute Motion 4:4  (slice5; segments: 4)  (cost=4.46..7.60 rows=1 width=18)                                                           
                    Hash Key: tm.doc_id                                                                                                                       
                    ->  Hash Join  (cost=4.46..7.54 rows=1 width=18)                                                                                          
                          Hash Cond: kw.keyword_id = tm.keyword_id                                                                                            
                          ->  Seq Scan on keywords kw  (cost=0.00..3.03 rows=1 width=10)                                                                      
                          ->  Hash  (cost=4.45..4.45 rows=1 width=12)                                                                                         
                                ->  Redistribute Motion 4:4  (slice4; segments: 4)  (cost=4.32..4.45 rows=1 width=12)                                         
                                      Hash Key: tm.keyword_id                                                                                                 
                                      ->  Subquery Scan tm  (cost=4.32..4.43 rows=1 width=12)                                                                 
                                            Filter: count > 0                                                                                                 
                                            ->  Result  (cost=4.32..4.39 rows=1 width=40)                                                                     
                                                  ->  HashAggregate  (cost=4.32..4.35 rows=1 width=64)                                                        
                                                        Group By: "?column3?", "?column4?"                                                                    
                                                        ->  Redistribute Motion 4:4  (slice3; segments: 4)  (cost=1.05..4.29 rows=1 width=64)                 
                                                              Hash Key: "?column3?", "?column4?"                                                              
                                                              ->  Hash Join  (cost=1.05..4.23 rows=1 width=64)                                                
                                                                    Hash Cond: (mapxq.m).term = (mapxq.m).term                                                
                                                                    ->  Redistribute Motion 4:4  (slice1; segments: 4)  (cost=0.00..3.13 rows=1 width=32)     
                                                                          Hash Key: (mapxq.m).term                                                            
                                                                          ->  Result  (cost=0.00..3.04 rows=1 width=10)                                       
                                                                                ->  Seq Scan on keywords  (cost=0.00..3.04 rows=1 width=10)                   
                                                                    ->  Hash  (cost=1.04..1.04 rows=1 width=32)                                               
                                                                          ->  Redistribute Motion 4:4  (slice2; segments: 4)  (cost=0.00..1.04 rows=1 width=32)
                                                                                Hash Key: (mapxq.m).term                                                      
                                                                                ->  Result  (cost=0.00..1.01 rows=1 width=50)                                 
                                                                                      ->  Seq Scan on documents  (cost=0.00..1.01 rows=1 width=50)            
              ->  Hash  (cost=1.01..1.01 rows=1 width=66)                                                                                                     
                    ->  Seq Scan on documents doc  (cost=0.00..1.01 rows=1 width=66)                                                                          
(34 rows)

[gpadmin@gmaster dev]$


原创粉丝点击