@@ -118,11 +118,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
118118 region = FragmentedHtmlPageRegion (extraction_page .htmlpage , list (regions ))
119119 else :
120120 region = extraction_page .htmlpage_region_inside (start_index , end_index )
121- if kwargs .get ('no_content_validate' ):
122- validated = True
123- else :
124- validated = self .content_validate (region )
125- return [(self .annotation .surrounds_attribute , self .content_validate (region ))] if validated else []
121+ validated = self .content_validate (region )
122+ return [(self .annotation .surrounds_attribute , validated )] if validated else []
126123
127124 def _extract_attribute (self , extraction_page , start_index , end_index , ignored_regions = None , ** kwargs ):
128125 data = []
@@ -497,25 +494,27 @@ def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **k
497494 warnings .warn ("MDRExtractor can't find element with xpath: %s" % self .xpath )
498495 return [{}]
499496
500- items = {}
501-
502- _ , mapping = mdr .extract (element [0 ], record = self .record )
503- for seed_elem , elements in mapping .iteritems ():
504- annotation_elem = [elem for elem in ([seed_elem ] + elements ) if elem .attrib .get ('data-scrapy-annotate' )]
505- if annotation_elem :
506- annotation = self ._read_template_annotation (annotation_elem [0 ])
507- name = annotation .get ('annotations' , {}).get ('content' )
508- ex = self .extractors [name ]
509- for elem in elements :
497+ items = []
498+ _ , mappings = mdr .extract (element [0 ], record = self .record )
499+
500+ for record , mapping in mappings .iteritems ():
501+ item = {}
502+ for seed_elem , element in mapping .iteritems ():
503+ annotation_elem = [elem for elem in [seed_elem , element ] if elem .attrib .get ('data-scrapy-annotate' )]
504+ if annotation_elem :
505+ annotation = self ._read_template_annotation (annotation_elem [0 ])
506+ group_name = annotation .get ('listingDateGroupName' , 'default_group' )
507+ name = annotation .get ('annotations' , {}).get ('content' )
508+ ex = self .extractors [name ]
510509 elem_page = HtmlPage (None , {}, tostring (elem , encoding = 'unicode' ))
511510 parsed_elem_page = parse_extraction_page (self .token_dict , elem_page )
512- items .setdefault (name , []).extend ([v for _ , v in ex .extract (parsed_elem_page , 0 ,
513- len (parsed_elem_page .page_tokens ) - 1 , no_content_validate = True )])
511+ item .setdefault (name , []).extend ([v for _ , v in ex .extract (parsed_elem_page , 0 ,
512+ len (parsed_elem_page .page_tokens ) - 1 )])
513+ items .append (item )
514514
515515 if items :
516- lengths = [len (values ) for values in items .values ()]
517- assert len (set (lengths )) == 1 , 'extract items %r should be have same count' % items
518- return [items ]
516+ return [{group_name : items }]
517+ return []
519518
520519 @classmethod
521520 def apply (cls , template , extractors ):
@@ -558,8 +557,8 @@ def apply(cls, template, extractors):
558557 if name == extractor .annotation .surrounds_attribute :
559558 listing_data_extractors .append (extractor )
560559 extractors .remove (extractor )
561- record , mapping = mdr .extract (candidate )
562- cls ._propagate_annotations (mapping )
560+ record , mappings = mdr .extract (candidate )
561+ cls ._propagate_annotations (mappings )
563562 return cls (template .token_dict , cls ._get_candidate_xpath (doc , candidate ), record , listing_data_extractors ), extractors
564563
565564 return None , extractors
@@ -605,16 +604,16 @@ def _get_common_ancestor_xpath(doc, elements):
605604 return "/" .join (common_prefix (* [doc .getpath (elem ).split ('/' ) for elem in elements ]))
606605
@staticmethod
def _propagate_annotations(mappings):
    """Share the ``data-scrapy-annotate`` attribute within each element pair.

    ``mappings`` is walked as an outer mapping whose values are themselves
    mappings of ``element -> target element``.  Both elements of a pair
    must expose a dict-like ``attrib``.  For every pair, the first truthy
    ``data-scrapy-annotate`` value found (key element first, then its
    target) is written onto both elements.  Pairs where neither element
    carries the attribute are left untouched.

    The elements are modified in place; nothing is returned.
    """
    attr_name = 'data-scrapy-annotate'
    # Only the inner mappings are needed, so the outer keys are not bound.
    # values()/items() are available on Python 2 and Python 3 mappings,
    # whereas iteritems() exists on Python 2 only.
    for mapping in mappings.values():
        for elem, targ_elem in mapping.items():
            pair = (elem, targ_elem)
            # Lazily scan the pair so the target is consulted only when
            # the key element has no (truthy) annotation of its own.
            annotation = next(
                (value
                 for value in (_elem.attrib.get(attr_name) for _elem in pair)
                 if value),
                None)
            if annotation:
                for _elem in pair:
                    _elem.attrib[attr_name] = annotation
618617
def __repr__(self):
    """Return a debug string with this extractor's xpath and extractors."""
    details = (self.xpath, self.extractors)
    return "MdrExtractor(%s %r)" % details
0 commit comments