spider-inline-requests
Posted linpd
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了spider-inline-requests相关的知识,希望对你有一定的参考价值。
# -*- coding: utf-8 -*-
import re
import time

import scrapy
from inline_requests import inline_requests
from requests_toolbelt import MultipartEncoder

from tracking.items import TrackingBlNoItem, TrackingEventItem, TrackingContainerNoItem


class OOCLSpider(scrapy.Spider):
    """Track an OOCL bill of lading and the events of each of its containers.

    Flow:
      1. ``start_requests`` posts the cargo-tracking search form.
      2. ``parse`` re-submits the hidden form found in the returned page.
      3. ``parse_container`` reads the B/L summary, then, for every container
         row, issues an inline POST and hands the answer to
         ``parse_container_event``.

    The bill-of-lading number defaults to ``bl_no`` below and can be overridden
    per run with ``scrapy crawl OOCL -a bl_no=<number>``.
    """

    name = 'OOCL'
    allowed_domains = ['moc.oocl.com']
    start_urls = ['http://moc.oocl.com/']

    # Bill-of-lading number to look up (single source of truth for all methods).
    bl_no = '2104523720'

    def start_requests(self):
        """Yield the initial search POST for ``self.bl_no``."""
        post_url = 'http://moc.oocl.com/party/cargotracking/ct_search_from_other_domain.jsf'
        form_data = {
            "ANONYMOUS_BEHAVIOR": "BUILD_UP",
            "domainName": "PARTY_DOMAIN",
            "ENTRY_TYPE": "OOCL",
            "ENTRY": "MCC",
            "ctSearchType": "BL",
            "ctShipmentNumber": self.bl_no,
        }
        header = {
            "HOST": "moc.oocl.com",
            "Cache-Control": "max-age=0",
            "Origin": "http://localhost:63343",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            "Referer": "http://localhost:63343/yqn_standard_shipping_info/test2.html?_ijt=hj7nk9lapnrpo3k5unameggs59",
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6',
        }
        yield scrapy.FormRequest(
            url=post_url,
            formdata=form_data,
            headers=header,
            callback=self.parse,
        )

    def parse(self, response):
        """Re-submit the page's ``hiddenForm`` to reach the B/L result page.

        The target URL is the form's ``action`` plus the query string
        (``ANONYMOUS_TOKEN=...&ENTRY=...&PREFER_LANGUAGE=...``) found in the
        page text. Every ``<input>`` of the form is copied into the POST body;
        ``USER_TOKEN`` may occur several times, so its values are collected in
        a list. Yields nothing (and logs an error) when the form or the query
        string is missing.
        """
        query_match = re.search(
            'ANONYMOUS_TOKEN=.*&ENTRY=.*&PREFER_LANGUAGE=[^"]*', response.text, re.S)
        form = response.xpath('//*[@id="hiddenForm"]')
        action = form.xpath('./@action').extract_first()
        if query_match is None or action is None:
            self.logger.error('hiddenForm or token query string not found in %s', response.url)
            return

        post_url = 'http://moc.oocl.com' + action + "?" + query_match.group(0)

        user_tokens = []
        # Inserted first so USER_TOKEN keeps its position at the head of the body.
        form_data = {'USER_TOKEN': user_tokens}
        for input_item in form.xpath('./input'):
            name_str = input_item.xpath('./@name').extract_first()
            if name_str is None:
                # An input without a name cannot be submitted.
                continue
            value_str = input_item.xpath('./@value').extract_first(default='')
            if name_str == 'USER_TOKEN':
                user_tokens.append(value_str)
            else:
                form_data[name_str] = value_str

        form_data["hiddenForm:searchType"] = "BL"
        form_data["hiddenForm:billOfLadingNumber"] = self.bl_no
        form_data["hiddenForm:supportUtfChars"] = "true"
        form_data["hiddenForm:_link_hidden_"] = "hiddenForm:goToCargoTrackingBL"
        header = {
            "HOST": "moc.oocl.com",
            "Cache-Control": "max-age=0",
            "Origin": "http://moc.oocl.com",
            "Upgrade-Insecure-Requests": "1",
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            "Referer": "http://moc.oocl.com/party/cargotracking/ct_search_from_other_domain.jsf",
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6',
        }

        yield scrapy.FormRequest(
            url=post_url,
            formdata=form_data,
            headers=header,
            callback=self.parse_container,
        )

    # The decorator goes on the method that contains the request loop.
    @inline_requests
    def parse_container(self, response):
        """Build the B/L item and, per container row, fetch its events inline.

        Yields one dict ``{'bl_item': ..., 'container_list': ...}`` after all
        container requests have completed. Fields that cannot be found on the
        page are left as ``None`` (ports) or ``''`` (vessel / voyage).
        """
        # Place of receipt (departure port)
        depart = response.xpath('//*[@id="form:PORLocation0"]/text()').extract_first()
        # Port of loading
        load = response.xpath('//*[@id="form:POLLocation0"]/text()').extract_first()
        vessel_voyage = response.xpath('//*[@id="eventListTable"]/tr/td[5]/text()').extract_first()
        info = (vessel_voyage or '').split()
        # Vessel name
        vessel = info[2] if len(info) > 2 else ''
        # Voyage
        voyage = info[3] if len(info) > 3 else ''
        # Port of discharge
        discharge = response.xpath('//*[@id="form:PortOfDischarge0"]/text()').extract_first()
        # Final destination
        arrive = response.xpath('//*[@id="form:finalDestination0"]/text()').extract_first()

        # Bill-of-lading item
        bl_item = TrackingBlNoItem()
        bl_item['bl_no'] = self.bl_no
        bl_item['depart'] = depart
        bl_item['load'] = load
        bl_item['arrive'] = arrive
        bl_item['discharge'] = discharge
        bl_item['carrier_id'] = 4558
        bl_item['carrier_code'] = 'OOCL'
        bl_item['vessel_name'] = vessel
        bl_item['voyage'] = voyage
        bl_item['request_url'] = response.url
        bl_item['weight'] = ''
        bl_item['pieces'] = ''
        bl_item['volume'] = ''
        bl_item['version'] = ''
        bl_item['event_list'] = []

        container_list = []
        # Rows are taken from the result page once, before `response` is rebound.
        container_rows = response.xpath('//*[@id="summaryTable"]/tr')[3:]
        link_index = 0
        for tr in container_rows:
            link_text = tr.xpath('./td/a/text()').extract_first()
            if link_text is None:
                # Row carries no container link; nothing to request for it.
                # NOTE(review): assumes `form:linkN` numbers only rows that have
                # a link - confirm against the live page.
                continue

            container_info = TrackingContainerNoItem()
            # Container size
            container_info['container_size'] = ''
            # Container type
            container_info['container_type'] = ''
            # Seal number
            container_info['seal_no'] = ''
            # Container number (text before the first '-')
            container_info['container_no'] = link_text.split('-')[0]
            # Events, filled in by parse_container_event
            container_info['event_list'] = []
            container_list.append(container_info)

            # Collect the dynamic token / JSF state needed for the events POST.
            anonymous_token = response.xpath('//*[@id="form"]/input[1]/@value').extract_first()
            jsf_tree_64 = response.xpath('//*[@id="jsf_tree_64"]/@value').extract_first()
            jsf_state_64 = response.xpath('//*[@id="jsf_state_64"]/@value').extract_first()
            jsf_viewid = response.xpath('//*[@id="jsf_viewid"]/@value').extract_first()
            anonymous_behavior = response.xpath('//*[@id="ANONYMOUS_BEHAVIOR"]/@value').extract_first()
            container_no = container_info['container_no']
            form_link = 'form:link' + str(link_index)
            link_index += 1

            # Parenthesised so both literals are joined BEFORE `%` is applied.
            post_url = (
                'http://moc.oocl.com/party/cargotracking/ct_result_bl.jsf?&ANONYMOUS_TOKEN=%s&'
                'ENTRY=MCC&ENTRY_TYPE=OOCL&PREFER_LANGUAGE=en-US'
            ) % anonymous_token
            headers = {
                'Host': 'moc.oocl.com',
                'Pragma': 'no-cache',
                'Cache-Control': 'no-cache',
                'Origin': 'http://moc.oocl.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Referer': 'http://moc.oocl.com/party/cargotracking/ct_result_bl.jsf?&ANONYMOUS_TOKEN=LxzcmeKTFHdxDMLieyzHMCCOOCL&ENTRY=MCC&ENTRY_TYPE=OOCL&PREFER_LANGUAGE=en-US',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
            }
            multipart = {
                'ANONYMOUS_TOKEN': anonymous_token,
                'ENTRY': 'MCC',
                'ENTRY_TYPE': 'OOCL',
                'PREFER_LANGUAGE': 'en-US',
                'ANONYMOUS_BEHAVIOR': anonymous_behavior,
                'form:forward_uri': '',
                'form:rtContainerNumberValueID': '',
                'form:crossDomainSaveStateMapString': '',
                'form_SUBMIT': '1',
                'searchCriteriaSearchTypeCode': 'BL',
                'searchCriteriaBookingNumber': '',
                'searchCriteriaContainerNumbers': '',
                'form:_link_hidden_': form_link,
                'searchCriteriaBillOfLadingNumber': bl_item['bl_no'],
                'currentContainerNumber': container_no,
                'jsf_tree_64': jsf_tree_64,
                'jsf_state_64': jsf_state_64,
                'jsf_viewid': jsf_viewid,
            }
            # NOTE(review): `response` is rebound here, so the next iteration
            # reads its token / JSF state from this detail page rather than
            # from the original result page. Kept as in the original - confirm
            # this is intended.
            response = yield scrapy.FormRequest(
                url=post_url,
                headers=headers,
                formdata=multipart,
                dont_filter=True,
            )
            # "Callback" of the inline request: fill this container's events.
            self.parse_container_event(response, container_info, vessel, voyage)

        # The result must be yielded only after the loop has finished.
        yield {
            'bl_item': bl_item,
            'container_list': container_list,
        }

    @staticmethod
    def _format_event_time(event_time):
        """Convert ``'<d> <Mon> <YYYY>, <HH:MM> ...'`` to ``'YYYY-MM-DD HH:MM:SS'``.

        Only the text before the first comma and the first whitespace-separated
        token after it are used. Raises ``IndexError`` / ``ValueError`` when
        the input does not have that shape.
        """
        pieces = event_time.split(',')
        stamp = pieces[0].strip() + ' ' + pieces[1].split()[0]
        parsed = time.strptime(stamp, "%d %b %Y %H:%M")
        return time.strftime("%Y-%m-%d %H:%M:%S", parsed)

    def parse_container_event(self, response, container_info, vessel, voyage):
        """Append one ``TrackingEventItem`` per event row to ``container_info``.

        Rows are read from ``eventListTable`` starting at the fourth ``<tr>``.
        ``vessel`` and ``voyage`` are copied onto every event. The result is
        written into ``container_info['event_list']``; nothing is returned.
        """
        for event_tr in response.xpath('//*[@id="eventListTable"]/tr')[3:]:
            event = TrackingEventItem()
            # Vessel name
            event['vessel_name'] = vessel
            # Voyage
            event['voyage'] = voyage
            # Event name
            event['event_name'] = ''
            # Event code
            event['event_code'] = ''
            # Event description
            description = event_tr.xpath('./td/text()').extract_first()
            event['event_description'] = description.strip() if description is not None else ''
            # Expected time (not available from this table)
            event['expected_time'] = ''
            # Actual time
            actual_time = event_tr.xpath('./td[5]/span/text()').extract_first()
            if actual_time:
                try:
                    actual_time = self._format_event_time(actual_time)
                except (IndexError, ValueError):
                    # Keep the raw text rather than losing the whole container.
                    self.logger.warning('Unrecognised event time %r', actual_time)
            event['actual_time'] = actual_time
            # Event location
            event['location_name'] = event_tr.xpath('./td[3]/span/text()').extract_first()
            # Location type
            event['location_type'] = ''
            # Location code
            event['location_code'] = ''
            container_info['event_list'].append(event)