spider-inline-requests

Posted linpd

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了spider-inline-requests相关的知识,希望对你有一定的参考价值。

  1 # -*- coding: utf-8 -*-
  2 import re
  3 import time
  4 
  5 import scrapy
  6 from inline_requests import inline_requests
  7 from requests_toolbelt import MultipartEncoder
  8 
  9 from tracking.items import TrackingBlNoItem, TrackingEventItem, TrackingContainerNoItem
 10 
 11 
 12 class OOCLSpider(scrapy.Spider):
 13     name = OOCL
 14     allowed_domains = [moc.oocl.com]
 15     start_urls = [http://moc.oocl.com/]
 16 
 17     def start_requests(self):
 18         bl_no = 2104523720
 19         post_url = http://moc.oocl.com/party/cargotracking/ct_search_from_other_domain.jsf
 20         form_data = {
 21             "ANONYMOUS_BEHAVIOR": "BUILD_UP",
 22             "domainName": "PARTY_DOMAIN",
 23             "ENTRY_TYPE": "OOCL",
 24             "ENTRY": "MCC",
 25             "ctSearchType": "BL",
 26             "ctShipmentNumber": bl_no
 27         }
 28 
 29         header = {
 30             "HOST": "moc.oocl.com",
 31             "Cache-Control": "max-age=0",
 32             "Origin": "http://localhost:63343",
 33             "Upgrade-Insecure-Requests": "1",
 34             "Content-Type": "application/x-www-form-urlencoded",
 35             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
 36             Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,
 37             "Referer": "http://localhost:63343/yqn_standard_shipping_info/test2.html?_ijt=hj7nk9lapnrpo3k5unameggs59",
 38             # "Accept-Encoding": "gzip, deflate",
 39             Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6
 40         }
 41         yield scrapy.FormRequest(
 42             url=post_url,
 43             formdata=form_data,
 44             headers=header,
 45             callback=self.parse
 46         )
 47 
 48     def parse(self, response):
 49         bl_no = 2104523720
 50         post_url = http://moc.oocl.com
 51         html_str = response.text
 52         # cookies = response.headers.getlist(‘Set-Cookie‘)[0]
 53         # print(str(cookies))
 54         # session = re.findall(‘(?<=JSESSIONID=)[^;]*‘, str(cookies), re.S)[0]
 55         # str(response.headers.getlist(‘Set-Cookie‘))
 56         url_plus = re.findall(ANONYMOUS_TOKEN=.*&ENTRY=.*&PREFER_LANGUAGE=[^"]*, html_str, re.S)[0]
 57         form = response.xpath(//*[@id="hiddenForm"])
 58         refer = form.xpath(./@action)[0].extract()
 59         post_url = post_url + refer + "?" + url_plus
 60         input_list = form.xpath(./input)
 61         form_data = dict()
 62         UT = []
 63         form_data[USER_TOKEN] = UT
 64         for input_item in input_list:
 65             name = input_item.xpath(./@name)
 66             name_str = name[0].extract()
 67             if name_str != USER_TOKEN:
 68                 value_str = ‘‘
 69                 value = input_item.xpath(./@value)
 70                 if len(value) > 0:
 71                     value_str = value[0].extract()
 72                 form_data[name_str] = value_str
 73             else:
 74                 value_str = ‘‘
 75                 value = input_item.xpath(./@value)
 76                 if len(value) > 0:
 77                     value_str = value[0].extract()
 78                 UT.append(value_str)
 79 
 80         form_data["hiddenForm:searchType"] = "BL"
 81         form_data["hiddenForm:billOfLadingNumber"] = bl_no
 82         form_data["hiddenForm:supportUtfChars"] = "true"
 83         form_data["hiddenForm:_link_hidden_"] = "hiddenForm:goToCargoTrackingBL"
 84         header = {
 85             "HOST": "moc.oocl.com",
 86             "Cache-Control": "max-age=0",
 87             "Origin": "http://moc.oocl.com",
 88             "Upgrade-Insecure-Requests": "1",
 89             "Content-Type": "application/x-www-form-urlencoded",
 90             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
 91             Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,
 92             "Referer": "http://moc.oocl.com/party/cargotracking/ct_search_from_other_domain.jsf",
 93             # "Accept-Encoding": "gzip, deflate",
 94             Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6
 95         }
 96 
 97         yield scrapy.FormRequest(
 98             url=post_url,
 99             formdata=form_data,
100             headers=header,
101             # cookies={"JSESSIONID": session},
102             callback=self.parse_container
103         )
104     # 哪里使用for循环,装饰器就放在那个方法头上
105     @inline_requests
106     def parse_container(self, response):
109         # 出发港口
110         depart = response.xpath(//*[@id="form:PORLocation0"]/text())[0].extract()
111         # 装货港
112         load = response.xpath(//*[@id="form:POLLocation0"]/text())[0].extract()
113         vessel_voyage = response.xpath(//*[@id="eventListTable"]/tr/td[5]/text()).extract_first()
114         info = vessel_voyage.split()
115         # 船名
116         vessel = info[2]
117         # 航次
118         voyage = info[3]
119         # 卸货港
120         discharge = response.xpath(//*[@id="form:PortOfDischarge0"]/text()).extract_first()
121         # 目的港
122         arrive = response.xpath(//*[@id="form:finalDestination0"]/text()).extract_first()
123 
124         # 表单
125         bl_item = TrackingBlNoItem()
126         bl_item[bl_no] = 2104523720
127         bl_item[depart] = depart
128         bl_item[load] = load
129         bl_item[arrive] = arrive
130         bl_item[discharge] = discharge
131         bl_item[carrier_id] = 4558
132         bl_item[carrier_code] = OOCL
133         bl_item[vessel_name] = vessel
134         bl_item[voyage] = voyage
135         bl_item[request_url] = response.url
136         bl_item[weight] = ‘‘
137         bl_item[pieces] = ‘‘
138         bl_item[volume] = ‘‘
139         bl_item[version] = ‘‘
140         bl_item[event_list] = []
141         container_list = []
142         container_info_tr_list = response.xpath(//*[@id="summaryTable"]/tr)
143         no = 0
144         for tr in container_info_tr_list[3:]:
145             container_info = TrackingContainerNoItem()
146             # 箱大小
147             container_info[container_size] = ‘‘
148             # 箱类型
149             container_info[container_type] = ‘‘
150             # 封号
151             container_info[seal_no] = ‘‘
152             # 箱号
153             container_info[container_no] = tr.xpath(./td/a/text()).extract_first().split(-)[0]
154             # 事件
155             container_info[event_list] = []
156             container_list.append(container_info)
157 
158             # # 增加箱事件   发起post请求
159             # # 获取post_url 的动态参数token
160             ANONYMOUS_TOKEN = response.xpath(//*[@id="form"]/input[1]/@value).extract_first()
161             jsf_tree_64 = response.xpath(//*[@id="jsf_tree_64"]/@value).extract_first()
162             jsf_state_64 = response.xpath(//*[@id="jsf_state_64"]/@value).extract_first()
163             jsf_viewid = response.xpath(//*[@id="jsf_viewid"]/@value).extract_first()
164             ANONYMOUS_BEHAVIOR = response.xpath(//*[@id="ANONYMOUS_BEHAVIOR"]/@value).extract_first()
165             container_no = container_info[container_no]
166             form_link = form:link + str(no)
167             no += 1
168             token = ANONYMOUS_TOKEN
169             post_url = http://moc.oocl.com/party/cargotracking/ct_result_bl.jsf?&ANONYMOUS_TOKEN=%s& 170                        ENTRY=MCC&ENTRY_TYPE=OOCL&PREFER_LANGUAGE=en-US % token
171             headers = {
172                 Host: moc.oocl.com,
173                 # ‘Content-Length‘: ‘12911‘,
174                 Pragma: no-cache,
175                 Cache-Control: no-cache,
176                 Origin: http://moc.oocl.com,
177                 Upgrade-Insecure-Requests: 1,
178                 # ‘Content-Type‘: ‘multipart/form-data; boundary=----WebKitFormBoundary6tLY0zPjk3DPKQIr‘,
179                 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36,
180                 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,
181                 Referer: http://moc.oocl.com/party/cargotracking/ct_result_bl.jsf?&ANONYMOUS_TOKEN=LxzcmeKTFHdxDMLieyzHMCCOOCL&ENTRY=MCC&ENTRY_TYPE=OOCL&PREFER_LANGUAGE=en-US,
182                 # ‘Accept-Encoding‘: ‘gzip, deflate‘,
183                 Accept-Language: zh-CN,zh;q=0.9,
184                 # ‘Cookie‘: ‘JSESSIONID=7a8fc465850fcd03590d06ebbd340ea583a91689025a7e658861bdf4a57632cf.e38NbxiLa3qOci0Oa3eQaxmSci0; AcceptCookie=yes; BIGipServerpool_moc=737985227.25886.0000; _ga=GA1.3.1816166690.1545736626; _gid=GA1.3.133395829.1545736626; _mkto_trk=id:098-DFA-145&token:_mch-oocl.com-1545736627194-22944; WT_FPC=id=2aeb5802f8c5bd76dbc1545736626441:lv=1545830754900:ss=1545829378448‘,
185                 Connection: keep-alive,
186             }
187             multipart = {
188                 ANONYMOUS_TOKEN: ANONYMOUS_TOKEN,
189                 ENTRY: MCC,
190                 ENTRY_TYPE: OOCL,
191                 PREFER_LANGUAGE: en-US,
192                 ANONYMOUS_BEHAVIOR: ANONYMOUS_BEHAVIOR,
193                 form:forward_uri: ‘‘,
194                 form:rtContainerNumberValueID: ‘‘,
195                 form:crossDomainSaveStateMapString: ‘‘,
196                 form_SUBMIT: 1,
197                 searchCriteriaSearchTypeCode: BL,
198                 searchCriteriaBookingNumber: ‘‘,
199                 searchCriteriaContainerNumbers: ‘‘,
200                 form:_link_hidden_: form_link,
201                 searchCriteriaBillOfLadingNumber: bl_item[bl_no],
202                 currentContainerNumber: container_no,
203                 jsf_tree_64: jsf_tree_64,
204                 jsf_state_64: jsf_state_64,
205                 jsf_viewid: jsf_viewid,
206             }
207             response = yield scrapy.FormRequest(
208                 url=post_url,
209                 headers=headers,
210                 formdata=multipart,
211                 dont_filter=True,
212             )
          # 内联请求的回调函数 215 self.parse_container_event(response, container_info, vessel, voyage) 216      # yield 的返回必须在 for 循环结束 217 yield { 218 bl_item: bl_item, 219 container_list: container_list 220 } 221 222 def parse_container_event(self, response, container_info, vessel, voyage): 223 event_info_tr_list = response.xpath(//*[@id="eventListTable"]/tr) 224 for event_tr in event_info_tr_list[3:]: 225 event = TrackingEventItem() 226 # 船名 227 event[vessel_name] = vessel 228 # 航次 229 event[voyage] = voyage 230 # 事件名称 231 event[event_name] = ‘‘ 232 # 事件 code 233 event[event_code] = ‘‘ 234 # 事件描述 235 event[event_description] = event_tr.xpath(./td/text()).extract_first() 236 if event[event_description] is not None: 237 event[event_description] = event[event_description].strip() 238 else: 239 event[event_description] = ‘‘ 240 241 # 修改时间日期格式 242 def transform_format_event_data(event_time): 243 event_date_list = event_time.split(,) 244 event_time = event_date_list[0] + + event_date_list[1].lstrip().split()[0] 245 to_event_time = time.strptime(event_time, "%d %b %Y %H:%M") 246 event_time = time.strftime("%Y-%m-%d %H:%M:%S", to_event_time) 247 return event_time 248 249 # 事件预计时间 250 event[expected_time] = ‘‘ 251 if event[expected_time]: 252 event[expected_time] = transform_format_event_data(event[expected_time]) 253 # 事件实际时间 254 event[actual_time] = event_tr.xpath(./td[5]/span/text()).extract_first() 255 if event[actual_time]: 256 event[actual_time] = transform_format_event_data(event[actual_time]) 257 # 事件位置 258 event[location_name] = event_tr.xpath(./td[3]/span/text()).extract_first() 259 # 位置类型 260 event[location_type] = ‘‘ 261 # 位置 code 262 event[location_code] = ‘‘ 263 container_info[event_list].append(event)

 

以上是关于spider-inline-requests的主要内容,如果未能解决你的问题,请参考以下文章:

代码片|水波纹

代码片--练习匿名内部类

一个简单的时间片轮转内核代码的分析(课程作业)

markdown 放代码片

代码片-下拉树实现

用java给html文件添加必要的控制html代码片