第一次爬虫
Posted hkhssg
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了第一次爬虫相关的知识,希望对你有一定的参考价值。
1.实例代码
a.预测球队比赛结果代码
def GameOver(a, b):
    """Return True when a game with the scores ``a`` and ``b`` is over.

    Two situations end the game:

    * both sides have at least 10 points and the scores differ by exactly 2;
    * at least one side has fewer than 10 points and either side has
      exactly 11 points.

    Args:
        a: Current score of the first side.
        b: Current score of the second side.

    Returns:
        bool: True if the game has ended, False otherwise.
    """
    if a >= 10 and b >= 10:
        # Both sides reached 10: the game only ends on a two-point lead.
        return abs(a - b) == 2
    # At least one side is below 10: the first side to hit 11 wins.
    return a == 11 or b == 11
2、使用requests库的get()函数访问百度网页20次并且打印返回状态,text内容,计算text属性和content属性所返回网页内容的长度
# Fetch one page with requests and show how the response encoding affects
# the decoded text, then compare the text length with the raw byte length.
import requests

response = requests.get("http://www.google.cn/")
print(type(response))

status_code = response.status_code
print(status_code)

# Text decoded with whatever encoding requests picked for the response.
text = response.text
print(text)

encoding = response.encoding
print(encoding)

# Chained assignment: sets the response encoding to UTF-8 and keeps the same
# value in encoding1, so the next read of response.text is decoded as UTF-8.
encoding1 = response.encoding = 'utf-8'
print(encoding1)

text1 = response.text
print("text内容为:{}".format(text1))
print("\n")
print("text内容长度为:{}".format(len(text1)))
# The original referenced an undefined name `r` here; the response object in
# this script is called `response`.
print("content内容长度为:{}".format(len(response.content)))
<class ‘requests.models.Response‘>
200
<!DOCTYPE html>
<html lang="zh">
<meta charset="utf-8">
<title>Google</title>
<style>
html { background: #fff; margin: 0 1em; }
body { font: .8125em/1.5 arial, sans-serif; text-align: center; }
h1 { font-size: 1.5em; font-weight: normal; margin: 1em 0 0; }
p#footer { color: #767676; font-size: .77em; }
p#footer a { background: url(//www.google.cn/intl/zh-CN_cn/images/cn_icp.gif) top right no-repeat; padding: 5px 20px 5px 0; }
ul { margin: 2em; padding: 0; }
li { display: inline; padding: 0 2em; }
div { -moz-border-radius: 20px; -webkit-border-radius: 20px; border: 1px solid #ccc; border-radius: 20px; margin: 2em auto 1em; max-width: 650px; min-width: 544px; }
div:hover, div:hover * { cursor: pointer; }
div:hover { border-color: #999; }
div p { margin: .5em 0 1.5em; }
img { border: 0; }
</style>
<div>
<a href="http://www.google.com.hk/webhp?hl=zh-CN&sourceid=cnhp">
<img src="//www.google.cn/landing/cnexp/google-search.png" alt="Google" width="586" height="257">
</a>
<h1><a href="http://www.google.com.hk/webhp?hl=zh-CN&sourceid=cnhp"><strong id="target">google.com.hk</strong></a></h1>
<p>请æ?¶è??æ??们ç??ç½?å??
</div>
<ul>
<li><a href="http://translate.google.cn/?sourceid=cnhp">翻�</a>
</ul>
<p id="footer">©2011 - <a href="http://www.miibeian.gov.cn/">ICPè¯?å??å?B2-20070004å?·</a>
<script nonce="0qYFPrpj6kdYUM03_qP12w">
var gcn=gcn||{};gcn.IS_IMAGES=(/images.google.cn/.exec(window.location)||window.location.hash==‘#images‘||window.location.hash==‘images‘);gcn.HOMEPAGE_DEST=‘http://www.google.com.hk/webhp?hl=zh-CN&sourceid=cnhp‘;gcn.IMAGES_DEST=‘http://images.google.com.hk/imghp?‘+‘hl=zh-CN&sourceid=cnhp‘;gcn.DEST_URL=gcn.IS_IMAGES?gcn.IMAGES_DEST:gcn.HOMEPAGE_DEST;gcn.READABLE_HOMEPAGE_URL=‘google.com.hk‘;gcn.READABLE_IMAGES_URL=‘images.google.com.hk‘;gcn.redirectIfLocationHasQueryParams=function(){if(window.location.search&&/google.cn/.exec(window.location)&&!/webhp/.exec(window.location)){window.location=String(window.location).replace(‘google.cn‘,‘google.com.hk‘)}}();gcn.replaceHrefsWithImagesUrl=function(){if(gcn.IS_IMAGES){var a=document.getElementsByTagName(‘a‘);for(var i=0,len=a.length;i<len;i++){if(a[i].href==gcn.HOMEPAGE_DEST){a[i].href=gcn.IMAGES_DEST}}}}();gcn.listen=function(a,e,b){if(a.addEventListener){a.addEventListener(e,b,false)}else if(a.attachEvent){var r=a.attachEvent(‘on‘+e,b);return r}};gcn.stopDefaultAndProp=function(e){if(e&&e.preventDefault){e.preventDefault()}else if(window.event&&window.event.returnValue){window.eventReturnValue=false;return false}if(e&&e.stopPropagation){e.stopPropagation()}else if(window.event&&window.event.cancelBubble){window.event.cancelBubble=true;return false}};gcn.resetChildElements=function(a){var b=a.childNodes;for(var i=0,len=b.length;i<len;i++){gcn.listen(b[i],‘click‘,gcn.stopDefaultAndProp)}};gcn.redirect=function(){window.location=gcn.DEST_URL};gcn.setInnerHtmlInEl=function(a){if(gcn.IS_IMAGES){var b=document.getElementById(a);if(b){b.innerHTML=b.innerHTML.replace(gcn.READABLE_HOMEPAGE_URL,gcn.READABLE_IMAGES_URL)}}};
gcn.listen(document, ‘click‘, gcn.redirect);
gcn.setInnerHtmlInEl(‘target‘);
</script>
ISO-8859-1
utf-8
text内容为:<!DOCTYPE html>
<html lang="zh">
<meta charset="utf-8">
<title>Google</title>
<style>
html { background: #fff; margin: 0 1em; }
body { font: .8125em/1.5 arial, sans-serif; text-align: center; }
h1 { font-size: 1.5em; font-weight: normal; margin: 1em 0 0; }
p#footer { color: #767676; font-size: .77em; }
p#footer a { background: url(//www.google.cn/intl/zh-CN_cn/images/cn_icp.gif) top right no-repeat; padding: 5px 20px 5px 0; }
ul { margin: 2em; padding: 0; }
li { display: inline; padding: 0 2em; }
div { -moz-border-radius: 20px; -webkit-border-radius: 20px; border: 1px solid #ccc; border-radius: 20px; margin: 2em auto 1em; max-width: 650px; min-width: 544px; }
div:hover, div:hover * { cursor: pointer; }
div:hover { border-color: #999; }
div p { margin: .5em 0 1.5em; }
img { border: 0; }
</style>
<div>
<a href="http://www.google.com.hk/webhp?hl=zh-CN&sourceid=cnhp">
<img src="//www.google.cn/landing/cnexp/google-search.png" alt="Google" width="586" height="257">
</a>
<h1><a href="http://www.google.com.hk/webhp?hl=zh-CN&sourceid=cnhp"><strong id="target">google.com.hk</strong></a></h1>
<p>请收藏我们的网址
</div>
<ul>
<li><a href="http://translate.google.cn/?sourceid=cnhp">翻译</a>
</ul>
<p id="footer">©2011 - <a href="http://www.miibeian.gov.cn/">ICP证合字B2-20070004号</a>
<script nonce="0qYFPrpj6kdYUM03_qP12w">
var gcn=gcn||{};gcn.IS_IMAGES=(/images.google.cn/.exec(window.location)||window.location.hash==‘#images‘||window.location.hash==‘images‘);gcn.HOMEPAGE_DEST=‘http://www.google.com.hk/webhp?hl=zh-CN&sourceid=cnhp‘;gcn.IMAGES_DEST=‘http://images.google.com.hk/imghp?‘+‘hl=zh-CN&sourceid=cnhp‘;gcn.DEST_URL=gcn.IS_IMAGES?gcn.IMAGES_DEST:gcn.HOMEPAGE_DEST;gcn.READABLE_HOMEPAGE_URL=‘google.com.hk‘;gcn.READABLE_IMAGES_URL=‘images.google.com.hk‘;gcn.redirectIfLocationHasQueryParams=function(){if(window.location.search&&/google.cn/.exec(window.location)&&!/webhp/.exec(window.location)){window.location=String(window.location).replace(‘google.cn‘,‘google.com.hk‘)}}();gcn.replaceHrefsWithImagesUrl=function(){if(gcn.IS_IMAGES){var a=document.getElementsByTagName(‘a‘);for(var i=0,len=a.length;i<len;i++){if(a[i].href==gcn.HOMEPAGE_DEST){a[i].href=gcn.IMAGES_DEST}}}}();gcn.listen=function(a,e,b){if(a.addEventListener){a.addEventListener(e,b,false)}else if(a.attachEvent){var r=a.attachEvent(‘on‘+e,b);return r}};gcn.stopDefaultAndProp=function(e){if(e&&e.preventDefault){e.preventDefault()}else if(window.event&&window.event.returnValue){window.eventReturnValue=false;return false}if(e&&e.stopPropagation){e.stopPropagation()}else if(window.event&&window.event.cancelBubble){window.event.cancelBubble=true;return false}};gcn.resetChildElements=function(a){var b=a.childNodes;for(var i=0,len=b.length;i<len;i++){gcn.listen(b[i],‘click‘,gcn.stopDefaultAndProp)}};gcn.redirect=function(){window.location=gcn.DEST_URL};gcn.setInnerHtmlInEl=function(a){if(gcn.IS_IMAGES){var b=document.getElementById(a);if(b){b.innerHTML=b.innerHTML.replace(gcn.READABLE_HOMEPAGE_URL,gcn.READABLE_IMAGES_URL)}}};
gcn.listen(document, ‘click‘, gcn.redirect);
gcn.setInnerHtmlInEl(‘target‘);
</script>
text内容长度为:3216
content内容长度为:3244
3、HTML页面
# Parse a small inline HTML document with BeautifulSoup and print a few
# elements from it.
import requests
from bs4 import BeautifulSoup

# The markup spans several lines, so it needs a triple-quoted literal.
# The <title> and </tr> tags are closed properly so the parser sees the
# intended structure.
soup = BeautifulSoup("""<!DOCTYPE html><html><head><meta charset='utf-8'>
<title>菜鸟教程(rounoob.com)</title></head><body>
<h1>我的第一标题</h1>
<p id='first'>我的第一个段落。</p></body>
<table border='1'><tr><td>row 1,cell 1</td><td>row 1,cell 2</td></tr><tr><td>row 2,cell 1</td><td>row 2,cell 2</td></tr></table></html>""", "lxml")

# The <head> element, followed by the literal string "36".
print(soup.head, "36")
# The <body> element.
print(soup.body)
# Every element whose id attribute is "first".
print(soup.find_all(id="first"))
# Text of the first <h1> and of the first <p>.
print(soup.h1.string, soup.p.string)
import bs4
import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The decoded page text, or "" when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Must be *called*; a bare attribute reference never raises, so HTTP
        # error pages would otherwise be returned as if they were valid.
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""
def fillUnivList(ulist, html):
    """Append one entry per table row found in ``html`` to ``ulist``.

    The first <tbody> of the document is scanned; for each child tag the
    ``.string`` values of its first four <td> cells are appended to
    ``ulist`` as a four-element list.

    Args:
        ulist: List that receives the extracted rows (modified in place).
        html: HTML source text to parse.

    Returns:
        None. ``ulist`` is left unchanged when the document has no <tbody>.
    """
    soup = BeautifulSoup(html, "lxml")
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page: nothing to extract.
        return
    for tr in tbody.children:
        # .children also yields text nodes (whitespace between rows).
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            if len(tds) < 4:
                # Not a full data row; skip instead of raising IndexError.
                continue
            ulist.append([td.string for td in tds[:4]])
def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; the first four items of each row are
            printed as rank, school name, province and total score.
        num: Maximum number of rows to print. Fewer rows are printed when
            ``ulist`` is shorter; nothing but the header is printed when
            ``num`` is zero or negative.

    Returns:
        None. Output goes to standard output.
    """
    tplt = "{0:^6} {1:{4}^10} {2:^10} {3:^10}"
    # U+3000 (full-width space) is used as the fill character for the
    # school-name column, passed to the template as argument 4.
    pad = chr(12288)
    print(tplt.format("排名", "学校名称", "省份", "总分", pad))
    # Slice instead of indexing range(num) so a short list cannot raise
    # IndexError.
    for u in ulist[:max(num, 0)]:
        # None cannot be formatted with an alignment spec; print it as empty.
        cells = ["" if c is None else c for c in u[:4]]
        print(tplt.format(cells[0], cells[1], cells[2], cells[3], pad))
def main():
    """Fetch the ranking page, parse it, and print up to the first 30 rows."""
    uinfo = []
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2017.html"
    html = getHTMLText(url)
    if not html:
        # getHTMLText returns "" when the download fails; there is nothing
        # to parse in that case.
        print("页面获取失败:{}".format(url))
        return
    fillUnivList(uinfo, html)
    # Never ask for more rows than were actually parsed.
    printUnivList(uinfo, min(30, len(uinfo)))


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
以上是关于第一次爬虫的主要内容,如果未能解决你的问题,请参考以下文章
Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-),(http://tieba.baidu.com/p/2166231880)(代码片段