from __future__ import print_function
import requests, os, bs4
def _save_comic_image(image_src, save_dir):
    """Download a single comic image and write it into *save_dir*.

    Args:
        image_src: The raw ``src`` attribute of the comic ``<img>`` element.
            Only its last path component (the file name) is used; the image
            is always fetched from ``http://imgs.xkcd.com/comics/``.
        save_dir: Existing directory the image file is written to.

    An HTTP error for the image is reported and the image is skipped, so a
    single bad image neither aborts the crawl nor leaves an error page saved
    under an image file name.
    """
    print('Downloading image %s...' % ('http:' + image_src))
    file_name = image_src.rsplit('/', 1)[-1]
    image_url = 'http://imgs.xkcd.com/comics/' + file_name  # Full URL of the image
    res = requests.get(image_url)
    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print('Could not download image %s: %s' % (image_url, err))
        return
    # 'with' guarantees the file is closed even if a write fails midway.
    with open(os.path.join(save_dir, file_name), 'wb') as image_file:
        for chunk in res.iter_content(100000):  # Stream the picture to disk
            image_file.write(chunk)


def main():
    """Download xkcd comic images, walking from the front page backwards.

    Starting at ``http://xkcd.com``, each page is fetched and parsed, the
    image inside the element with id ``comic`` is saved into ``e:/url``, and
    the page's ``rel="prev"`` link is followed. The loop stops when the next
    URL ends with '#' (which, per the notes at the bottom of this file, marks
    that there is no earlier page) or when a page has no usable "prev" link.

    Raises:
        OSError: If the target folder does not exist and cannot be created.
        requests.exceptions.HTTPError: If a comic *page* request fails.
    """
    save_dir = u'e:/url'  # Change the folder name to store the images elsewhere
    try:
        os.makedirs(save_dir)
    except OSError:
        # An already-existing folder is fine; anything else is a real error.
        if not os.path.isdir(save_dir):
            raise

    url = 'http://xkcd.com'
    while not url.endswith('#'):
        print('Downloading page %s...' % url)
        res = requests.get(url)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, 'html.parser')

        # All <img> elements nested inside the element whose id is "comic".
        comic_elems = soup.select('#comic img')
        image_src = comic_elems[0].get('src') if comic_elems else None
        if image_src:
            _save_comic_image(image_src, save_dir)
        else:
            print('Could not find comic img')

        # Move on to the previous page, e.g.
        # <a rel="prev" href="/1786/" accesskey="p">< Prev</a>
        prev_links = soup.select('a[rel="prev"]')
        prev_href = prev_links[0].get('href') if prev_links else None
        if not prev_href:
            # Without a "prev" link there is nowhere left to go.
            break
        url = 'http://xkcd.com' + prev_href
    print('Done.')
# Start the download only when this file is executed as a script, so that
# importing the module does not trigger network and disk activity.
if __name__ == '__main__':
    main()
'''
---Note 1---
# The '#' sign appears at the end of the URL when there is no earlier page to move to:
Current page:
http://xkcd.com/2/
Previous page:
http://xkcd.com/1/#
---Note 2---
In the web page, the div element with the ID attribute 'comic' contains the img element:
<img src="//imgs.xkcd.com/comics/barrel_cropped_(1).jpg" title="Don't we all." alt="Barrel - Part 1">
<div id="comic">
<img src="//imgs.xkcd.com/comics/voice_commands.png" title="Dvorak words may sound hard to pronounce, but studies show they actually put less stress on the vocal chords." alt="Voice Commands" srcset="//imgs.xkcd.com/comics/voice_commands_2x.png 2x">
</div>
'''