In this assignment, I need to use sockets to write Firefaux, a toy web browser program. All the toy browser needs to do is download a web page consisting of some HTML text and a single inline image object, and store the base page and the inline image in two files in the local directory, named respectively the same as the base HTML file and the image file on the remote machine.
It's for the Python language, using a UDP connection.
Expert Answer
# Run the code as: python download_page.py [url] [output folder path]
# Create a folder named test under / to use as the output folder
# Example: python download_page.py http://www.example.com /test
import random
import string
import sys
import urllib2
import os
import re
from urlparse import urlparse
def page_loader(url_name, dir_name='imgs'):
    """Download a web page and its inline images to the local disk.

    The page at ``url_name`` is fetched, every ``<img ... src=...>`` it
    references is downloaded into ``dir_name``, the ``src`` attributes are
    rewritten to point at the local copies, and the rewritten HTML is saved
    next to this script under a random ten-letter ``.html`` name.

    Args:
        url_name: Absolute URL of the page to download.
        dir_name: Folder that receives the images. A relative name is
            resolved against the directory containing this script; the
            folder is created when it does not exist.

    Returns:
        The path of the saved HTML file (the name is random, so callers
        need it to locate the result).

    Raises:
        urllib2.URLError: If the page or one of its images cannot be fetched.
        ValueError: If a URL has no recognisable scheme.
        IOError: If an output file cannot be written.
    """
    # Function-scope imports: the module header does not provide these names.
    from contextlib import closing
    from urlparse import urljoin

    # Matches the src attribute of an <img> tag with either quote style.
    # group(1) = text up to the opening quote, group(2) = the quote
    # character, group(3) = the src value.  ``\s`` before ``src`` keeps
    # attributes such as ``data-src`` from matching.
    img_src = re.compile(r'(<img\b[^>]*?\ssrc\s*=\s*)(["\'])(.*?)\2',
                         re.IGNORECASE)

    base_dir = os.path.dirname(os.path.realpath(__file__))
    dir_to_save = os.path.join(base_dir, dir_name)

    # Python 2 responses are not context managers, hence closing().
    with closing(urllib2.urlopen(url_name)) as response:
        target_page = response.read()
        # geturl() reflects redirects, so it is the right base for
        # resolving relative image references.
        final_url = response.geturl()

    if not os.path.exists(dir_to_save):
        os.makedirs(dir_to_save)

    # Maps each distinct src value to the local file it was saved as.
    saved_images = {}
    for match in img_src.finditer(target_page):
        src = match.group(3)
        if not src or src in saved_images:
            continue  # nothing to fetch, or already downloaded
        if src.lower().startswith('data:'):
            continue  # inline data URI: the image is already in the page

        # urljoin handles absolute, host-relative ("/x.png"),
        # protocol-relative ("//cdn/x.png") and page-relative ("x.png")
        # references uniformly.
        image_url = urljoin(final_url, src)
        image_name = urlparse(image_url).path.split('/')[-1]
        if image_name in ('', '.', '..'):
            continue  # no usable file name in the URL path

        # NOTE: images that share a base name overwrite one another.
        image_path = os.path.join(dir_to_save, image_name)
        with closing(urllib2.urlopen(image_url)) as image_response:
            image_data = image_response.read()
        # Binary mode: image bytes must be written untranslated.
        with open(image_path, 'wb') as new_image:
            new_image.write(image_data)
        saved_images[src] = image_path

    def _point_to_local_copy(match):
        """Swap a downloaded image's src for its local path."""
        local_path = saved_images.get(match.group(3))
        if local_path is None:
            return match.group(0)
        return match.group(1) + match.group(2) + local_path + match.group(2)

    # A replacement *function* is used so that neither the URL nor the
    # local path is ever interpreted as regex syntax or escape sequences.
    target_page = img_src.sub(_point_to_local_copy, target_page)

    new_file_name = '%s.html' % ''.join(
        random.choice(string.ascii_letters) for _ in range(10))
    new_file_path = os.path.join(base_dir, new_file_name)
    # Binary mode keeps the downloaded page bytes exactly as received.
    with open(new_file_path, 'wb') as new_file:
        new_file.write(target_page)
    return new_file_path
if __name__ == '__main__':
    # Command line: download_page.py <url> [output folder]
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('Usage: python %s <url> [output folder]' % sys.argv[0])
    target_url = sys.argv[1]
    if len(sys.argv) > 2:
        # Destination folder for the images given as the 2nd argument.
        dir_name = sys.argv[2]
        page_loader(target_url, dir_name)
    else:
        # No destination folder given: use page_loader's default.
        page_loader(target_url)