محب علوی
مدیر
اس دھاگے میں ہم ایک سادہ سا web crawler بنائیں گے جس کا کوڈ اور ویڈیوز Udacity کے کورس میں تفصیلا دیکھی جا سکتی ہیں۔
Introduction to Computer Science
Introduction to Computer Science
import urllib.request
def get_next_target(page):
    """Find the first '<a href=' link in *page*.

    Returns a tuple (url, end_quote_index) for the first link found,
    or (None, 0) when there is no (well-formed) link tag left.
    """
    start_link = page.find('<a href=')  # start of link tag
    if start_link == -1:  # no link tag at all
        return None, 0
    start_quote = page.find('"', start_link)  # opening quote of the target
    if start_quote == -1:
        # malformed tag with no quoted target; the original sliced with -1
        # indices here, returning garbage and risking an endless caller loop
        return None, 0
    end_quote = page.find('"', start_quote + 1)  # closing quote of the target
    if end_quote == -1:
        return None, 0
    url = page[start_quote + 1:end_quote]  # text between the two quotes
    return url, end_quote
def get_all_links(page):
    """Collect every link target in *page*, in document order."""
    found = []
    while True:
        target, after = get_next_target(page)
        if not target:
            # no (further) link found -- we are done
            break
        found.append(target)
        # keep scanning from just past the link we found
        page = page[after:]
    return found
# Fetch the UrduWeb front page and print every link target found on it.
# read() returns bytes; str() turns it into searchable text for find().
page = str(urllib.request.urlopen('http://www.urduweb.org').read())
pagelist = get_all_links(page)
print(pagelist)
start with tocrawl = [seed]
crawled = []
while there are more pages tocrawl:
pick a page from tocrawl
add that page to crawled
add all the link targets on this page to tocrawl
return crawled
def crawl2(seed):
    """Breadth-first crawl starting from the *seed* URL.

    Fetches each queued page, records it in *crawled*, queues every link
    found by get_all_links(), and returns the list of crawled URLs.

    Fixes over the original: URLs already crawled are skipped, so link
    cycles no longer loop forever (the thread notes the code never stops);
    the import is hoisted out of the loop; the bare except is narrowed.
    The 50-iteration safety cap and the "an error here" message are kept.
    """
    import urllib.request  # hoisted: the original re-imported per iteration

    crawled = []
    tocrawl = [seed]
    steps = 0
    while tocrawl:
        steps += 1
        if steps == 50:  # safety valve: never take more than 50 steps
            break
        url = tocrawl.pop(0)
        if url in crawled:
            continue  # already visited -- avoid endless cycles
        crawled.append(url)
        print(url)
        try:
            page = str(urllib.request.urlopen(url).read())
            tocrawl.extend(get_all_links(page))
        except Exception:
            # fetch or parse failed; report it and move on to the next URL
            print("an error here")
    print(crawled)
    return crawled
ری یوزیبلٹی کا فائدہ اٹھاتے ہوئے:
Python:def crawl2(seed): ct=0 crawled=[] tocr=[] tocr.append(seed) while tocr: ct=ct+1 if ct==50: #goes to infinity otherwise break crawled.append(tocr[0]) print (tocr[0]) import urllib.request try: response = urllib.request.urlopen(tocr[0]) page = str(response.read()) pagelist = get_all_links(page) tocr.extend(pagelist) except: print("an error here") del tocr[0] print(crawled) return crawled
crawl2('http://www.udacity.com/cs101x/index.html')
http://www.udacity.com/cs101x/index.html
an error here
['http://www.udacity.com/cs101x/index.html']
ری یوزیبلٹی کا فائدہ اٹھاتے ہوئے:
Python:def crawl2(seed): ct=0 crawled=[] tocr=[] tocr.append(seed) while tocr: ct=ct+1 if ct==50: #goes to infinity otherwise break crawled.append(tocr[0]) print (tocr[0]) import urllib.request try: response = urllib.request.urlopen(tocr[0]) page = str(response.read()) pagelist = get_all_links(page) tocr.extend(pagelist) except: print("an error here") del tocr[0] print(crawled) return crawled
ct=ct+1
if ct==50: #goes to infinity otherwise
break
import urllib.request
try:
response = urllib.request.urlopen(tocr[0])
page = str(response.read())
pagelist = get_all_links(page)
tocr.extend(pagelist)
except:
print("an error here")
def get_page(url):
    """Fetch *url* and return the raw response body, or "" on any failure.

    Uses urllib.request (Python 3). The original called the Python-2-only
    urllib.urlopen, which raises AttributeError on Python 3 -- an error the
    bare except then silently swallowed, so every fetch returned "".
    """
    try:
        import urllib.request
        return urllib.request.urlopen(url).read()
    except Exception:
        # best-effort helper: a bad URL or network error yields an
        # empty page rather than an exception
        return ""
get_all_links(get_page(page))
نمرہ: «کوڈ کام نہیں کر رہا» — ویسے میرے پاس ابھی بھی کام کر رہا ہے۔
میں نے سیڈ کے طور پر مندرجہ بالا لنک دیا
کوڈ:crawl2('http://www.udacity.com/cs101x/index.html')
جواب میں یہ ایرر آ رہی ہے۔
کوڈ:http://www.udacity.com/cs101x/index.html an error here ['http://www.udacity.com/cs101x/index.html']
crawl2('http://www.udacity.com/cs101x/index.html')
http://www.udacity.com/cs101x/index.html
http://www.udacity.com/cs101x/crawling.html
http://www.udacity.com/cs101x/walking.html
http://www.udacity.com/cs101x/flying.html
http://www.udacity.com/cs101x/kicking.html
http://www.udacity.com/cs101x/index.html
http://www.udacity.com/cs101x/crawling.html
http://www.udacity.com/cs101x/walking.html
http://www.udacity.com/cs101x/flying.html
http://www.udacity.com/cs101x/kicking.html
یہ ضرور ہے کہ یہ کوڈ رکتا نہیں ہے، چلتا رہتا ہے۔
ایک سوال ہے ،اگر اسے وائل لوپ کے بجائے recursion سے کیا جائے تو کیا اس کی رفتار کم ہو جائے گی؟