How to get web addresses from Google search results using Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
"""Fetch a Google results page and list the target URLs it links to."""

import re  # not used below; retained from the script's import list
import urllib  # not used directly; retained from the script's import list

try:  # Python 3
    from urllib.parse import parse_qs, quote_plus, urlparse
except ImportError:  # Python 2
    from urllib import quote_plus
    from urlparse import parse_qs, urlparse

import mechanize
from bs4 import BeautifulSoup

_SEARCH_URL = "http://www.google.com.sg/search?num=100&q="


def _target_from_href(href):
    """Return the value of the ``q`` query parameter of *href*, or None."""
    values = parse_qs(urlparse(href).query).get("q")
    return values[0] if values else None


def getGoogleLinks(link, depth):
    """Return the result URLs found on one Google search page.

    Args:
        link: The search term. Spaces are sent as ``+``; other reserved
            characters are percent-encoded.
        depth: Value sent as the ``start`` query parameter (presumably the
            result offset -- confirm against Google's URL scheme). May be
            a string or an int.

    Returns:
        A list of the ``q=`` targets of the first link in each ``<li>``
        inside ``<div id="search">``, keeping only those that start with
        ``http``. Empty if the page has no such ``<div>``.

    Raises:
        Whatever ``mechanize.Browser.open`` raises on a network or HTTP
        failure; those errors are not caught here.
    """
    browser = mechanize.Browser()
    # The method is ``set_handle_robots``; ``set_handler_robots`` does not
    # exist and would raise AttributeError.
    browser.set_handle_robots(False)
    # Each header must be a (name, value) tuple.
    browser.addheaders = [("User-agent", "chrome")]

    # ``safe="+"`` keeps a caller-supplied "+" as a word separator, exactly
    # as the plain space -> "+" replacement used to, while still encoding
    # characters such as "&" and "#" that would corrupt the query string.
    term = quote_plus(link, safe="+")
    query = _SEARCH_URL + term + "&start=" + str(depth)
    html = browser.open(query).read()

    # NOTE(review): the selectors below assume results are <li> elements
    # under <div id="search">; verify against the markup Google serves now.
    soup = BeautifulSoup(html, "html.parser")
    search_div = soup.find("div", attrs={"id": "search"})
    if search_div is None:
        return []

    results = []
    for item in search_div.find_all("li"):
        anchor = item.find("a", href=True)
        if anchor is None:
            continue  # list item without a link: nothing to extract
        # Read the ``q`` parameter from the href itself rather than
        # regex-matching the whole tag, so a "q" in the link text cannot
        # confuse the extraction.
        target = _target_from_href(anchor["href"])
        if target and target.startswith("http"):
            results.append(target)
    return results


if __name__ == "__main__":
    print(getGoogleLinks('cars', '100'))
|