
Answers

14 votes
from BeautifulSoup import BeautifulSoup 

soup = BeautifulSoup(''' 
<html> 
    <head><title>Testing</title></head> 
    <body> 
    <a href="http://foo.com/">foo</a> 
    <a href="http://bar.com/bar">Bar</a> 
    </body> 
</html>''') 

for link in soup.findAll('a'): # find all links 
    link['href'] = link['href'] + '?foo' 

print soup 

This prints:

<html> 
<head><title>Testing</title></head> 
<body> 
<a href="http://foo.com/?foo">foo</a> 
<a href="http://bar.com/bar?foo">Bar</a> 
</body> 
</html> 

The documentation also has some examples of changing attributes. It is a comprehensive tutorial that covers all the common aspects of BeautifulSoup. I don't know what is missing from the docs; maybe you should clarify your question.
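
For reference, a minimal sketch of the other common attribute operations (adding a new attribute and deleting an existing one); the rel value here is just illustrative:

from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<a href="http://foo.com/">foo</a>')
link = soup.find('a')

link['rel'] = 'nofollow'  # add a new attribute
del link['href']          # delete an existing attribute

print soup

This prints <a rel="nofollow">foo</a>.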

1 vote

My example:

HEADERS = {"User-Agent" : "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5", 
     "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
     "Accept-Language" : "ru,en-us;q=0.7,en;q=0.3", 
     "Accept-Charset" : "windows-1251,utf-8;q=0.7,*;q=0.7", 
     "Accept-Encoding" : "identity, *;q=0", 
     "Connection" : "Keep-Alive"} 
PROXY=None 
timeout=60 


def parse_manuf_page_about(page_str_about): 
slovar={} 
global timeout 
socket.setdefaulttimeout(timeout) 
if PROXY is not None: 
     proxy_handler = urllib2.ProxyHandler({ "http": "http://"+PROXY+"/" }) 
     opener = urllib2.build_opener(proxy_handler) 
     urllib2.install_opener(opener) 
page_request = urllib2.Request(url=page_str_about, headers=HEADERS) 
try: 
    #print "Page reading ... %s" %page_str 
    page_zapr = urllib2.urlopen(url=page_request) 
    page=page_zapr.read() 
except Exception ,error: 
    print str(error) 
    res=False 
    return res,slovar 
soup = BeautifulSoup(page) 
select_pod=soup.findAll('div', {"class":"win aboutUs"}) 

promeg= select_pod[0].findAll("p")[0] 
zerro_br= promeg.findAll(text=True) 
Company_Info=" ".join(zerro_br).strip(" \t\n") 
select =soup.findAll('div', {"class":"win"}) 
cells_tabl= select[0].findAll("tr") 

for yach in cells_tabl: 
    text_zag=yach.findAll("th") 
    for zn_yach in text_zag: 
     if len(zn_yach)>0: 
      txt_zn_yach="".join(zn_yach.findAll(text=True)).strip(" \t\n") 
     else: 
      txt_zn_yach= zn_yach.contents[0].strip(" \t\n") 
      #print txt_zn_yach 
    text_znach_td=yach.findAll("td") 
    for zn_yach_td in text_znach_td: 
     if len(zn_yach_td)>0: 
      txt_zn_yach_td="".join(zn_yach_td.findAll(text=True)).strip(" \t\n") 
     else: 
      txt_zn_yach_td= zn_yach.contents[0].strip(" \t\n") 
      #print txt_zn_yach_td 
    # Делаем замены неугодных символов/Replase browsers char 
    if "&nbsp" in txt_zn_yach_td: 
     while txt_zn_yach_td.find("nbsp;")>0: 
      pos_gavna=txt_zn_yach_td.find("&nbsp;") 
      txt_zn_yach_td=txt_zn_yach_td[:pos_gavna]+txt_zn_yach_td[pos_gavna+6:] 
    if "&quot" in txt_zn_yach_td: 
     while txt_zn_yach_td.find("quot;")>0: 
      pos_gavna=txt_zn_yach_td.find("&quot;") 
      txt_zn_yach_td=txt_zn_yach_td[:pos_gavna]+'"'+txt_zn_yach_td[pos_gavna+6:] 
    if "&amp;" in txt_zn_yach_td: 
     while txt_zn_yach_td.find("&amp;")>0: 
      pos_gavna=txt_zn_yach_td.find("&amp;") 
      txt_zn_yach_td=txt_zn_yach_td[:pos_gavna]+'&'+txt_zn_yach_td[pos_gavna+6:] 
    slovar[str(txt_zn_yach)]=txt_zn_yach_td 
    slovar["Company_Info"]=Company_Info 
# разбираем нижнюю таблицу с контактом и вытаскиваем оттуда имя контакта | get name contacts 
select_contact=soup.findAll('a', {"class":"member-name"}) 
for contact_person in select_contact: 
    slovar["Contact_Person"]= contact_person.contents[0] 
# получаем статус голд партнера по наличию таблички в левом верхнем углу | get Gold status 
select_gold_part=soup.findAll('a', {"class":"memberLogo"}) 
if len(select_gold_part)==0: 
    slovar["Gold member"]="N" 
else: 
    slovar["Gold member"]="Y" 
res=True 
return res,slovar 
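
A side note on the manual entity replacement above: BeautifulSoup 3 can decode HTML entities itself via its convertEntities argument, which would make the hand-rolled &quot;/&amp; handling unnecessary (though &nbsp; then comes back as the non-breaking-space character u'\xa0' rather than being dropped):

soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)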

This code parses a manufacturer's page on Alibaba.com. The page it works against: http://xmxinhuafeng.en.alibaba.com/aboutus.html
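
A quick usage sketch, assuming the function above is in scope (it returns a success flag plus the dictionary of scraped fields):

ok, info = parse_manuf_page_about("http://xmxinhuafeng.en.alibaba.com/aboutus.html")
if ok:
    for key, value in info.items():
        print "%s: %s" % (key, value)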


Is Keep-Alive really keeping the connection alive? –