Table of Contents
  1. Installing Third-Party Libraries on Windows
    1. Installing lxml with pip
    2. Installing BeautifulSoup4 with pip
    3. Installing Requests with pip
  2. A Simple Crawler Example


Installing Third-Party Libraries on Windows

First, check whether pip is already installed. At the Windows command prompt (cmd), enter:

pip --version

If pip is installed, it will report something like:

pip 8.1.1 from d:\python 3.5\lib\site-packages (python 3.5)
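
If instead cmd reports that pip is not recognized, one common fix is to bootstrap it from the standard library (ensurepip ships with Python 3.4 and later):

python -m ensurepip --upgrade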

Installing lxml with pip

1. Install wheel:

python -m pip install wheel

2. Download lxml
Download the lxml wheel that matches your environment: in the filename, cp corresponds to the Python version, win32 to a 32-bit system, and win_amd64 to a 64-bit system.
3. Install lxml
cd to the directory where the wheel was downloaded; xxxx stands for the name of the downloaded file:

python -m pip install xxxx.whl
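
For example, on 64-bit Windows with Python 3.5 the install might look like this (the path and wheel filename below are illustrative; yours depend on where you saved the file and which lxml version you downloaded):

cd C:\Users\yourname\Downloads
python -m pip install lxml-3.6.0-cp35-cp35m-win_amd64.whl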

Installing BeautifulSoup4 with pip

pip3 install beautifulsoup4
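
A quick way to confirm the install worked is to parse a small HTML string in the interpreter (a minimal sanity check, not part of the original walkthrough):

from bs4 import BeautifulSoup

# Parse a tiny snippet with the lxml parser installed above
soup = BeautifulSoup('<p class="hello">Hello, soup</p>', 'lxml')
print(soup.select('p.hello')[0].get_text())  # prints: Hello, soup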

Installing Requests with pip

pip3 install requests
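
Requests can be checked the same way with a one-off request (the URL here is just an example):

import requests

# Fetch a page and confirm we got an HTTP 200 response
resp = requests.get('https://cn.tripadvisor.com/')
print(resp.status_code)  # 200 means the request succeeded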

A Simple Crawler Example

The script below scrapes TripAdvisor attraction listings: get_attractions pulls each attraction's title, thumbnail, and category tags from the listing pages, while get_saves reads a logged-in user's Saves page, which is why it sends the Cookie in headers.

from bs4 import BeautifulSoup
import requests
import time

url = 'https://cn.tripadvisor.com/Attractions-g186338-Activities-c47-London_England.html'
url_saves = 'https://cn.tripadvisor.com/Saves#43102599'
urls = ['https://cn.tripadvisor.com/Attractions-g186338-Activities-c47-oa{}-London_England.html#ATTRACTION_LIST'.format(str(i)) for i in range(30, 930, 30)]
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
'Cookie':'TAUnique=%1%enc%3AN55Axsh%2BTw%2Bg4qIKwsrvWeWzaJiU9mRkiWjZmzTMePaKIKHywc0IBA%3D%3D; TAPD=cn.tripadvisor.com; ServerPool=C; TASSK=enc%3AFgyAAVqAoGal7KePZYR1j0urighcsUCXtcCdwe0QcVhcdJapW0vcu2sNDHZel5fIbamtcZtdeWU%3D; SecureLogin2=3.4%3AADIfYqCmm2zJrNy8LbfGZuA7W4pf2euqZl2wITvSUKcstRyvDVwKShQcbeHyHuHTfvynnczOLNvrW81l3uE%2BnU%2B2r9eC9Q5hhXwS9VEm%2FfOHVv8NChWqxlg2vq5FBeTuBUlhrYw%2FvrK%2Bu0KOPnTeIzNd04aRTU%2FzA0u%2FLvGF289ww2Wzd8fNMWUw8Zbag7JtjVhO76AkPl1gcResuIDVBtY%3D; TAAuth2=%1%3%3A66ed9e66e325eab5f3db106234db8d5b%3AAJh0Asxnnis4vnH13%2BHftZilgXAbyew1snwOnmtBqwmS2XMllAvJyLF9Ok9MxFl8DE79iFpaziSUK%2FHq5RW5%2BhUlrqSdTVw18RpCuqdTOSjM2kzk32aVgv9f%2FpntTABvzfsq8XBXNcRG5NfLXzCPH6mV%2FWq88%2Feao1Dsy%2BIuezCEspOG8P6k0KYGJm7%2FDVnqDFQv7BAXLwtFtkxejG8Q3g4%3D; TART=%1%enc%3AoOKiCsLK71lPz2oVgBx0GCCmEOY7uin65dldw%2B8SULjHRMddMcAB6XFsVVztyuZOcT99K%2FLsp0I%3D; TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RVL.187547_100*RS.1; TAReturnTo=%1%%2FAttraction_Review-g186338-d187547-Reviews-Tower_of_London-London_England.html; CM=%1%HanaPersist%2C%2C-1%7Cpu_vr2%2C%2C-1%7Ct4b-pc%2C%2C-1%7CHanaSession%2C%2C-1%7CRCPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7Cpu_vr1%2C%2C-1%7CFtrPers%2C%2C-1%7CHomeASess%2C%2C-1%7CAWPUPers%2C%2C-1%7Ccatchsess%2C3%2C-1%7Cbrandsess%2C%2C-1%7CCpmPopunder_1%2C%2C-1%7CCCSess%2C%2C-1%7CCpmPopunder_2%2C%2C-1%7Csesssticker%2C%2C-1%7C%24%2C%2C-1%7Ct4b-sc%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7Csess_rev%2C2%2C-1%7Csessamex%2C%2C-1%7CSaveFtrPers%2C%2C-1%7Cpers_rev%2C%2C-1%7CMetaFtrSess%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CFtrSess%2C%2C-1%7CHomeAPers%2C%2C-1%7C+r_lf_1%2C%2C-1%7CRCSess%2C%2C-1%7C+r_lf_2%2C%2C-1%7Ccatchpers%2C3%2C1460783131%7CAWPUSess%2C%2C-1%7Cvr_npu2%2C%2C-1%7Csh%2C%2C-1%7CLastPopunderId%2C104-771-null%2C-1%7Cpssamex%2C%2C-1%7C2016sticksess%2C%2C-1%7Cvr_npu1%2C%2C-1%7CCCPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Cbrandpers%2C%2C-1%7C2016stickpers%2C%2C-1%7CWarPopunder_Session%2C%2C-1%7CWarPopunder_Persist%2C%2C-1%7Cr_ta_2%2C%2C-1%7Cr_ta_1%2C%2C-1%7CSaveFtrSess%2C%2C-1%7CRBASess%2C%2C-1%7Cperssticker%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; roybatty=AMCgJ%2FgMJQKIwDmg%2BWefWtRtBNem4Wh%2BmcyyLQUm5BuamjmxbpG2pWqccB5XGf7mwJeIEOXVu%2B3l1apZh3Asyu%2Fe1PC9LtxvrIxY6LCko%2FAcqA4E5pNsFP1FThNnDoMeyg%3D%3D%2C1; TASession=%1%V2ID.5C00B76BC6B38571367B4AC920F03307*SQ.21*PR.427%7C*LS.PerformancePingback*GR.13*TCPAR.13*TBR.7*EXEX.37*ABTR.19*PPRP.9*PHTB.51*FS.78*CPU.84*HS.popularity*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*TS.6669A44B5EE21C0CBBFE07CA39A63BB2*LF.zhCN*FA.1*DF.0*LP.%2FAttractions-g186338-Activities-c47-London_England%5C.html*IR.1*OD.null*FBH.2*MS.-1*RMS.-1*FLO.186338*TRA.true*LD.187547; TAUD=LA-1460173535040-1*LG-4847653-2.1.F*LD-4847655-.....; NPID='
}

def get_attractions(url, data=None):
    # Pause between requests to avoid hammering the server
    time.sleep(10)
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # CSS selectors for each attraction's title, thumbnail, and category tags
    titles = soup.select('div.property_title > a[target="_blank"]')
    imgs = soup.select('img[width="160"]')
    cates = soup.select('div.p13n_reasoning_v2')

    for title, img, cate in zip(titles, imgs, cates):
        data = {
            'title': title.get_text(),
            'img': img.get('src'),
            'cate': list(cate.stripped_strings)
        }
        print(data)



def get_saves(url, data=None):
    # The Saves page requires a logged-in session, so send the cookies in headers
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select('a.location-name')
    imgs = soup.select('img.photo_image')
    metas = soup.select('span.format_address')

    if data is None:
        for title, img, meta in zip(titles, imgs, metas):
            data = {
                'title': title.get_text(),
                'img': img.get('src'),
                'meta': list(meta.stripped_strings)
            }
            print(data)

for single_url in urls:
    get_attractions(single_url)
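
Rather than printing each result, you might collect them and dump everything to a file at the end. A minimal sketch of that variation (get_attractions_to_list and attractions.json are hypothetical names introduced here, not part of the original script):

import json

results = []

def get_attractions_to_list(url):
    # Same parsing as get_attractions, but append to a list instead of printing
    time.sleep(10)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    titles = soup.select('div.property_title > a[target="_blank"]')
    imgs = soup.select('img[width="160"]')
    cates = soup.select('div.p13n_reasoning_v2')
    for title, img, cate in zip(titles, imgs, cates):
        results.append({
            'title': title.get_text(),
            'img': img.get('src'),
            'cate': list(cate.stripped_strings)
        })

for single_url in urls:
    get_attractions_to_list(single_url)

# Write the collected records to disk as UTF-8 JSON
with open('attractions.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)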