本文共 3033 字,大约阅读时间需要 10 分钟。
全部原创,截至 2021.1.29 可用。数据直接写入数据库,看代码即可了解数据表的设计方式。有帮助的话点个赞,谢谢。
# Scrape ten hotel cards from a Ctrip hotel-list page with Selenium and append
# them to the MySQL table `cityhotel` via pandas + SQLAlchemy.
from time import sleep

import pandas as pd
import xlwt  # Excel operations (kept from the original script; unused below)
from selenium import webdriver
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine

# Search-result page to scrape. The `display` parameter URL-decodes to
# "东方, 海南, 中国", which is why the address column below is fixed to "东方".
SEARCH_URL = "https://hotels.ctrip.com/hotels/list?countryId=1&city=48&checkin=2021/02/22&checkout=2021/02/23&optionId=48&optionType=City&directSearch=0&display=%E4%B8%9C%E6%96%B9%2C%20%E6%B5%B7%E5%8D%97%2C%20%E4%B8%AD%E5%9B%BD&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&"

# Column layout of the resulting frame (and therefore of the SQL table).
COLUMNS = ['hotel', 'price', 'address', 'score', 'number', 'image']

# XPath of the n-th hotel card; the per-field suffixes below are appended to it.
CARD_XPATH = "//*[@id='ibu_hotel_container']/div/section/div[2]/ul/li[{index}]/div/div/div"
NAME_XPATH = "/div[1]/div[2]/div[1]/div/span[1]"
PRICE_XPATH = "/div[2]/div[2]/div[1]/p/span"
SCORE_XPATH = "/div[2]/div[1]/div/div[2]/span"
REVIEWS_XPATH = "/div[2]/div[1]/div/div[1]/p[2]/a"
IMAGE_XPATH = "/div[1]/div[1]/div/div/div"


def _image_url_from_style(style):
    """Build an https URL from an inline ``style`` attribute value.

    Strips trailing/leading ``"``, ``)`` and ``;`` characters, then takes
    whatever follows the last ``//`` and prefixes it with ``https://``.
    """
    return "https://" + style.strip('");').split("//")[-1]


def _scrape_card(browser, index):
    """Return one row dict (keys match COLUMNS) for the card at ``li[index]``.

    Raises whatever Selenium raises when one of the elements cannot be
    located within the implicit-wait timeout.
    """
    card = CARD_XPATH.format(index=index)

    # Locate every element first, then read them, as the original script did.
    name_el = browser.find_element(By.XPATH, card + NAME_XPATH)
    price_el = browser.find_element(By.XPATH, card + PRICE_XPATH)
    score_el = browser.find_element(By.XPATH, card + SCORE_XPATH)
    reviews_el = browser.find_element(By.XPATH, card + REVIEWS_XPATH)
    image_el = browser.find_element(By.XPATH, card + IMAGE_XPATH)

    hotel_name = name_el.get_attribute("textContent")
    hotel_name = hotel_name.replace('\n', '').replace('\t', '')

    return {
        'hotel': hotel_name,
        'price': price_el.get_attribute("textContent"),
        # Address scraping is disabled; every row gets the searched city.
        'address': "东方",
        'score': score_el.get_attribute("textContent"),
        'number': reviews_el.get_attribute("textContent"),
        'image': _image_url_from_style(image_el.get_attribute("style")),
    }


driver = webdriver.Chrome()
try:
    # Open the page.
    driver.get(SEARCH_URL)
    driver.maximize_window()

    # NOTE: implicitly_wait() does not pause the script; it only sets how long
    # each element lookup keeps polling before it fails. The original author
    # intended this window for closing the login dialog and scrolling down
    # until the blue "view more" button appears.
    driver.implicitly_wait(30)

    rows = []
    # Cards li[4] .. li[13] (ten entries).
    # NOTE(review): presumably the first three <li> items are not hotel
    # cards - confirm against the live page.
    for j in range(4, 14):
        row = _scrape_card(driver, j)
        sleep(3)  # pause between cards, as in the original
        rows.append(row)
finally:
    # Always release the browser and chromedriver process.
    driver.quit()

# Build the frame once; DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
newData = pd.DataFrame(rows, columns=COLUMNS)

data = newData
# NOTE(review): database credentials are hard-coded in the connection URL.
engine = create_engine('mysql+pymysql://root:root@localhost:3306/myblog')
data.to_sql('cityhotel', engine, if_exists='append')
print("爬取完毕!")
转载地址:http://yilzi.baihongyu.com/