0

大众点评爬虫教程

A comprehensive web scraping tutorial for Dianping (大众点评) using Python, Selenium, and MySQL. Learn how to extract restaurant data, reviews, and ratings from China's popular review platform.

安装环境（pip install selenium pandas sqlalchemy pymysql openpyxl）并导入依赖

import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine

Step 1: 对浏览器进行设置

# Build the Chrome launch options.
chrome_options = Options()
for chrome_flag in (
    '--incognito',  # run the session in incognito mode
    "--disable-blink-features=AutomationControlled",  # switch off the AutomationControlled Blink feature
):
    chrome_options.add_argument(chrome_flag)

# JavaScript injected into every new document so that `navigator.webdriver`
# reads as `undefined`.
script = (
    "\n"
    "Object.defineProperty(navigator, 'webdriver', {\n"
    "    get: () => undefined\n"
    "})\n"
)

# Start Chrome through the local chromedriver binary.
service = Service('/Users/skg/Desktop/test/chromedriver')
driver = webdriver.Chrome(options=chrome_options, service=service)

# Register the script via the DevTools protocol, then maximize the window.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": script},
)
driver.maximize_window()

Step 2: 设置定位xpath信息

# ---- Shop listing page ----
# "Next page" link of the listing pager, and the title link of each listed shop.
next_page_xp = "//div[@class='page']//a[@title='下一页']"
shop_lis_xp = "//div[@id='shop-all-list']//ul/li//div[@class='tit']/a"

# ---- Shop header fields ----
# Expressions containing '|' are XPath unions: either alternative may match.
shop_title_xp = "//h1[@class='shop-name']"
shop_score_xp = "//div[contains(@class,'score score')]"
shop_reviewcount_xp = (
    "//span[@id='reviewCount']"
    "|//span[@class='reviews']"
)
shop_avgpricetitle_xp = (
    "//span[@id='avgPriceTitle']"
    "|//span[@class='price']"
)
shop_kouwei_xp = "//span[contains(text(),'口味')]"      # taste
shop_huanjing_xp = "//span[contains(text(),'环境')]"    # environment
shop_fuwu_xp = "//span[contains(text(),'服务')]"        # service
shop_adress_xp = (
    "//span[@id='address']"
    "|//div[contains(text(),'地址')]"
)
shop_tel_xp = (
    "//p[@class='expand-info tel']"
    "|//div[contains(text(),'电话')]"
)

# ---- Review list ----
# One <li> per review; the expressions starting with './/' are relative and
# are meant to be evaluated against a single review element.
reviews_block_xp = "//div[@class='reviews-items']/ul/li"
reviews_zhankai_xp = (
    ".//a[@class='fold' and contains(@data-click-name,'展开评价')]"
)
reviews_body_xp = ".//div[@class='review-words']"
reviews_time_xp = ".//span[@class='time']"
reviews_nextpage_xp = "//div[@class='reviews-pages']//a[@class='NextPage']"

Step 3: 参数设置

# Open the login page so the operator can sign in manually in the browser.
driver.get("https://account.dianping.com/pclogin")
# Wait for the operator to confirm the login. Without this pause the next
# navigation replaces the login page immediately, leaving no time to sign in.
try:
    input("Log in to Dianping in the browser window, then press Enter to continue... ")
except EOFError:
    # No interactive stdin is available: fall back to a fixed grace period.
    time.sleep(60)

# Open the category listing page whose shops will be crawled.
driver.get("https://www.dianping.com/shanghai/ch10/g110")

# Number of shop-listing pages to crawl.
shop_lis_page_num = 2

# Number of review-list pages to crawl per shop.
reviews_page_num = 10

Step 4: 获取店铺主页链接

# Collect the detail-page URL of every shop on the first `shop_lis_page_num`
# listing pages. A set is used so a shop seen twice is stored once.
shop_set = set()
for page_idx in range(shop_lis_page_num):
    # Scrape the page currently displayed BEFORE paging forward; clicking
    # "next" first would skip the first listing page entirely.
    for ele in driver.find_elements(By.XPATH, shop_lis_xp):
        try:
            shop_url = ele.get_attribute('href')
        except WebDriverException:
            # The element could not be read (e.g. the page re-rendered); skip it
            # instead of abandoning the remaining links on this page.
            continue
        # Ignore anchors without an href and links containing 'brands'.
        if not shop_url or 'brands' in shop_url:
            continue
        shop_set.add(shop_url)

    # The requested number of pages has been scraped; no need to page further.
    if page_idx == shop_lis_page_num - 1:
        break

    try:
        driver.find_element(By.XPATH, next_page_xp).click()
    except WebDriverException as exc:
        # No usable "next page" link: stop rather than re-scrape the same page.
        print(f"Could not open the next listing page, stopping: {exc}")
        break
    time.sleep(2)  # give the next listing page time to render

Step 5: 循环访问店铺主页 并存储数据

def _element_text(root, xpath):
    """Return the ``textContent`` of the first node under *root* matching *xpath*.

    Args:
        root: The WebDriver or WebElement to search from.
        xpath: XPath expression locating the node.

    Returns:
        The node's ``textContent``, or ``''`` when the node is not found, the
        lookup fails inside Selenium, or the attribute is absent. Returning a
        string in every case lets callers chain ``replace``/``strip`` safely.
    """
    try:
        text = root.find_element(By.XPATH, xpath).get_attribute('textContent')
    except WebDriverException:
        return ''
    return text or ''


def _is_forbidden():
    """Return True when the current page contains an <h1> with 'Forbidden'."""
    try:
        return bool(driver.find_elements(By.XPATH, "//h1[contains(text(),'Forbidden')]"))
    except WebDriverException:
        return False


# Create the database engine once for the whole crawl instead of once per shop.
# NOTE(review): the connection URL embeds database credentials; consider
# loading it from configuration instead of keeping it in the source.
conn = create_engine("mysql+pymysql://root:1711785634@localhost/test")

# Accumulates every scraped review row; exported after the crawl.
df_all = pd.DataFrame()

for li in shop_set:
    url = li + '/review_all'
    try:
        driver.get(url)
    except WebDriverException as exc:
        # Skip this shop: continuing would scrape whatever page was loaded
        # before and store it under this shop's URL.
        print(f"Failed to load {url}, skipping: {exc}")
        continue
    time.sleep(3)

    # Stop the whole crawl as soon as the site answers with a Forbidden page.
    if _is_forbidden():
        print("被限制")
        break

    for _ in range(reviews_page_num):
        current_url = driver.current_url

        # Shop-level fields, read from the header of the current review page.
        shop_fields = {
            "shop_url": li,
            "shop_title": _element_text(driver, shop_title_xp).strip().split(' ')[0].strip(),
            "shop_score": _element_text(driver, shop_score_xp).strip(),
            "shop_reviewcount": _element_text(driver, shop_reviewcount_xp).replace('条评价', '').strip(),
            "shop_avgpricetitle": _element_text(driver, shop_avgpricetitle_xp).replace('人均:', '').replace('元', '').strip(),
            "shop_kouwei": _element_text(driver, shop_kouwei_xp).replace('口味:', '').strip(),
            "shop_huanjing": _element_text(driver, shop_huanjing_xp).replace('环境:', '').strip(),
            "shop_fuwu": _element_text(driver, shop_fuwu_xp).replace('服务:', '').strip(),
            "shop_adress": _element_text(driver, shop_adress_xp).replace('地址:', '').strip(),
            "shop_tel": _element_text(driver, shop_tel_xp).replace('电话:', '').strip(),
        }

        # One row per review on this page.
        page_rows = []
        for reviews_ele in driver.find_elements(By.XPATH, reviews_block_xp):
            try:
                # Expand a folded review so the full text is present.
                reviews_ele.find_element(By.XPATH, reviews_zhankai_xp).click()
            except WebDriverException:
                # Best effort: reviews without an expand link are read as-is.
                pass
            time.sleep(0.5)

            reviews_body = _element_text(reviews_ele, reviews_body_xp).replace('收起评价', '').strip()
            reviews_time = _element_text(reviews_ele, reviews_time_xp).strip().split('更新于')[0].strip()

            page_rows.append({
                **shop_fields,
                "reviews_body": reviews_body,
                "reviews_time": reviews_time,
                "current_url": current_url,
            })

        if page_rows:
            df = pd.DataFrame(page_rows)
            # Persist the page to MySQL right away so progress survives a crash.
            df.to_sql(con=conn, name='dazdp_data', if_exists='append', index=False)
            # Keep an in-memory copy for the Excel export after the crawl.
            df_all = pd.concat([df_all, df], ignore_index=True)

        try:
            driver.find_element(By.XPATH, reviews_nextpage_xp).click()
        except WebDriverException:
            # No further review page: stop here, otherwise the same page would
            # be scraped and inserted again on every remaining iteration.
            break
        time.sleep(2)  # let the next review page load

        if _is_forbidden():
            print("被限制")
            # Leaves only the review-page loop; the Forbidden check after the
            # next shop's page load decides whether the crawl stops.
            break

Step 6: 导出为Excel

# Write every collected row to one Excel workbook, without the index column.
excel_path = "大众点评数据.xlsx"
df_all.to_excel(excel_path, index=False)