level 1
Liuyiweihan
楼主
import requests
import re
import time
import pymongo
# MongoDB connection to localhost:27017.
client = pymongo.MongoClient('localhost', 27017)

ershoufang = client['ershoufang']  # database
lianjia = ershoufang['lianjia']  # collection
# Example index URL: https://m.lianjia.com/gz/ershoufang/index/
# Browser-like User-Agent passed to every requests.get call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.149 Safari/537.36'
    ),
}
def get_href(url):
    """Download one index page and hand every listing link on it to get_info.

    Args:
        url: URL of a listing index page.
    """
    page = requests.get(url, headers=headers)
    # Anchors carrying this exact class list are taken as listing links.
    anchor_pattern = '<a href=\"(.*?)\" class="a_mask post_ulog post_ulog_action VIEWDATA CLICKDATA"'
    for detail_url in re.findall(anchor_pattern, page.text):
        get_info(detail_url)
def get_info(link):
    """Fetch one listing detail page, extract its fields and store them.

    The price comes from the ``price_total`` span. Layout, area, orientation
    and community name come from the slash-separated text of the
    ``item_other text_cut`` div. A page on which either element cannot be
    found is skipped with a printed message instead of raising IndexError.

    Args:
        link: URL of a listing detail page.
    """
    res = requests.get(link, headers=headers)
    price = re.search(
        r'<span class="price_total"><em>(.*?)</em><span class="unit">万</span></span>',
        res.text,
    )
    # The original patterns hard-coded one listing's title, layout, area and
    # orientation, so they could only ever match that single page. This
    # pattern accepts any title and captures the four "/"-separated fields.
    # NOTE(review): assumes the first matching div describes this listing and
    # that the text always has exactly this four-part shape - confirm against
    # real pages.
    detail = re.search(
        r'<div class="item_other text_cut" title="[^"]*">'
        r'([^/<]*)/([^/<]*)/([^/<]*)/(.*?)</div>',
        res.text,
        re.S,
    )
    if price is None or detail is None:
        print('skipped, expected fields not found: ' + link)
        return

    huxing, mianji, zuoxiang, name = detail.groups()
    info = {
        '价格': price.group(1).strip(),
        '名称': name.strip(),
        '户型': huxing.strip(),
        '面积': mianji.strip(),
        '坐向': zuoxiang.strip(),
    }
    lianjia.insert_one(info)
#36.05
if __name__ == '__main__':
    # NOTE(review): range(0, 60, 30) produces the suffixes pg0 and pg30. This
    # is kept exactly as in the original; confirm it matches the site's
    # paging scheme.
    urls = ['https://m.lianjia.com/gz/ershoufang/index/pg{}'.format(str(i)) for i in range(0, 60, 30)]
    # enumerate supplies the progress counter. The original read `i` after the
    # list comprehension, where it is undefined in Python 3 (NameError).
    for count, url in enumerate(urls, start=1):
        get_href(url)
        print(str(count))
        # Pause between index pages.
        time.sleep(2)
2020年04月02日 15点04分
1
import re
import time
import pymongo
# MongoDB connection to localhost:27017.
client = pymongo.MongoClient('localhost', 27017)

ershoufang = client['ershoufang']  # database
# This assignment had been swallowed into the trailing comment of the line
# above, which left `lianjia` undefined for get_info.
lianjia = ershoufang['lianjia']  # collection
# Example index URL: https://m.lianjia.com/gz/ershoufang/index/
# Browser-like User-Agent passed to every requests.get call in this script.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
def get_href(url):
    """Download one index page and hand every listing link on it to get_info.

    Args:
        url: URL of a listing index page.
    """
    page = requests.get(url, headers=headers)
    # Anchors carrying this exact class list are taken as listing links.
    anchor_pattern = '<a href=\"(.*?)\" class="a_mask post_ulog post_ulog_action VIEWDATA CLICKDATA"'
    for detail_url in re.findall(anchor_pattern, page.text):
        get_info(detail_url)
def get_info(link):
    """Fetch one listing detail page, extract its fields and store them.

    The price comes from the ``price_total`` span. Layout, area, orientation
    and community name come from the slash-separated text of the
    ``item_other text_cut`` div. A page on which either element cannot be
    found is skipped with a printed message instead of raising IndexError.

    Args:
        link: URL of a listing detail page.
    """
    res = requests.get(link, headers=headers)
    price = re.search(
        r'<span class="price_total"><em>(.*?)</em><span class="unit">万</span></span>',
        res.text,
    )
    # The original patterns hard-coded one listing's title, layout, area and
    # orientation, so they could only ever match that single page. This
    # pattern accepts any title and captures the four "/"-separated fields.
    # NOTE(review): assumes the first matching div describes this listing and
    # that the text always has exactly this four-part shape - confirm against
    # real pages.
    detail = re.search(
        r'<div class="item_other text_cut" title="[^"]*">'
        r'([^/<]*)/([^/<]*)/([^/<]*)/(.*?)</div>',
        res.text,
        re.S,
    )
    if price is None or detail is None:
        print('skipped, expected fields not found: ' + link)
        return

    huxing, mianji, zuoxiang, name = detail.groups()
    info = {
        '价格': price.group(1).strip(),
        '名称': name.strip(),
        '户型': huxing.strip(),
        '面积': mianji.strip(),
        '坐向': zuoxiang.strip(),
    }
    lianjia.insert_one(info)
#36.05
if __name__ == '__main__':
    # NOTE(review): range(0, 60, 30) produces the suffixes pg0 and pg30. This
    # is kept exactly as in the original; confirm it matches the site's
    # paging scheme.
    urls = ['https://m.lianjia.com/gz/ershoufang/index/pg{}'.format(str(i)) for i in range(0, 60, 30)]
    # enumerate supplies the progress counter. The original read `i` after the
    # list comprehension, where it is undefined in Python 3 (NameError).
    for count, url in enumerate(urls, start=1):
        get_href(url)
        print(str(count))
        # Pause between index pages.
        time.sleep(2)