|
|
|
@ -1,9 +1,15 @@
|
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# coding=utf-8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
import json
|
|
|
|
|
from pprint import pprint
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
class Cups():
|
|
|
|
|
|
|
|
|
|
class Cups:
|
|
|
|
|
|
|
|
|
|
def __init__(self, url, page, path):
|
|
|
|
|
self._url = url
|
|
|
|
@ -12,10 +18,14 @@ class Cups():
|
|
|
|
|
self._result = set()
|
|
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
|
""" 启动爬虫 """
|
|
|
|
|
headers = {'X-Requested-With': 'XMLHttpRequest',
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
|
|
|
|
'Chrome/56.0.2924.87 Safari/537.36'}
|
|
|
|
|
"""
|
|
|
|
|
启动爬虫
|
|
|
|
|
"""
|
|
|
|
|
headers = {
|
|
|
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
|
|
|
|
|
'(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
|
|
|
|
|
}
|
|
|
|
|
urls = []
|
|
|
|
|
urls.extend([self._url.format(1, p, 0) for p in range(1, self._page)])
|
|
|
|
|
urls.extend([self._url.format(1, p, 1) for p in range(1, self._page)])
|
|
|
|
@ -25,8 +35,11 @@ class Cups():
|
|
|
|
|
try:
|
|
|
|
|
j = json.loads(requests.get(u, headers=headers, timeout=2).text[15:])
|
|
|
|
|
for i, v in enumerate(j['rateList']):
|
|
|
|
|
goods = (v['rateDate'], v['auctionSku'],
|
|
|
|
|
v['rateContent'].replace("<b>", "").replace("</b>", "").replace("&hellip", ""))
|
|
|
|
|
goods = (
|
|
|
|
|
v['rateDate'],
|
|
|
|
|
v['auctionSku'],
|
|
|
|
|
v['rateContent'].replace("<b>", "").replace("</b>", "").replace("&hellip", "")
|
|
|
|
|
)
|
|
|
|
|
self._result.add(goods)
|
|
|
|
|
print(i)
|
|
|
|
|
except Exception as e:
|
|
|
|
@ -36,18 +49,23 @@ class Cups():
|
|
|
|
|
self.save()
|
|
|
|
|
|
|
|
|
|
def save(self):
    """Save the collected rows to the CSV file at ``self._path``.

    Every element of ``self._result`` is written as one CSV row. An
    existing file at that path is truncated.
    """
    # newline="" hands line-ending control to the csv module; without it,
    # text-mode newline translation produces "\r\r\n" row endings on Windows.
    # Plain "w" is enough: the file is only written, never read back here.
    with open(self._path, "w", encoding="utf-8", newline="") as f:
        f_csv = csv.writer(f)
        f_csv.writerows(self._result)
|
|
|
|
|
|
|
|
|
|
def clear(self, output_path="cup_all.csv"):
    """De-duplicate the rows of the CSV file at ``self._path``.

    Reads every row of ``self._path``, collapses identical rows, and writes
    the unique rows to ``output_path``. Row order in the output is not
    preserved because the rows pass through a set.

    Args:
        output_path: Destination CSV file. Defaults to ``"cup_all.csv"``,
            the path that was previously hard-coded.
    """
    # newline="" on both files lets the csv module handle line endings:
    # embedded "\r\n" inside quoted fields survives the read unchanged, and
    # the writer does not emit "\r\r\n" row endings on Windows.
    with open(self._path, "r", encoding="utf-8", newline="") as f:
        # Rows are lists; convert to tuples so they are hashable.
        unique_rows = {tuple(row) for row in csv.reader(f)}

    with open(output_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(unique_rows)
|
|
|
|
@ -55,7 +73,9 @@ class Cups():
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def extract():
|
|
|
|
|
""" 提取数据 """
|
|
|
|
|
"""
|
|
|
|
|
提取数据
|
|
|
|
|
"""
|
|
|
|
|
# datelst, size_colorlst, commentlst = [], [], []
|
|
|
|
|
# with open("cup_all.csv", "r", encoding="utf-8") as f:
|
|
|
|
|
# fin_csv = csv.reader(f)
|
|
|
|
@ -67,6 +87,7 @@ class Cups():
|
|
|
|
|
# with open("comment.txt", "w+", encoding="utf-8") as f:
|
|
|
|
|
# for r in commentlst:
|
|
|
|
|
# f.write(r + "\n")
|
|
|
|
|
|
|
|
|
|
with open(r"data_/size_color.txt", "r", encoding="utf-8") as fin:
|
|
|
|
|
rows = fin.readlines()
|
|
|
|
|
lst = []
|
|
|
|
@ -75,6 +96,7 @@ class Cups():
|
|
|
|
|
with open(r"data_/size.txt", "w+", encoding="utf-8") as fout:
|
|
|
|
|
fout.writelines(lst)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
url = "https://rate.tmall.com/list_detail_rate.htm?itemId=37457670144&spuId=249827344&" \
|
|
|
|
@ -82,4 +104,4 @@ if __name__ == "__main__":
|
|
|
|
|
cups = Cups(url, 101, "cups.csv")
|
|
|
|
|
# cups.run()
|
|
|
|
|
# cups.clear()
|
|
|
|
|
cups.extract()
|
|
|
|
|
cups.extract()
|
|
|
|
|