init repo

chenjiandongx 6 years ago
commit 69abe72603

.gitignore

@@ -0,0 +1,90 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
/cup/

# C extensions
*.so
/data_/
.idea

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

@@ -0,0 +1,79 @@
import csv
import json
from pprint import pprint

import requests


class Cups:

    def __init__(self, url, page, path):
        self._url = url
        self._page = page
        self._path = path
        self._result = set()

    def run(self):
        """ Start the crawler """
        headers = {'X-Requested-With': 'XMLHttpRequest',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/56.0.2924.87 Safari/537.36'}
        # Build the page URLs for every combination of the order (1/3) and
        # content (0/1) query parameters.
        urls = []
        urls.extend([self._url.format(1, p, 0) for p in range(1, self._page)])
        urls.extend([self._url.format(1, p, 1) for p in range(1, self._page)])
        urls.extend([self._url.format(3, p, 0) for p in range(1, self._page)])
        urls.extend([self._url.format(3, p, 1) for p in range(1, self._page)])
        for i, u in enumerate(urls):
            try:
                # The response carries a fixed-length prefix before the JSON
                # payload, so the first 15 characters are stripped off.
                j = json.loads(requests.get(u, headers=headers, timeout=2).text[15:])
                for v in j['rateList']:
                    # Keep date, SKU and comment text, stripping the <b> markup
                    # and the stray &hellip entity.
                    goods = (v['rateDate'], v['auctionSku'],
                             v['rateContent'].replace("<b>", "").replace("</b>", "").replace("&hellip", ""))
                    self._result.add(goods)
                print(i)  # progress: index of the URL just fetched
            except Exception as e:
                print(e)
        pprint(self._result)
        print(len(self._result))
        self.save()

    def save(self):
        """ Save the data locally """
        with open(self._path, "w+", encoding="utf-8", newline="") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(self._result)

    def clear(self):
        """ Deduplicate the data """
        s = set()
        with open(self._path, "r", encoding="utf-8") as f:
            fin_csv = csv.reader(f)
            for row in fin_csv:
                s.add(tuple(row))
        with open("cup_all.csv", "w+", encoding="utf-8", newline="") as f:
            fout_csv = csv.writer(f)
            fout_csv.writerows(s)
        print(len(s))

    @staticmethod
    def extract():
        """ Extract the data """
        datelst, size_colorlst, commentlst = [], [], []
        with open("cup_all.csv", "r", encoding="utf-8") as f:
            fin_csv = csv.reader(f)
            for row in fin_csv:
                date, size_color, comment = row
                # datelst.append((date))
                # size_colorlst.append((size_color))
                commentlst.append(comment)
        with open("comment.txt", "w+", encoding="utf-8") as f:
            for r in commentlst:
                f.write(r + "\n")


if __name__ == "__main__":
    url = "https://rate.tmall.com/list_detail_rate.htm?itemId=37457670144&spuId=249827344&" \
          "sellerId=470355944&order={}&currentPage={}&append=0&content={}"
    cups = Cups(url, 101, "cups.csv")
    # cups.run()
    # cups.clear()
    cups.extract()
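
Note: the text[15:] slice in run() assumes the rate endpoint always prepends exactly 15 characters before the JSON payload. A minimal sketch of a less brittle way to trim that wrapper, assuming the body contains a single JSON object (the helper name parse_rate_json is invented for illustration):

import json


def parse_rate_json(raw_text):
    """Locate and parse the JSON object inside the raw response body."""
    # Find the outermost braces instead of slicing a fixed number of
    # characters, so a changed prefix or suffix does not break parsing.
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start == -1 or end == -1:
        raise ValueError("no JSON object found in response")
    return json.loads(raw_text[start:end + 1])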