结果如下:
程序运行结果
"C:\Program Files\Python\Python38\python.exe" E:/PycharmProjects/test/爬虫/今日头条街拍/Spider.py
请求文章详情页出错 None
正在下载1d47b4dceb635bff92ef296ce530cdc2.jpg
正在下载bff2d27b1d4ed77cc0a575513b9eeea4.jpg
正在下载f30c57b85c75fe655612450c993b320b.jpg
正在下载2475f2ad97bdcb6a45e6d64ce1650ca6.jpg
正在下载52fc1cd305b52366532f3ec7a9729a71.jpg
正在下载dc59cae9ed2859454c461b2944b819b1.jpg
正在下载bf1b48c4ca5cfeffec3bee95b1da6b64.jpg
正在下载7b8c8a28c037eac669988e34e90e37de.jpg
正在下载ad0fb9dbfcc0f15981fd701339e13bb8.jpg
正在下载98486366369a8c0b79f4c6ced3fb51b0.jpg
E:/PycharmProjects/test/爬虫/今日头条街拍/Spider.py:125: DeprecationWarning: insert is deprecated. Use insert_one or insert_many instead.
if db[MONGO_TABLE].insert(result):
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6760980286252515853/', 'title': '三里屯街拍', 'images': ['http://p9.pstatp.com/origin/pgc-image/f23cd60127824cb88b11256cf9ffe951', 'http://p9.pstatp.com/origin/pgc-image/0c54ffb46fdd4553ad99e9273a85643f', 'http://p9.pstatp.com/origin/pgc-image/da0dd5a6d8314f699de340c06a9093ec', 'http://p3.pstatp.com/origin/pgc-image/7fff7dff1863466883d0920481bc9ffe', 'http://p1.pstatp.com/origin/pgc-image/9ddcb34586514ab7ad6427c73c416f51', 'http://p3.pstatp.com/origin/pgc-image/a406850542ca496aa1f68333e42aea87', 'http://p1.pstatp.com/origin/pgc-image/50bb8781363e4e3e8e9e45137ded07f6', 'http://p3.pstatp.com/origin/pgc-image/6dfc140a4d814cd49295f210aa1499bd', 'http://p1.pstatp.com/origin/pgc-image/014679ab63584e0ca7f8612b8b120651', 'http://p9.pstatp.com/origin/pgc-image/be0226872a864d2aac76c28987938cdf'], '_id': ObjectId('5e648f564d5a25526d8630e9')}
请求文章详情页出错 None
正在下载289d58776687584857ea8d2cec8e3266.jpg
正在下载4d89bd5efc7166322207e31e737cd6d2.jpg
正在下载bcfd11725613120a2f6f20145521bf70.jpg
正在下载2256eed522bab696d578be7bd021a79d.jpg
正在下载9f4b6fac29acae9d0d3964a33922cd76.jpg
正在下载6226b26aacb6111fa5bb6ab4ecb360ca.jpg
正在下载22e469ecf6277bf6631e9053b0782656.jpg
正在下载fb6e9bba0afd3880fd5acb418c08af06.jpg
正在下载9e418d8f5aa93beefe64005a941b0b8d.jpg
正在下载dc98a060a3ee9abe1583bc304de1dfd1.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6797731729667785220/', 'title': '10张街拍,都是移动的美景', 'images': ['http://p1.pstatp.com/origin/pgc-image/e783e840dfb54e3c8c30c385d1089345', 'http://p9.pstatp.com/origin/pgc-image/9476cd0974cb4a2984665b2c5b983212', 'http://p1.pstatp.com/origin/pgc-image/d4fbc13cfc2d43ac8161b4be08df3e14', 'http://p3.pstatp.com/origin/pgc-image/b18028cec93146a2b51e5671da0521d5', 'http://p1.pstatp.com/origin/pgc-image/64bd25e7787e4273a2286b35a3ae6446', 'http://p3.pstatp.com/origin/pgc-image/dfa087e2320241b3a17911d4e0ac3029', 'http://p3.pstatp.com/origin/pgc-image/3d8f1af2896147ee93bf2d2fd7ba9ca8', 'http://p1.pstatp.com/origin/pgc-image/120a3479c1e64c339fd248804e65e07a', 'http://p9.pstatp.com/origin/pgc-image/c4719a980d674138bcdf6f275ef02641', 'http://p3.pstatp.com/origin/pgc-image/9166a9f1c804471289a2561b132e6c2d'], '_id': ObjectId('5e648f584d5a25526d8630ea')}
请求文章详情页出错 None
正在下载058f885bfde3719ed25338005a437383.jpg
正在下载232333b9f0f2b87a553e8f24f728890e.jpg
正在下载b81adeef46b7948a2ae60ba19b1c2f9f.jpg
正在下载0617b7ed0ad9230354903c6afe97db35.jpg
正在下载a9a568400013d371b26f9ed5ed475867.jpg
正在下载3fe54e96b51b9ace46634e8f94ab5592.jpg
正在下载33110348606726d108c04b7c40b964d9.jpg
正在下载a1052b801e72efdbfc1c4cc0a81aac95.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6798525975911989767/', 'title': '8张街拍,美腿是亮点', 'images': ['http://p1.pstatp.com/origin/pgc-image/016ab17507bd46639a130da594e0c5a7', 'http://p1.pstatp.com/origin/pgc-image/a812e94ff2634cd88e3212d0f8e2e69d', 'http://p3.pstatp.com/origin/pgc-image/2e71f3fe533e4907be31731dfa27d42b', 'http://p3.pstatp.com/origin/pgc-image/e57e58cf58204fa0849b6b10f27c2a63', 'http://p1.pstatp.com/origin/pgc-image/2430de0431354ccb98683a2e48886679', 'http://p1.pstatp.com/origin/pgc-image/9c122e7a3c764c249c5c1f8a89d2d92d', 'http://p1.pstatp.com/origin/pgc-image/d812aa22e3b94c5382afaf6c23970dcd', 'http://p3.pstatp.com/origin/pgc-image/015a672717b24791ba75ad876238c79b'], '_id': ObjectId('5e648f5a4d5a25526d8630eb')}
请求文章详情页出错 None
正在下载2f03708f2837f73dcb109d23f9416792.jpg
正在下载bd905f5aa9bbf02485958efdbbac1dd9.jpg
正在下载d217942affa862060bc9196e38d7c79c.jpg
正在下载81bfdb8072c2345216b26df2eacad4de.jpg
正在下载49187040d908d5ecaaa66fc1d4065594.jpg
正在下载63afad5c014c8b814f3f2985e1f6f0e9.jpg
正在下载32b7ad87244b35dfb0537aa17db4d481.jpg
正在下载788e4bc0f4656154c3155e4c8fb8c86b.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6544662303671845383/', 'title': '街拍美女,每个都不会让你失望,能拍到这么多很难得', 'images': ['http://p1.pstatp.com/origin/pgc-image/1523797525926884c65306e', 'http://p3.pstatp.com/origin/pgc-image/152379752620949b0806b6a', 'http://p3.pstatp.com/origin/pgc-image/1523797526532845db7eb8f', 'http://p9.pstatp.com/origin/pgc-image/1523797526872017fabff50', 'http://p1.pstatp.com/origin/pgc-image/15237975272443a6dc806be', 'http://p1.pstatp.com/origin/pgc-image/1523797527522d8e7dd116e', 'http://p1.pstatp.com/origin/pgc-image/15237975277918d0c1769d3', 'http://p9.pstatp.com/origin/pgc-image/152379753708045ae2bcea6'], '_id': ObjectId('5e648f5d4d5a25526d8630ec')}
请求文章详情页出错 None
正在下载1c24d58fcecb7c040d7eecb7f5f40812.jpg
正在下载57a39dd807a981d1cbc0ce114084bc5a.jpg
正在下载539b075ff4afec5baa3277758a7b6c4d.jpg
正在下载928ec49ea00572be1dc16bcbe970f0f7.jpg
正在下载7851da03c9b9d535a9c271a12db6d3d7.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6801700984973165059/', 'title': '街拍时尚美女达人,个性时尚靓丽', 'images': ['http://p1.pstatp.com/origin/pgc-image/f5c5829387b14ff99685a28a0eeab0fe', 'http://p3.pstatp.com/origin/pgc-image/fdec9a612a6044feb92f981be07821ba', 'http://p3.pstatp.com/origin/pgc-image/4e1028d6ff804bdb8286087699c93dd6', 'http://p3.pstatp.com/origin/pgc-image/ec39dfbd5e6f4d1e8ddbf093c5dac423', 'http://p3.pstatp.com/origin/pgc-image/21be061a0fbf47d5a9084d707757a0e0'], '_id': ObjectId('5e648f614d5a25526d8630ed')}
请求文章详情页出错 None
正在下载60451c6c4904eb992260fe3337a7da85.jpg
正在下载4e8b28d7767d035ebb2e9be015af5f12.jpg
正在下载40f67d5373b2eb26f502002cc8d1379f.jpg
正在下载317532fa5039e4707a55f095f5287ea4.jpg
正在下载4f849724eadfb58e743d95ed35e01ca1.jpg
正在下载51a4b4e42cd90441c988d5747a1fa0ce.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6793512652669714956/', 'title': '路人街拍,随性有度,高颜值任性', 'images': ['http://p1.pstatp.com/origin/pgc-image/812c6785511940859d0ae8340023cd33', 'http://p1.pstatp.com/origin/pgc-image/a95721f9184c467195c1f03ebf5ddb2d', 'http://p1.pstatp.com/origin/pgc-image/87a94ae849e64590a691f5866a75a6a6', 'http://p1.pstatp.com/origin/pgc-image/2463ca69bd38421eb2b3fcb87a2b6485', 'http://p3.pstatp.com/origin/pgc-image/437bf5b276ef4d178c33f3b5121aacad', 'http://p3.pstatp.com/origin/pgc-image/3729b42f46a741cd85366d08fa10dabe'], '_id': ObjectId('5e648f644d5a25526d8630ee')}
请求文章详情页出错 None
正在下载605a6964f274bf05359a94611d766c90.jpg
正在下载b263ab68d9f2a481282f0b95dd8185da.jpg
正在下载22229fb6e545aa1c362a864a254dc190.jpg
正在下载7178df50a0dc98dd8081462c55b39abb.jpg
正在下载c47eeecb2060eb7e979b26d201c89d85.jpg
正在下载8196e60f431fc88c07faf569b0d410d5.jpg
正在下载7b60211a6c1e85cd8d0a5b313a3eea54.jpg
正在下载a9e01aa5e7bf45092479ce5b545e5b43.jpg
正在下载704ab8f5cf3ed7fe6684c29fc85c32b9.jpg
正在下载ff47896bf7d103e1f11c92291a6e22fa.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6798491818288742915/', 'title': '街拍:时尚的美女们那无处安放的魅力呀', 'images': ['http://p3.pstatp.com/origin/pgc-image/6ad69aabdd7b4638a4d1ae95ad03aa31', 'http://p3.pstatp.com/origin/pgc-image/d687b4a681d04c96966ca068571cbfc7', 'http://p1.pstatp.com/origin/pgc-image/5d1800490a45422a8ecf640aec508117', 'http://p3.pstatp.com/origin/pgc-image/0fbd7554e6e44b0baffeddba5e682397', 'http://p3.pstatp.com/origin/pgc-image/be0f015c0ef4494f891948cb9b2079df', 'http://p1.pstatp.com/origin/pgc-image/7f2443587bfe47a584e2fc03d6274b44', 'http://p9.pstatp.com/origin/pgc-image/7bad2342572f4141a72c963c3bf7489d', 'http://p9.pstatp.com/origin/pgc-image/546965a83e344dfe8939142381401839', 'http://p3.pstatp.com/origin/pgc-image/7f351bd5863146e384849ca68e858f89', 'http://p1.pstatp.com/origin/pgc-image/b087dddab5874b049eb97b5cc0e5a46e'], '_id': ObjectId('5e648f6b4d5a25526d8630ef')}
正在下载c0d75e4999e69d869e86d5fec1f59485.jpg
正在下载56ba253418c432c99a57331cf38c34d0.jpg
正在下载3d17fffb0618658810d3e03f4051a8ae.jpg
正在下载a0406a502caf02ff77cf96dd173c0a58.jpg
正在下载62d84cedab105e433c88afb0327780c6.jpg
正在下载ae8e01add6e9d85ec8aff9d495a68985.jpg
正在下载62be7702c19941666ab7550add3b0846.jpg
正在下载c40e65a1ee585414a10fd62c2f1abc3c.jpg
正在下载400900353f90b9f0c1dccbc41254bf6e.jpg
正在下载c55892d7a5d3bff48d2ec49698b8cc77.jpg
正在下载2bfb96396e6ac8c655994505d2787813.jpg
正在下载4f29434ac280b66c11c97d9cbee304a0.jpg
正在下载1a0a748c67847be27d477f4dc5629ce2.jpg
正在下载bbf03b4c64ee88e8f30f7ce9c4ffb75d.jpg
正在下载aa06350b58919b0bf4c8dcf67dfa051d.jpg
正在下载e8204bc0995bb67209f1ae2ce46dfa6d.jpg
正在下载5ae505e545b1a0ddae53f4d9e04af17e.jpg
正在下载3170b09e65c287c47d9424a569c8b211.jpg
正在下载93729e56d76c91baf217e668853b9675.jpg
正在下载d32daacb8c3c6760d171b25d257e620f.jpg
正在下载0b85fb287c9a9a0e1ed178ec3ce6c7ad.jpg
正在下载6632d9ceccb7f048060f6121d3471028.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6360561337588465922/', 'title': '三里屯时尚街拍', 'images': ['http://p1.pstatp.com/origin/127a000565b624674d7f', 'http://p9.pstatp.com/origin/11f60008aea95af99269', 'http://p3.pstatp.com/origin/127b0000b91a74ee8aa0', 'http://p9.pstatp.com/origin/127b0000b91d0af5e30d', 'http://p9.pstatp.com/origin/11ff00035036c49b99ff', 'http://p1.pstatp.com/origin/127a000565c33e48688d', 'http://p3.pstatp.com/origin/127a000565c5bb7a220d', 'http://p1.pstatp.com/origin/127b0000bb5db97385a5', 'http://p3.pstatp.com/origin/127b0000bb839a6d7fdd', 'http://p3.pstatp.com/origin/11ff0003528897cbd6a3', 'http://p1.pstatp.com/origin/127b0000bbca727e9712', 'http://p1.pstatp.com/origin/11ff000352d0b620b45d', 'http://p3.pstatp.com/origin/127b0000bc09a4933d91', 'http://p3.pstatp.com/origin/127a000568ba70678b41', 'http://p3.pstatp.com/origin/11ff0003530c352041b2', 'http://p1.pstatp.com/origin/1279000000228a83fb2d', 'http://p1.pstatp.com/origin/11f80002e84fb20134ff', 'http://p1.pstatp.com/origin/127900000061ba8db7f1', 'http://p1.pstatp.com/origin/127d0000008129c92602', 'http://p9.pstatp.com/origin/127d000000c6b41742c7', 'http://p1.pstatp.com/origin/1279000000cfe9cf96a5', 'http://p3.pstatp.com/origin/127d000000e8996bb8a1'], '_id': ObjectId('5e648f714d5a25526d8630f0')}
正在下载e8b059259110fb7b2145238c783e1e16.jpg
正在下载4e611f9b0bf4f50be7da523398b81ccb.jpg
正在下载f114136a7df71b5e303903ed8bdfde16.jpg
正在下载08189490c08f7072cc3d7df8e2e3a9ca.jpg
正在下载73812fb5ba1d4f781cd86cccac76e7b1.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6801620851247546887/', 'title': '街拍:身材好,朴素的搭配也好看', 'images': ['http://p3.pstatp.com/origin/pgc-image/229b29b9d46d41b59a6b1241b4f6522e', 'http://p3.pstatp.com/origin/pgc-image/22a06b2a0b9c452b91b8b8fe5ff9eeb9', 'http://p9.pstatp.com/origin/pgc-image/6545bf27a1d84b7695fe75913165dc3f', 'http://p1.pstatp.com/origin/pgc-image/68e79f9722524c2b9e95e369231e881f', 'http://p3.pstatp.com/origin/pgc-image/4add570ae30949319ee7a82a8a70905e'], '_id': ObjectId('5e648f744d5a25526d8630f1')}
正在下载6ad8e277bc909ab8b2aef5f65e6fc108.jpg
正在下载68dec26885c44b8d592b16799fab9bec.jpg
正在下载ca0d1280000bd094ee618107525c50cd.jpg
正在下载9f3eb9fb5e1587ba4ea6563b41bba065.jpg
正在下载b803cfbd980e1ea37c9943cecbe4d9d2.jpg
正在下载52beaad3de4da77a21d6b33b9c6bc535.jpg
正在下载818ec69ece423d6ef53df0e673755db3.jpg
正在下载4582c657f330e7925dad64d9a43ee23b.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6801713871884124686/', 'title': '图虫街拍摄影:无题', 'images': ['http://p3.pstatp.com/origin/tuchong.fullscreen/243059128_tt', 'http://p1.pstatp.com/origin/tuchong.fullscreen/192138312_tt', 'http://p3.pstatp.com/origin/tuchong.fullscreen/311282017_tt', 'http://p1.pstatp.com/origin/tuchong.fullscreen/238865179_tt', 'http://p3.pstatp.com/origin/tuchong.fullscreen/229820783_tt', 'http://p3.pstatp.com/origin/tuchong.fullscreen/503761196_tt', 'http://p3.pstatp.com/origin/tuchong.fullscreen/63687246_tt', 'http://p1.pstatp.com/origin/tuchong.fullscreen/292407501_tt'], '_id': ObjectId('5e648f784d5a25526d8630f2')}
请求文章详情页出错 None
正在下载a83165d608142d3673f161ddcbe78473.jpg
正在下载2a3a99a77ecac2a5d5053b76984af14a.jpg
正在下载e13ff6d472651823cf060564621858c3.jpg
正在下载60715dabb78cb7f25bd4c7e773816b98.jpg
正在下载fd5a68dea6558fab2dbcd4ee33a66bb5.jpg
正在下载bea2d6122c2b4b9cac123b870fa2a2e5.jpg
正在下载1433ff923ee966d7aad93306a00e9194.jpg
正在下载3515669e22dc198da7413bd1bc144b26.jpg
正在下载2b0a30397d74c8307b2fe208c101cb6b.jpg
正在下载ea938a964c692a72889c3550946beb40.jpg
正在下载27acc644e75f99ed74c3d2fe7ac90c90.jpg
正在下载6e4d52ef6cd03cf1b08e1aa2f66049af.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6761387978905354765/', 'title': '街拍美女', 'images': ['http://p3.pstatp.com/origin/pgc-image/75cedc8655974a5393d37001cc997ffb', 'http://p1.pstatp.com/origin/pgc-image/7867bd5c02c040519de20b4f7b6aec86', 'http://p1.pstatp.com/origin/pgc-image/85bb485b37ab4ae893333b2a3a1d3bdc', 'http://p3.pstatp.com/origin/pgc-image/25e8b5d6b07542e891d8589272dc43f0', 'http://p1.pstatp.com/origin/pgc-image/d241f49affaf43f1b8ccb26e5de6dfd9', 'http://p3.pstatp.com/origin/pgc-image/d86a1ed0b6a84e0b9df9e9132346751f', 'http://p3.pstatp.com/origin/pgc-image/78aa27e880eb415f9b08fcace47dbbad', 'http://p1.pstatp.com/origin/pgc-image/1752f41b2b86499d8986986783cb2986', 'http://p1.pstatp.com/origin/pgc-image/e55546f4cd29475da01997ded3d7c0d1', 'http://p1.pstatp.com/origin/pgc-image/25d0fc56b79a437086b70d57ee2d2d57', 'http://p9.pstatp.com/origin/pgc-image/d5c878a943ed46fb87e1c059831a1127', 'http://p3.pstatp.com/origin/pgc-image/0340686e0c7a417f8262fb8a9a0b6854'], '_id': ObjectId('5e648f7e4d5a25526d8630f3')}
正在下载69e442ab916aacb3381271b40536d918.jpg
正在下载db907cb5a1036864aff213d46984a88d.jpg
正在下载04b93d83f943a0507ed547b6736f535a.jpg
正在下载c5ead73b64e742f5f81fe29cb1f86fdf.jpg
正在下载21ed741bc8196cc52dc2e3881e3ea2b7.jpg
正在下载ced7135b971626d5830f01eee8e70813.jpg
正在下载7cc78999ee6e6ec05380d976884df636.jpg
存储到MongoDB成功 {'url': 'http://toutiao.com/group/6801669716575257099/', 'title': '街拍格子衬衫+黑色束腰大衣,美女绽放迷人微笑', 'images': ['http://p9.pstatp.com/origin/pgc-image/08ab6c2fafb84551b6b9018a070bdfc3', 'http://p1.pstatp.com/origin/pgc-image/7878dfe8070a4922b2f41996745e3257', 'http://p3.pstatp.com/origin/pgc-image/a8402a23183d4cb7a67af9bf8e387c94', 'http://p3.pstatp.com/origin/pgc-image/d962686c0d3848abb28bf5f1c42dcfdd', 'http://p1.pstatp.com/origin/pgc-image/3b97d9f5b9bf4676968a89a7f01299f1', 'http://p3.pstatp.com/origin/pgc-image/91ec603752794b5b8d9d0c67e68eeb5d', 'http://p9.pstatp.com/origin/pgc-image/b401c7c025a04c17a4e99484330e32fb'], '_id': ObjectId('5e648f844d5a25526d8630f4')}
请求文章详情页出错 None
进程已结束,退出代码 0
代码贴上:
import json
import os
from _md5 import md5
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import requests
import re
import pymongo
# --- MongoDB settings -------------------------------------------------------
# Host passed to MongoClient, database name, and collection name used by
# save_to_mongo().
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao_jiepai'
MONGO_TABLE = 'jiepai'
# connect=False is forwarded to MongoClient; per the PyMongo documentation this
# defers opening the connection until the first operation.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
# --- Crawl range ------------------------------------------------------------
# First and last search offsets walked by the script entry point (inclusive,
# in steps of 20).
# NOTE(review): "STARST" is a misspelling of "START"; the name is kept because
# the script entry point references it.
GROUP_STARST = 0
GROUP_END = 20
# Search keyword sent to the search API by main().
KEYWORD = '街拍'
# --- HTTP headers sent with every request -----------------------------------
# NOTE(review): the cookie below embeds what look like live session
# identifiers (sessionid, sid_tt, csrftoken, ...). Hard-coding them in source
# exposes the account to anyone who can read the file and they will expire;
# consider loading them from the environment or a local, untracked config.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
'cookie': 'csrftoken=bf9266cc133025b21a5e4998ffd5ce35; tt_webid=6790686063313733134; UM_distinctid=1701fb35e91866-04c171aadb2013-b791b36-144000-1701fb35e9279a; ttcid=86d38a88812b421583147156ab016ec237; _ga=GA1.2.1866509185.1582287463; s_v_web_id=verify_k794o57o_YRKnMOhd_FUeV_4ZdI_Bpo2_3vXzw7x5Gx3a; __utma=24953151.1866509185.1582287463.1583116936.1583116936.1; __utmz=24953151.1583116936.1.1.utmcsr=mp.toutiao.com|utmccn=(referral)|utmcmd=referral|utmcct=/profile_v3/weitoutiao; __utmc=24953151; SLARDAR_WEB_ID=7b78defd-3070-4596-9123-59244140755e; passport_auth_status=0f792b07d1cc7e289d22eff873cc7c10%2C05d5a1d4bfea33adb46666e2ce9e3596; toutiao_sso_user=7da7c4fb85bcf55cc492a893a21d6251; sessionid=a915499f555249bdb4f3589e50930ec3; uid_tt=769f44a859173e5be80260a51aff7903; sid_tt=a915499f555249bdb4f3589e50930ec3; tt_scid=hu9kP-IprXVAhkiPwrxxRBJJFpnHByB.40Boc8LMWZxP.rmAnLe5H8ltCAKUsLk4032b; __tasessionId=tchl9rncq1583588525235; CNZZDATA1272960458=515500770-1581078898-%7C1583585464'
}
# Fetch the JSON text returned by the search AJAX endpoint.
def get_page_index(offset, keyword, timeout=10):
    """Request one page of search results from the Toutiao search API.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or any other request
        failure).
    """
    data = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): fixed timestamp captured from a browser session;
        # confirm whether the API expects the current time here.
        'timestamp': '1583588896243'
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data)
    # Example of the resulting URL (wrapped for readability):
    # https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search
    # &offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20
    # &en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1583588896243
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Timeouts are RequestException subclasses, so they land here too.
        print("请求ajax出错")
        return None
    if response.status_code == 200:
        return response.text
    return None
# Extract the URL of every article listed in an index response.
def parse_page_index(html):
    """Yield the article URLs contained in a search-API JSON response.

    Entries without a usable ``article_url`` are skipped instead of being
    yielded as ``None``; a ``None`` URL would otherwise be handed to the
    detail-page request and reported there as a failed request.

    Args:
        html: JSON text of the index response, or ``None``/empty when the
            index request failed.

    Yields:
        Each non-empty ``article_url`` string found under the top-level
        ``data`` list, in response order. Nothing is yielded when the input
        is empty, is not valid JSON, is not a JSON object, or has no ``data``
        list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # The body was not JSON (for example an HTML error page).
        print("解析索引页JSON出错")
        return
    if not isinstance(data, dict):
        return
    # ``data`` may be missing or null in the response; treat both as "no items".
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url
# Fetch the HTML source of an article detail page.
def get_page_detail(url, timeout=10):
    """Download the HTML of one article page.

    Args:
        url: Address of the article page. ``None`` or an empty string is
            accepted and short-circuits to ``None`` without any network
            access, because index entries that have no article URL reach
            this function as ``None``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page source as text on HTTP 200, otherwise ``None``.
    """
    if not url:
        return None
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print("请求文章详情页出错", url)
        return None
    if response.status_code == 200:
        return response.text
    return None
def _decode_js_string(raw):
    """Return the text denoted by the body of a JavaScript string literal."""
    try:
        # Parsing the literal as a JSON string resolves the escapes while
        # keeping non-ASCII characters intact.
        return json.loads('"' + raw + '"')
    except ValueError:
        # Fall back to the previous decoding for literals that are not valid
        # JSON strings. This path garbles non-ASCII characters.
        return raw.encode('utf-8').decode('unicode-escape')


def parse_page_detail(html, url):
    """Extract the gallery of an article page and download its images.

    The gallery is embedded in the page as
    ``gallery: JSON.parse("<escaped JSON>")``; the escaped JSON is decoded
    and its ``sub_images`` entries supply the image URLs. Every image URL is
    passed to ``download_image`` together with the page title.

    Args:
        html: Source of the article page.
        url: Address of the article page, copied into the result.

    Returns:
        ``{'url': ..., 'title': ..., 'images': [...]}`` when the page contains
        gallery data with a ``sub_images`` key, otherwise ``None``. The title
        falls back to ``'untitled'`` when the page has no ``<title>`` text.
    """
    # Raw string: "\(" is a regex escape, not a Python string escape.
    images_pattern = re.compile(
        r'BASE_DATA.galleryInfo.*?gallery: JSON.parse\("(.*?)"\),', re.S)
    match = images_pattern.search(html)
    if not match:
        # Not a gallery page; skip the HTML parse entirely.
        return None
    try:
        data = json.loads(_decode_js_string(match.group(1)))
    except ValueError:
        # Covers malformed JSON and undecodable escape sequences.
        print("解析图集数据出错", url)
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    title = title or 'untitled'

    # Drop entries without a URL so no None is handed to the downloader.
    images = [
        item.get('url')
        for item in data.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image, title)
    return {
        'url': url,
        'title': title,
        'images': images
    }
def download_image(url, title, timeout=10):
    """Download one image and store it via ``save_image``.

    Args:
        url: Address of the image. ``None`` or an empty string is ignored.
        title: Passed through to ``save_image`` together with the image bytes.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the image is written to disk as a side effect when
        the server answers with HTTP 200.
    """
    if not url:
        return None
    try:
        # NOTE(review): the shared ``headers`` (including its cookie) are sent
        # to the image host as well; confirm that is intended.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print("请求图片出错", url)
        return None
    if response.status_code == 200:
        save_image(response.content, title)
    return None
def save_image(content, title):
    """Write image bytes to ``<title>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same file and are written only once per directory.

    Args:
        content: Raw image bytes.
        title: Name of the directory to store the image in. Characters that
            are not allowed in file names (``\\ / : * ? " < > |``) are
            replaced with ``_``; an empty result becomes ``'untitled'``.
    """
    # Local import: use the public hashlib API rather than the private _md5
    # module, without touching the file-level imports.
    import hashlib

    dir_name = re.sub(r'[\\/:*?"<>|]', '_', title or '').strip(' .') or 'untitled'
    os.makedirs(dir_name, exist_ok=True)
    file_name = '{0}.{1}'.format(hashlib.md5(content).hexdigest(), 'jpg')
    print('正在下载' + file_name)
    # Check for the file inside the target directory, which is where it is
    # actually written.
    file_path = os.path.join(dir_name, file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def save_to_mongo(result):
    """Insert one parsed article record into the configured collection.

    Uses ``insert_one`` because ``Collection.insert`` is deprecated (the
    driver emits a DeprecationWarning naming ``insert_one``/``insert_many``
    as the replacements).

    Args:
        result: Document to store. The driver adds an ``_id`` key to it.

    Returns:
        ``True`` when the document was inserted, ``False`` when the driver
        reported an error or returned no id.
    """
    try:
        inserted_id = db[MONGO_TABLE].insert_one(result).inserted_id
    except pymongo.errors.PyMongoError as exc:
        # A storage failure for one article should not abort the whole crawl.
        print("存储到MongoDB失败", exc)
        return False
    if inserted_id is None:
        return False
    print("存储到MongoDB成功", result)
    return True
def main(offset):
    """Crawl one page of search results and store every gallery found.

    Fetches the index page for ``offset``, then for each article URL fetches
    the detail page, parses it, and saves the parsed record to MongoDB.

    Args:
        offset: Search-result offset of the index page to process.
    """
    index_json = get_page_index(offset, KEYWORD)
    if not index_json:
        # The index request failed or returned a non-200 status; there is
        # nothing to parse for this offset.
        return
    for url in parse_page_index(index_json):
        if not url:
            # Index entries without an article URL cannot be fetched.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)
if __name__ == '__main__':
    # Walk the search offsets from GROUP_STARST up to and including
    # GROUP_END, advancing by 20 each time.
    page_offset = GROUP_STARST
    while page_offset <= GROUP_END:
        main(page_offset)
        page_offset += 20