各位用户为了找寻关于Python实现爬取千万淘宝商品的方法的资料费了很多周折。这里教程网为您整理了关于Python实现爬取千万淘宝商品的方法的相关资料，仅供查阅。以下为您介绍Python实现爬取千万淘宝商品的方法的详细内容。

本文实例讲述了python实现爬取千万淘宝商品的方法。分享给大家供大家参考。具体实现方法如下:

"""Crawl Taobao mobile-search results into two LevelDB stores.

One store (``./taobao-cate``) maps a search keyword ("category") to a state:
``b'waiting'``, ``b'crawling'`` or ``b'OK'``.  The other (``./taobao-item``)
maps an item id to the JSON-encoded item record.  A single producer thread
scans the category store and feeds unfinished categories into a queue; several
worker threads consume the queue, page through the search API, store the items
and register any newly discovered sub-categories as ``b'waiting'``.
"""
import itertools
import json
import re
import sys
import time
from queue import Queue
from threading import Thread
from urllib.parse import quote_plus

import leveldb
import requests

# Search endpoint; the placeholders are the URL-quoted keyword and the page number.
URL_BASE = 'http://s.m.taobao.com/search?q={}&n=200&m=api4h5&style=list&page={}'


def url_get(url):
    """Return the response body of a GET request to *url* as text.

    The request is sent with fixed browser-like headers and a 5 second
    timeout; any exception raised by ``requests`` propagates to the caller.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'DNT': '1',
        'User-Agent': 'Mozilla/12.0 (compatible; MSIE 8.0; Windows NT)',
    }
    return requests.get(url, timeout=5, headers=header).text


def _fetch_json(url, attempts=5):
    """Fetch *url* and decode it as JSON, trying up to *attempts* times.

    The exception from the final failed attempt is re-raised.
    KeyboardInterrupt is not an ``Exception`` subclass, so it is never
    retried and propagates immediately.
    """
    for attempt in range(attempts):
        try:
            return json.loads(url_get(url))
        except Exception:
            if attempt == attempts - 1:
                raise


def _category_done(db_cate, key):
    """Return True when *key* is stored in *db_cate* with state ``b'OK'``."""
    try:
        return db_cate.Get(key) == b'OK'
    except KeyError:
        # Key not stored yet: the category has never been crawled.
        return False


def _put_if_absent(db, key, value):
    """Store *value* under *key* only when *key* is not already in *db*."""
    try:
        db.Get(key)
    except KeyError:
        db.Put(key, value)


def item_thread(cate_queue, db_cate, db_item):
    """Worker loop: crawl every category taken from *cate_queue*.

    For each category the state in *db_cate* is set to ``b'crawling'``, all
    result pages are fetched until an empty page is returned, each item is
    written to *db_item* keyed by its numeric id, and sub-categories listed
    under ``nav.navCatList`` are registered in *db_cate* as ``b'waiting'``.
    When all pages are done the category is marked ``b'OK'``.

    Any exception while crawling a category is printed and the loop moves on
    to the next queue entry; the category then keeps its ``b'crawling'``
    state, so the producer thread will enqueue it again.
    """
    while True:
        try:
            cate = cate_queue.get()
            cate_key = cate.encode('utf-8')

            if _category_done(db_cate, cate_key):
                # The original message formatted an undefined name `title`,
                # which raised NameError instead of printing this notice.
                print('cate-{}: already exists ... Ignore'.format(cate))
                continue

            db_cate.Put(cate_key, b'crawling')
            for item_page in itertools.count(1):
                url = URL_BASE.format(quote_plus(cate), item_page)
                items_obj = _fetch_json(url)
                items = items_obj['listItem']
                if not items:
                    # An empty page marks the end of the result list.
                    break

                for item in items:
                    item_obj = dict(
                        _id=int(item['itemNumId']),
                        name=item['name'],
                        price=float(item['price']),
                        query=cate,
                        category=int(item['category']) if item['category'] != '' else 0,
                        nick=item['nick'],
                        area=item['area'],
                    )
                    db_item.Put(
                        str(item_obj['_id']).encode('utf-8'),
                        json.dumps(item_obj, ensure_ascii=False).encode('utf-8'),
                    )
                print('Get {} items from {}: {}'.format(len(items), cate, item_page))

                if 'nav' in items_obj:
                    for na in items_obj['nav']['navCatList']:
                        _put_if_absent(db_cate, na['name'].encode('utf-8'), b'waiting')

            db_cate.Put(cate_key, b'OK')
            print(cate, 'OK')
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('An {} exception occured'.format(e))


def cate_thread(cate_queue, db_cate):
    """Producer loop: enqueue every category whose state is not ``b'OK'``.

    The whole category store is scanned, then the thread sleeps 10 seconds
    and scans again.  ``cate_queue.put`` blocks while the queue is full.

    NOTE(review): a category that is still ``b'crawling'`` or still sitting
    in the queue is enqueued again on every scan, so workers may crawl the
    same category concurrently; confirm whether that is intended.
    """
    while True:
        try:
            for key, value in db_cate.RangeIter():
                if value != b'OK':
                    name = key.decode('utf-8')
                    print('CateThread: put {} into queue'.format(name))
                    cate_queue.put(name)
            time.sleep(10)
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('CateThread: {}'.format(e))


def main():
    """Open both databases, seed the first category and start all threads."""
    db_cate = leveldb.LevelDB('./taobao-cate')
    db_item = leveldb.LevelDB('./taobao-item')

    # Seed keyword; only written when the store does not contain it yet.
    orig_cate = '正装'
    _put_if_absent(db_cate, orig_cate.encode('utf-8'), b'waiting')

    cate_queue = Queue(maxsize=1000)
    cate_th = Thread(target=cate_thread, args=(cate_queue, db_cate))
    cate_th.start()

    item_th = [
        Thread(target=item_thread, args=(cate_queue, db_cate, db_item))
        for _ in range(5)
    ]
    for item_t in item_th:
        item_t.start()

    cate_th.join()


if __name__ == '__main__':
    main()

希望本文所述对大家的Python程序设计有所帮助。