许多用户在查找“python实现爬取千万淘宝商品的方法”的相关资料时费了不少周折。这里教程网为您整理了相关资料,仅供查阅。以下为您介绍python实现爬取千万淘宝商品的方法的详细内容。
本文实例讲述了python实现爬取千万淘宝商品的方法。分享给大家供大家参考。具体实现方法如下:
import
time
import
leveldb
from
urllib.parse
import
quote_plus
import
re
import
json
import
itertools
import
sys
import
requests
from
queue
import
Queue
from
threading
import
Thread
# Search endpoint template: the first placeholder receives the URL-quoted
# query string, the second the 1-based result page number.
URL_BASE = 'http://s.m.taobao.com/search?q={}&n=200&m=api4h5&style=list&page={}'
def url_get(url):
    """Fetch *url* with a fixed set of browser-like headers.

    The request goes through ``requests.get`` with a 5 second timeout and
    the decoded response text is returned. Errors raised by ``requests``
    are not handled here and propagate to the caller.
    """
    browser_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'DNT': '1',
        'User-Agent': 'Mozilla/12.0 (compatible; MSIE 8.0; Windows NT)',
    }
    response = requests.get(url, headers=browser_headers, timeout=5)
    return response.text
def _category_done(db_cate, cate):
    """Return True when *cate* is stored in *db_cate* with state ``b'OK'``."""
    try:
        return db_cate.Get(cate.encode('utf-8')) == b'OK'
    except KeyError:
        # Key not present: this category has never been recorded.
        return False


def _fetch_page(url, attempts=5):
    """Download *url* via ``url_get`` and parse it as JSON.

    Any ``Exception`` triggers a retry; the error from the last attempt is
    re-raised. ``KeyboardInterrupt`` is not an ``Exception`` subclass, so it
    propagates immediately.
    """
    for attempt in range(attempts):
        try:
            return json.loads(url_get(url))
        except Exception:
            if attempt == attempts - 1:
                raise


def _build_item(item, cate):
    """Convert one raw ``listItem`` entry into the record that gets stored."""
    raw_category = item['category']
    return dict(
        _id=int(item['itemNumId']),
        name=item['name'],
        price=float(item['price']),
        query=cate,
        # The API may send an empty string instead of a category id.
        category=int(raw_category) if raw_category != '' else 0,
        nick=item['nick'],
        area=item['area'],
    )


def item_thread(cate_queue, db_cate, db_item):
    """Worker loop: crawl every result page of each queued category.

    Args:
        cate_queue: queue whose ``get()`` yields category names (``str``).
        db_cate: key/value store with ``Get``/``Put`` mapping the UTF-8
            encoded category name to a state (``b'waiting'``,
            ``b'crawling'`` or ``b'OK'``).
        db_item: key/value store with ``Put``; receives one JSON document
            per item, keyed by the item's numeric id.

    The loop ends only on ``KeyboardInterrupt``. Any other exception is
    printed and the worker moves on to the next queued category, leaving
    the failed category in whatever state it had reached.
    """
    while True:
        try:
            cate = cate_queue.get()
            if _category_done(db_cate, cate):
                # The original message formatted an undefined name `title`,
                # which raised NameError here on every finished category.
                print('cate-{} already exists ... Ignore'.format(cate))
                continue

            cate_key = cate.encode('utf-8')
            # NOTE: only b'OK' counts as done, so a category that is
            # already b'crawling' is crawled again if it is dequeued twice.
            db_cate.Put(cate_key, b'crawling')

            for item_page in itertools.count(1):
                url = URL_BASE.format(quote_plus(cate), item_page)
                items_obj = _fetch_page(url)
                items = items_obj['listItem']
                if not items:
                    # An empty page marks the end of the result list.
                    break

                for item in items:
                    item_obj = _build_item(item, cate)
                    db_item.Put(
                        str(item_obj['_id']).encode('utf-8'),
                        json.dumps(item_obj, ensure_ascii=False).encode('utf-8'),
                    )
                print('Get {} items from {}: {}'.format(len(items), cate, item_page))

                if 'nav' in items_obj:
                    # Register related categories that are not known yet.
                    for na in items_obj['nav']['navCatList']:
                        nav_key = na['name'].encode('utf-8')
                        try:
                            db_cate.Get(nav_key)
                        except KeyError:
                            db_cate.Put(nav_key, b'waiting')

            db_cate.Put(cate_key, b'OK')
            print(cate, 'OK')
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('An {} exception occured'.format(e))
def cate_thread(cate_queue, db_cate):
    """Producer loop: feed every unfinished category into *cate_queue*.

    Each pass iterates ``db_cate.RangeIter()`` and enqueues the UTF-8
    decoded key of every entry whose value is not ``b'OK'``, then sleeps
    10 seconds before the next pass. ``KeyboardInterrupt`` ends the loop;
    any other exception is printed and the loop starts a new pass.
    """
    running = True
    while running:
        try:
            for raw_key, state in db_cate.RangeIter():
                if state == b'OK':
                    continue  # finished categories are never re-queued
                cate_name = raw_key.decode('utf-8')
                print('CateThread: put {} into queue'.format(cate_name))
                cate_queue.put(cate_name)
            time.sleep(10)
        except KeyboardInterrupt:
            running = False
        except Exception as err:
            print('CateThread: {}'.format(err))
if __name__ == '__main__':
    # Script entry point: open both stores, seed the first category, then
    # start one producer thread and five crawler threads.
    db_cate = leveldb.LevelDB('./taobao-cate')
    db_item = leveldb.LevelDB('./taobao-item')

    # Seed query used to bootstrap the crawl on an empty category store.
    orig_cate = '正装'
    seed_key = orig_cate.encode('utf-8')
    try:
        db_cate.Get(seed_key)
    except KeyError:
        # Only a missing key means "not seeded yet". The original bare
        # `except:` also reset the state to b'waiting' on any other error.
        db_cate.Put(seed_key, b'waiting')

    cate_queue = Queue(maxsize=1000)

    cate_th = Thread(target=cate_thread, args=(cate_queue, db_cate))
    cate_th.start()

    item_th = [
        Thread(target=item_thread, args=(cate_queue, db_cate, db_item))
        for _ in range(5)
    ]
    for item_t in item_th:
        item_t.start()

    # NOTE(review): cate_thread only returns on KeyboardInterrupt, so this
    # join is expected to block for the lifetime of the process.
    cate_th.join()
希望本文所述对大家的Python程序设计有所帮助。