1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
|
__author__ = 'Administrator'
from urllib import * from bs4 import BeautifulSoup import socket import threading import Queue import time
# Listing endpoint of the catalogue; the numeric page offset is appended to it.
url = 'http://libopac.btbu.edu.cn:8080/opac/browseByCategory?pager.offset='

# Default timeout, in seconds, for sockets created after this call.
socket.setdefaulttimeout(3)

# The listing crawl stops once the offset reaches this value.
maxnum = 634309

print("\n")

# Queue of (offset, links) tasks shared by the producer and the workers.
q = Queue.Queue()

# Output file, opened in append mode and written to by the worker threads.
f = open('library3.txt', 'a')
def getlinks():
    """Producer: walk the listing pages and queue the links found on each.

    Starting at offset 192850 and advancing by 25 while the offset is below
    ``maxnum``, the listing page at ``url + str(offset)`` is downloaded and
    decoded as UTF-8, every element whose ``target`` attribute is ``_blank``
    is collected, and the tuple ``(offset, links)`` is put on ``q``.  A
    progress line is printed after the offset has been advanced.

    An ``IOError`` is ignored and the same offset is requested again.  Any
    other exception is printed, and the same offset is also requested again
    because the offset only advances after a successful ``q.put``.
    """
    position = 192850
    while position < maxnum:
        try:
            page = urlopen(url + str(position)).read().decode('utf-8')
            anchors = BeautifulSoup(page).findAll(target='_blank')
            q.put((position, anchors))
            position += 25
            # Progress is reported with the already-advanced offset.
            percent = position * 100.0 / maxnum
            page_number = position // 25
            print("".join([
                "\r", str(position), "/", str(maxnum), " ",
                str(percent), "% 第", str(page_number), "页----获取列表",
            ]))
        except IOError:
            # Retry the same offset straight away.
            continue
        except Exception as error:
            print(error)
class MyThread(threading.Thread):
    """Worker thread that downloads the details of queued catalogue links.

    Every task taken from the queue is an ``(offset, links)`` tuple.  For
    each link the worker collects the book URL, the link text, the text of
    the ``detailsTable`` element of the book page and the raw holdings
    response, then appends the whole page's text to the module-level file
    ``f``.
    """

    def __init__(self, queue):
        """Store the task queue this worker reads from.

        :param queue: object whose ``get()`` returns ``(offset, links)``.
        """
        threading.Thread.__init__(self)
        self.queue = queue

    @staticmethod
    def _progress(offset, total, status):
        """Return the progress line for *offset* out of *total*.

        The line starts with a carriage return and ends with *status*.
        """
        return ("\r" + str(offset) + "/" + str(total) + " "
                + str(offset * 100.0 / total) + "% 第"
                + str(offset // 25) + "页----" + status)

    def _fetch_entry(self, link):
        """Return the text to be written for a single book link.

        The pieces are collected in a list that is recreated for every
        attempt, so an attempt interrupted by ``IOError`` leaves nothing
        behind and the retry does not duplicate the URL/title lines.

        :param link: element supporting ``link['href']`` and ``link.text``.
        :returns: the complete entry on success, or whatever was gathered
            before a non-``IOError`` exception stopped the attempt.
        """
        while True:
            pieces = []
            try:
                book_url = 'http://libopac.btbu.edu.cn:8080' + link['href']
                pieces.append(book_url.encode('utf-8', 'ignore') + "\n")
                pieces.append(link.text.encode('utf-8', 'ignore') + "\n")
                detail_soup = BeautifulSoup(urlopen(book_url).read())
                details = detail_soup.find(id='detailsTable').text
                pieces.append(details.encode('utf-8', 'ignore') + "\n")
                holdings_url = book_url.replace(
                    'book', 'book/getHoldingsInformation')
                pieces.append(urlopen(holdings_url).read() + "\n\n")
            except IOError:
                # I/O failure: discard this attempt and try again.
                continue
            except Exception as error:
                # Best effort: keep what was gathered and move on, but say
                # so instead of dropping the error silently.
                print("item skipped: " + repr(error))
            return "".join(pieces)

    def run(self):
        """Process tasks from the queue in an endless loop.

        Nothing in this method ends the loop; ``queue.get()`` is called
        again after every task.
        """
        while True:
            offset, links = self.queue.get()
            print(self._progress(offset, maxnum, "收到任务"))
            chunks = [self._fetch_entry(link) for link in links]
            print(self._progress(
                offset, maxnum, "下载完成----" + str(offset + 25)))
            # Page separator that closes this task's section of the file.
            chunks.append(
                "\n==========第" + str(offset // 25) + "页==========\n\n")
            # The whole page is written with a single call.
            f.write("".join(chunks))
            f.flush()
            # Brief pause before taking the next task.
            time.sleep(0.01)
# Start the single producer thread, which runs getlinks().
t1 = threading.Thread(target=getlinks)
t1.start()

# Start ten worker threads that all read from the shared queue q.
for asd in range(10):
    t2 = MyThread(q)
    t2.start()
|