  __author__ = 'Administrator'
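# Multi-threaded scraper for the BTBU library OPAC (libopac.btbu.edu.cn).
# A single producer thread (getlinks) walks the paginated category listing
# and puts (offset, links) tasks onto a Queue; ten worker threads pop tasks,
# download each book's detail page plus its holdings JSON, and append the
# results to library3.txt. Written for Python 2 (print statements, the
# Queue module, str/unicode handling).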
from urllib import urlopen
from bs4 import BeautifulSoup
import socket
import threading
import Queue
import time
url = 'http://libopac.btbu.edu.cn:8080/opac/browseByCategory?pager.offset='
socket.setdefaulttimeout(3)  # give up on slow requests after 3 seconds
maxnum = 634309  # upper bound on the pager offset (size of the catalogue)
print "\n"
  q = Queue.Queue()  
f = open('library3.txt', 'a')
flock = threading.Lock()  # serialises writes: all ten workers share this handle
def getlinks():
    offset = 192850  # hard-coded resume point; start at 0 for a full crawl
    while offset < maxnum:
        try:
            html = urlopen(url + str(offset)).read().decode('utf-8')
            soup = BeautifulSoup(html, 'html.parser')
            # every book link on the listing page opens in a new tab
            links = soup.findAll(target='_blank')
            onetask = (offset, links)
            q.put(onetask)
            offset += 25  # each listing page holds 25 entries
            print "\r" + str(offset) + "/" + str(maxnum) + "    " \
                  + str(offset * 100.0 / maxnum) + "%   page " \
                  + str(offset / 25) + " -- list fetched"
        except IOError:
            pass  # timeout or network error: retry the same offset
        except Exception, e:
            print e
class MyThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue
    def run(self):
        while True:
            myonetask = self.queue.get()
            myoffset, mylinks = myonetask
            print "\r" + str(myoffset) + "/" + str(maxnum) + "    " \
                  + str(myoffset * 100.0 / maxnum) + "%   page " \
                  + str(myoffset / 25) + " -- task received"
            mywritecontent = ""
            for mylink in mylinks:
                always = True
                while always:
                    try:
                        myurl2 = 'http://libopac.btbu.edu.cn:8080' + mylink['href']
                        mywritecontent += myurl2.encode('utf-8', 'ignore') + "\n"
                        mywritecontent += mylink.text.encode('utf-8', 'ignore') + "\n"
                        myhtml2 = urlopen(myurl2).read()
                        mysoup2 = BeautifulSoup(myhtml2, 'html.parser')
                        mywritecontent += mysoup2.find(id='detailsTable').text.encode('utf-8', 'ignore') + "\n"
                        # rewrite the /book/ path to hit the holdings JSON endpoint
                        myjson3 = urlopen(myurl2.replace('book', 'book/getHoldingsInformation')).read()
                        mywritecontent += myjson3 + "\n\n"
                        always = False
                    except IOError:
                        pass  # timeout: retry this link
                    except Exception:
                        always = False  # anything else: skip this link
            print "\r" + str(myoffset) + "/" + str(maxnum) + "    " \
                  + str(myoffset * 100.0 / maxnum) + "%   page " \
                  + str(myoffset / 25) + " -- download finished ---- " + str(myoffset + 25)
            mywritecontent += "\n========== page " + str(myoffset / 25) + " ==========\n\n"
            with flock:  # keep each page's output contiguous in the shared file
                f.write(mywritecontent)
                f.flush()
            time.sleep(0.01)
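# Note on the retry loop in run(): an IOError (typically the 3-second timeout)
# retries the same link indefinitely, while any other exception silently drops
# the link; a bounded retry count would avoid hanging on a permanently dead URL.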
t1 = threading.Thread(target=getlinks)
t1.start()
for asd in range(10):
    t2 = MyThread(q)
    t2.start()
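# The producer exits once offset reaches maxnum, but the workers block forever
# on q.get(), so the script runs until it is killed; a later run can resume
# by raising the hard-coded starting offset in getlinks(), since library3.txt
# is opened in append mode.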
 
 