python爬取本站电子书信息并入库的实现代码
更新时间:2020年4月22日 23:19 点击:1324
入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库
数据库工具类:DBUtil.py
import pymysql


class DBUtils(object):
    """Thin helper around a pymysql connection for the book crawler.

    Every method takes/returns the raw (connection, cursor) pair so callers
    control the connection lifetime; call connClose() when done.
    """

    def connDB(self):
        """Open a MySQL connection; return (connection, cursor)."""
        conn = pymysql.connect(host='192.168.251.114', port=3306,
                               user='root', passwd='b6f3g2',
                               db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        """Execute an INSERT/UPDATE statement, commit, return affected rows.

        NOTE(review): `sql` arrives as a fully interpolated string from the
        callers, which is open to SQL injection — prefer passing parameters
        to cur.execute(sql, params) instead.
        """
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):
        """Delete rows by a space-separated ID string (demo helper; unused).

        Returns the total number of deleted rows.
        """
        sta = 0
        for each_id in IDs.split(' '):
            # Parameterized query instead of "%d" % int(...) interpolation,
            # which was vulnerable to SQL injection.
            sta += cur.execute("delete from students where Id=%s",
                               (int(each_id),))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):
        """Run a SELECT; return (row count, cursor) so the caller can fetch."""
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        """Close cursor then connection, releasing resources."""
        cur.close()
        conn.close()


if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    # Close the smoke-test connection instead of leaking it.
    dbUtil.connClose(conn, cur)
书籍操作文件 bookOpe.py
from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

logging.basicConfig(level=logging.INFO)


class BookOperator(object):
    """Persists Book objects and their download links to MySQL."""

    def __addBook(self, book):
        """Insert one row into `book`.

        Values are passed as parameters, not interpolated: book fields come
        from scraped web pages (untrusted input) and the original
        string-built INSERT was vulnerable to SQL injection.
        """
        logging.info("add book:%s", book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        cur.execute(
            "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s);",
            (book.bookName, book.downLoadUrl, book.mainInfo))
        conn.commit()
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        """Return the id of the most recently inserted book row.

        NOTE(review): racy if several crawlers insert concurrently —
        cur.lastrowid on the insert cursor would be safer; kept as a
        separate query for compatibility with the existing flow.
        """
        logging.info("selectLastBookId ")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        """Insert every download link for `bookId` over one connection."""
        logging.info("add bookId:%s", bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for info in downLoadInfos:
            # Parameterized for the same injection reason as __addBook.
            cur.execute(
                "insert into book_down_url (bookId,downName,downUrl) "
                "values (%s,%s,%s);",
                (bookId, info.downName, info.downUrl))
        conn.commit()
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        """Public entry point: store the book row, then its download links."""
        logging.info("add bookInfo:%s", book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)


if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
    bookope.addBookInfo(book)
书籍信息文件 bookInfo.py
import sys  # NOTE(review): no longer used; kept because other modules may rely on the import side effect.

# BUG FIX: the original `sys.encoding = "utf8"` was removed — `sys` has no
# `encoding` attribute, so the assignment was a silent no-op that did not
# affect I/O encoding in any way.


class Book(object):
    """A scraped book: description, detail-page URL, title, download links."""

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo        # long description text from the detail page
        self.downLoadUrl = downLoadUrl  # URL of the book's detail page
        self.bookName = bookName        # book title
        self.downLoadInfos = []         # list of DownLoadInfo mirrors

    def addDownLoadUrl(self, downloadInfo):
        """Append one DownLoadInfo mirror to this book."""
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        """Print the book title (debug helper)."""
        print("bookName :%s" % (self.bookName))


class DownLoadInfo(object):
    """A single download mirror: URL plus display name."""

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl    # download link
        self.downName = downName  # human-readable mirror name

    def print_down_info(self):
        """Print URL and name (debug helper)."""
        print("downLoad %s - %s" % (self.downUrl, self.downName))
51job界面解析文件 FiveOneJobFetch.py
import requests
from bs4 import BeautifulSoup
import sys  # NOTE(review): unused; kept to avoid breaking anything relying on the import.
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging


class PageFetch(object):
    """Fetches and parses jb51.net book-category listing and detail pages."""

    # BUG FIX: the original host "//www.jb51.net/" is scheme-relative, which
    # requests rejects with MissingSchema — an explicit scheme is required.
    host = "https://www.jb51.net/"
    # category path under the site root
    category = "books/"

    def __init__(self, pageUrl):
        # pageUrl is the category entry page, e.g. "list152_1.html"
        self.pageUrl = pageUrl
        # full URL of the entry page
        self.url = PageFetch.host + PageFetch.category + pageUrl

    def __getPageContent(self):
        """Fetch this instance's entry page (delegates to the static helper)."""
        return PageFetch.getPageContent(self.url)

    @staticmethod
    def getPageContent(url):
        """Fetch `url` decoded as gb2312; return '' on any non-200 status.

        Decorated @staticmethod: every call site uses PageFetch.getPageContent(url)
        with no instance, which only worked by accident before.
        """
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"  # site serves a legacy Chinese encoding
            return req.text
        return ""

    def __getMaxPageNumAndUrl(self):
        """Follow the pager to the last page; return (page count, last link).

        Pagination links look like list45_2.html where 2 is the page number.
        """
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                print("数据")
                print(ul)
                # <strong> holds the total page count
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    # Last pager entry is "#": we're on the final page, whose
                    # own link sits second in the list.
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        """Build the file name for page `pageNum + 1`, e.g. list45_2.html."""
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return (self.pageUrl[:lineBeginSite] + str(pageNum + 1)
                + self.pageUrl[docBeginSite:])

    def getBookPageList(self):
        """Return the full URL of every listing page in this category."""
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        return [self.host + self.category + self.__formatPage(i)
                for i in range(int(maxPageNum))]

    @staticmethod
    def getDownloadPage(url):
        """Collect the book detail-page URLs linked from one listing page."""
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        return [PageFetch.host + a['href']
                for a in soup.select(".cur-cat-list .btn-dl")]

    @staticmethod
    def getBookInfo(url):
        """Parse one detail page into a Book with its download mirrors."""
        logging.info("获取书籍信息url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # Strip quotes since downstream code builds SQL by interpolation.
        mainInfo = (soup.select("#soft-intro"))[0].text.replace("截图:", "").replace("'", "")
        title = (soup.select("dl dt h1"))[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                anchor = li.select("a")[0]
                book.addDownLoadUrl(DownLoadInfo(anchor['href'], anchor.text))
        return book


if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downPage = downPage + PageFetch.getDownloadPage(page)
    print("================汇总如下===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)
执行文件,以上文件copy在相同的文件夹下 执行此文件即可 51Job.py
from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator


def main(url):
    """Crawl one category entry page and store every book it lists."""
    fetcher = PageFetch(url)
    operator = BookOperator()
    # First gather every detail-page URL across all listing pages...
    detail_pages = []
    for listing_page in fetcher.getBookPageList():
        detail_pages.extend(PageFetch.getDownloadPage(listing_page))
    # ...then parse and persist each book.
    for detail_page in detail_pages:
        operator.addBookInfo(PageFetch.getBookInfo(detail_page))
    print("数据抓取成功:" + url)


if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html",
            "list977_2.html", "list572_5.html", "list509_2.html",
            "list481_1.html", "list576_1.html", "list482_1.html",
            "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)
数据库表:书籍信息表和下载地址表
-- Book master table: one row per scraped book.
-- NOTE(review): table collation is utf8mb4 while the crawler connects with
-- charset='utf8' — confirm the connection charset matches.
CREATE TABLE `book` (
    `id`       INT(11)      NOT NULL AUTO_INCREMENT,
    `bookName` VARCHAR(200) NULL DEFAULT NULL,  -- book title
    `bookUrl`  VARCHAR(500) NULL DEFAULT NULL,  -- detail-page URL
    `bookInfo` TEXT         NULL,               -- long description
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
-- Download mirrors: many rows per book, joined on bookId (no FK declared).
CREATE TABLE `book_down_url` (
    `id`       INT(11)       NOT NULL AUTO_INCREMENT,
    `bookId`   INT(11)       NOT NULL DEFAULT '0',  -- references book.id
    `downName` VARCHAR(200)  NOT NULL DEFAULT '0',  -- mirror display name
    `downUrl`  VARCHAR(2000) NOT NULL DEFAULT '0',  -- mirror URL
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
git地址:https://git.oschina.net/yangsj/BookFetch/tree/master
相关文章
- 这篇文章主要介绍了python-opencv-画外接矩形框的实例代码,代码简单易懂,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考下...2021-09-04
Python astype(np.float)函数使用方法解析
这篇文章主要介绍了Python astype(np.float)函数使用方法解析,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下...2020-06-08
- 2022虎年新年即将来临,小编为大家带来了一个利用Python编写的虎年烟花特效,堪称全网最绚烂,文中的示例代码简洁易懂,感兴趣的同学可以动手试一试...2022-02-14
- 在本篇文章里小编给大家分享的是一篇关于python中numpy.empty()函数实例讲解内容,对此有兴趣的朋友们可以学习下。...2021-02-06
python-for x in range的用法(注意要点、细节)
这篇文章主要介绍了python-for x in range的用法,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧...2021-05-10
- 这篇文章主要介绍了Python 图片转数组,二进制互转操作,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧...2021-03-09
- 这篇文章主要介绍了Python中的imread()函数用法说明,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧...2021-03-16
- 这篇文章主要介绍了python如何实现b站直播自动发送弹幕,帮助大家更好的理解和学习使用python,感兴趣的朋友可以了解下...2021-02-20
python Matplotlib基础--如何添加文本和标注
这篇文章主要介绍了python Matplotlib基础--如何添加文本和标注,帮助大家更好的利用Matplotlib绘制图表,感兴趣的朋友可以了解下...2021-01-26
- 这篇文章主要介绍了解决python 使用openpyxl读写大文件的坑,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧...2021-03-13
- 今天小编就为大家分享一篇python 计算方位角实例(根据两点的坐标计算),具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧...2020-04-27
- 这篇文章主要为大家详细介绍了python实现双色球随机选号,文中示例代码介绍的非常详细,具有一定的参考价值,感兴趣的小伙伴们可以参考一下...2020-05-02
- 在本篇文章里小编给大家整理的是一篇关于python中使用np.delete()的实例方法,对此有兴趣的朋友们可以学习参考下。...2021-02-01
- 这篇文章主要介绍了使用Python的pencolor函数实现渐变色功能,本文通过实例代码给大家介绍的非常详细,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考下...2021-03-09
- 这篇文章主要介绍了python自动化办公操作PPT的实现,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友们下面随着小编来一起学习学习吧...2021-02-05
Python getsizeof()和getsize()区分详解
这篇文章主要介绍了Python getsizeof()和getsize()区分详解,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友们下面随着小编来一起学习学习吧...2020-11-20
- 这篇文章主要为大家详细介绍了python实现学生通讯录管理系统,文中示例代码介绍的非常详细,具有一定的参考价值,感兴趣的小伙伴们可以参考一下...2021-02-25
- 这篇文章主要介绍了PyTorch一小时掌握之迁移学习篇,本文给大家介绍的非常详细,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考下...2021-09-08
- 这篇文章主要介绍了解决python 两个时间戳相减出现结果错误的问题,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧...2021-03-12
- 这篇文章主要介绍了Python绘制的爱心树与表白代码,本文通过实例代码给大家介绍的非常详细,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考下...2021-04-06