技术资料
MySQL
Windows下安装MySQL 5.6
Python
Python3.5.2 安装(windows环境)
图片爬取和写入
gevent队列任务
selenium模拟浏览器操作
pandas表格和数据应用
OS文件创建
excel格式转换:csv转xls
email自动发送
excel读取指定多行数据
cookie登录后爬取内容
单页文字图片爬取保存到word
学习实践:知网疾病知识
学习实践:知网指南
字典生成树形目录
docx文本图片存入word
-
+
首页
单页文字图片爬取保存到word
```python import requests,csv,docx,os,time,random from bs4 import BeautifulSoup from docx.shared import Inches # 0703完美测试版,记得更新cookie # 定义函数,判断段落代码里是否有img,若有,则要单独提取图片 def img_ok(x): try: img = x.find('img')['src'] if len(img)>1: return 1 else: return 0 except TypeError: return 0 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win32; x32) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/535.1', 'Cookie': '__root_domain_v=.ipmph.com; _qddaz=QD.ky39uq.kayj8f.k9m46lu7; UM_distinctid=171c9f3608f367-0db723f0697f73-7373667-e1000-171c9f36090528; Hm_lvt_ec31b23a3a54fb0e85df69fc93bd5de9=1595478459,1595497069,1595562737,1595819138; jeesite.session.id=5bb73af84fdb41888c956f6d0daf9df0; CNZZDATA1268665702=515724504-1588225736-%7C1595897449; Hm_lpvt_ec31b23a3a54fb0e85df69fc93bd5de9=1595901255',} session = requests.Session() logurl = 'http://sso.ipmph.com//oauth/authorize?client_id=8fe837c9b8b94cf0b4a87287ea4c7b7a&response_type=code&scope=user_info&redirect_uri=http://ccdas.ipmph.com/sso/login?rw_lczsPath=http%3A%2F%2Fccdas.ipmph.com%2F' response = session.get(logurl, headers=headers) #url='http://ccdas.ipmph.com/rwDisease/getRwDiseaseDetail?diseaseId=24472' with open(r'mulu.csv','r') as f: xls = csv.reader(f) for i, row in enumerate(xls): if i in range(0,1000): url = row[0] time.sleep(random.randrange(4,81)/3) print(url) # 以下无误的爬取单个页面内容 msg = session.get(url, headers=headers) sop = BeautifulSoup(msg.text, 'html.parser') try: name = sop.find('div',class_='gu_det_left_top').text.replace(' ','').replace('\n','').replace('\t','').replace('\r','').replace('/','、').replace('?','?').replace('*','') except AttributeError: name = sop.find('div',class_='gu_det_left_top') div = sop.find('div',class_='gu_det_left_con') # 记得先打开docx doc = docx.Document() doc.add_paragraph(name+'\n'+url) vvv = sop.find_all('div',class_='gu_det_left_con_h1') kkk = sop.find_all('div',class_='gu_det_left_con_main') ls1 = [] # 提取段落标题,封装到列表 for v in vvv: title = v.text.replace(' 
','').replace('\n','').replace('\t','').replace('\r','').replace(':','') ls1.append(title) # 提取段落文字,n为段落的顺序 n = -1 for k in kkk: n += 1 if img_ok(k) == 0: print('【'+ls1[n]+'】'+'\n'+k.text) doc.add_paragraph('【'+ls1[n]+'】'+k.text) elif img_ok(k) == 1: print('【'+ls1[n]+'】'+'\n') doc.add_paragraph('【'+ls1[n]+'】'+'\n') for j in k: if img_ok(j) == 0: try: # print(j.text) doc.add_paragraph(j.text) except AttributeError: doc.add_paragraph(j) elif img_ok(j) == 1: img_ul = 'http://ccdas.ipmph.com/'+j.find('img')['src'] pic = requests.get(img_ul) with open('img_tmp.png', 'wb') as f: f.write(pic.content) print(img_ul) doc.add_picture('img_tmp.png', width=Inches(6)) os.remove('img_tmp.png') #若无需保存原图片,一般删除 # 切记保存word,否则不成功 doc.save('人卫/0728/{}.docx'.format(name)) else: continue ```
大诚
2022年8月3日 10:33
转发文档
收藏文档
上一篇
下一篇
手机扫码
复制链接
手机扫一扫转发分享
复制链接
Markdown文件
PDF文档
PDF文档(打印)
分享
链接
类型
密码
更新密码