Python 爬虫:爬取药监总局(NMPA)化妆品生产许可证相关数据
"""Crawler for NMPA (药监总局) cosmetics production-license data.

Walks the paginated list endpoint to collect company IDs, fetches each
company's detail record, and persists everything to ./allData.json.
"""
import json

import requests

# List endpoint: returns one page of companies (IDs only).
LIST_URL = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
# Detail endpoint: returns the full record for a single company ID.
DETAIL_URL = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
# UA spoofing so the server treats the crawler as a regular browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def _fetch_ids(page):
    """Return the list of company IDs on one page of the search results.

    The endpoint answers JSON shaped like {'list': [{'ID': ...}, ...], ...}.
    """
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    # timeout so a hung server cannot stall the crawl forever.
    response = requests.post(url=LIST_URL, data=data, headers=HEADERS, timeout=10)
    return [item['ID'] for item in response.json()['list']]


def _fetch_detail(company_id):
    """Return the detail record (a dict) for one company ID."""
    response = requests.post(url=DETAIL_URL, data={'id': company_id},
                             headers=HEADERS, timeout=10)
    return response.json()


if __name__ == '__main__':
    # Accumulates every company's detail record across all pages.
    all_data_list = []
    # Crawl the first 6 pages.  NOTE: the original used range(1, 6), which
    # only covers pages 1-5 despite its "first 6 pages" comment — off-by-one
    # fixed here.
    for page in range(1, 7):
        for company_id in _fetch_ids(page):
            all_data_list.append(_fetch_detail(company_id))
    # Persist; `with` guarantees the handle is closed (the original leaked
    # it), and ensure_ascii=False keeps the Chinese text human-readable.
    with open('./allData.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('end')
DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。
更多推荐



所有评论(0)