saowu's Blog

用python写一个新冠疫情爬虫

用python写一个新冠疫情爬虫
2020-04-10 · 5 min read
爬虫 Python

每一颗渺小的种子都能创造伟大的力量!

完整代码:github

一、准备好api

#全国省市区信息api
https://ncov.html5.qq.com/api/getPosition
#全国疫情api
https://ncov.html5.qq.com/api/getCommunity?province=省&city=市&district=区

二、所需python包

bs4==0.0.1
requests==2.23.0
pandas==1.0.2

三、获取全国省市区信息

全国省市区信息json数据格式

{
    "position":{
        "云南省":{
            "玉溪市":{
                "全部":"",
                "红塔区":""
            },
            "德宏傣族景颇族自治州":{
                "全部":"",
                "瑞丽市":""
            }
        }
    }
}

获取并解析json

# Send a browser-like User-Agent header so the request is disguised as coming
# from a regular browser.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/80.0.3987.100 Safari/537.36 "
)
headers = {
    'User-Agent': user_agent,
}

def get_city_info():
    """
    Fetch the nationwide province / city / district list.

    Requests the ``getPosition`` endpoint and flattens its nested
    ``{"position": {province: {city: {district: ...}}}}`` payload into rows.

    Returns:
        DataFrame: one row per (province, city, district) combination, with
        the columns ``province``, ``city`` and ``district``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    url_position = 'https://ncov.html5.qq.com/api/getPosition'
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    r_position = requests.get(url_position, headers=headers, timeout=10)
    r_position.raise_for_status()
    # The endpoint returns JSON, so decode it directly. Feeding it through an
    # HTML parser first can alter the payload ('<' is treated as markup and
    # '&' is escaped when the document is serialized again).
    positions = r_position.json()['position']
    data = []
    for province, cities in positions.items():
        for city, districts in cities.items():
            for district in districts:
                data.append({'province': province, 'city': city, 'district': district})
    # Explicit columns keep the frame's shape stable even when no rows came back.
    return pd.DataFrame(data, columns=['province', 'city', 'district'])

四、获取疫情信息

def get_info(province, city, district):
    """
    Fetch the epidemic information for one area.

    Args:
        province: province name
        city: city name
        district: county (district) name

    Returns:
        The decoded JSON response of the ``getCommunity`` endpoint.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    url_community = 'https://ncov.html5.qq.com/api/getCommunity'
    # Let requests build the query string so that the values are URL-encoded;
    # str() keeps the previous behaviour of sending non-string values as text.
    params = {
        'province': str(province),
        'city': str(city),
        'district': str(district),
    }
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    r_community = requests.get(url_community, params=params, headers=headers, timeout=10)
    r_community.raise_for_status()
    # The endpoint returns JSON, so decode it directly. Feeding it through an
    # HTML parser first can alter the payload ('<' is treated as markup and
    # '&' is escaped when the document is serialized again).
    return r_community.json()

五、格式化疫情信息

通过api获取到的 json数据格式

{
    "code":0,
    "community":{
        "云南省":{
            "玉溪市":{
                "红塔区":[
                    {
                        "province":"云南省",
                        "city":"玉溪市",
                        "district":"红塔区",
                        "county":"",
                        "street":"北城街道",
                        "community":"大石板社区秧草塘村",
                        "show_address":"大石板社区秧草塘村",
                        "cnt_inc_uncertain":"-1",
                        "cnt_inc_certain":"-1",
                        "cnt_inc_die":"-1",
                        "cnt_inc_recure":"-1",
                        "cnt_sum_uncertain":"-1",
                        "cnt_sum_certain":"2",
                        "cnt_sum_die":"-1",
                        "cnt_sum_recure":"-1",
                        "full_address":"云南省玉溪市红塔区北城街道大石板社区秧草塘村",
                        "release_date":"",
                        "article_source":[
                            {
                                "title":"玉溪新增4例确诊病例 详细路线公布!",
                                "url":"https://mp.weixin.qq.com/s/THaIpahx_5VTWP2jcMMRMw"
                            },
                            {
                                "title":"玉溪市新增确诊病例1例!红塔大道50号华瑞小区实施隔离封闭管理",
                                "url":"https://mp.weixin.qq.com/s/AjB4YCWmfSiJHzBJNhzHpg"
                            }
                        ],
                        "id":"60bd8bd1be4e00780a29336dab42f18e",
                        "lng":"102.51207",
                        "lat":"24.46442",
                        "doc_id":"90000052_60bd8bd1be4e00780a29336dab42f18e",
                        "source":[
                            {
                                "name":"玉溪发布",
                                "url":""
                            }
                        ],
                        "communitytype":1,
                        "distance":-1
                    }
                ]
            }
        }
    }
}

json数据格式化

通过解析json数据,将其转化为dict数据集,以便后续再以其他格式输出处理(此处的实现过于野蛮)

def format_data(info, result):
    """
    Flatten the epidemic information into row dicts.

    Walks ``info['community'][province][city][district]`` (a list of entries)
    and appends rows to ``result``. An entry with articles in
    ``article_source`` produces one row per article; an entry without any
    produces a single row with empty ``title`` and ``url``.

    Args:
        info: decoded response of the ``getCommunity`` endpoint
        result: list that the row dicts are appended to (modified in place)

    Returns:
        None
    """
    address_keys = ('province', 'city', 'district', 'street', 'community',
                    'full_address', 'lat', 'lng')
    # Placeholder used when an entry has no linked article.
    no_article = [{'title': '', 'url': ''}]
    for cities in info['community'].values():
        for districts in cities.values():
            for entries in districts.values():
                for entry in entries:
                    base = {key: entry[key] for key in address_keys}
                    # A missing or null 'article_source' is treated like an empty list.
                    articles = entry.get('article_source') or no_article
                    # Append inside the loop so that every article yields a row;
                    # appending once after the loop would keep only the last article.
                    for article in articles:
                        result.append(dict(base, title=article['title'], url=article['url']))

六、启动并检验成果


Copyright © 2020 - 2024 saowu. All Rights Reserved
Powered by Gridea