#!/usr/bin/env python
#-*- coding:utf-8-*-

"""
@author:    wangzhu
@desc:  crawl room listing titles from danke.com (https://www.danke.com/room/hz)
@contact:   isaac.zhu@dbappsecurity.com.cn
@date:  2019/8/7
"""

import requests  #导入请求包
import re  #导入正则包
from random import randint


"""
Site URL: https://www.danke.com/room/hz
"""

# Pool of request headers, each carrying a different browser User-Agent.
# One entry is picked at random per request so that successive requests do
# not all present the same client signature.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    # The closing parenthesis was missing from this entry, producing a
    # malformed User-Agent string.
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]

def CrawlerHouse():
    """Fetch the Danke room-listing page and print each listing's title.

    Downloads https://www.danke.com/room/hz using a User-Agent header picked
    at random from ``hds``, finds every ``<div class="r_lbx_cena">`` fragment
    in the returned HTML and prints the text of the link (``<a>``) inside it,
    one title per line.

    Returns:
        None. Titles are written to standard output.

    Raises:
        requests.RequestException: if the request fails or times out, or if
            the server answers with an HTTP error status.
    """
    url = "https://www.danke.com/room/hz"
    # Without a timeout requests waits indefinitely on a stalled connection.
    res = requests.get(url, headers=hds[randint(0, len(hds) - 1)], timeout=10)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
    res.raise_for_status()
    res.encoding = "utf-8"  # decode the response body as UTF-8

    # Each listing header sits in <div class="r_lbx_cena">...</div>. The lazy
    # match stops at the first closing </div>, which is far enough to include
    # the title link.
    infolist = re.findall(r'<div class="r_lbx_cena">(.*?)</div>', res.text, re.S)
    for one in infolist:
        # Capture the anchor text directly. The previous approach first cut
        # out '<span class="location">N</span>' using a hand-kept counter and
        # then matched from the first '>' in the fragment; whenever the page's
        # numbering differed from the counter, the span markup and the <a>
        # tag itself leaked into the printed title.
        pieces = re.findall(r'<a\b[^>]*>(.*?)</a>', one, re.S)
        title = ''.join(pieces).strip()  # drop the surrounding whitespace
        print(title)

# Run the crawler
CrawlerHouse()

 

    <div class="r_ls_box">
                            
            <div class="r_lbx">
                <a href="javascript:void(0)" class="rimg" key='0' xiaoqu='万科北宸之光'>
                    <span class="img-hint">
                        <span></span>
                        <span></span>
                    </span>
                    <img
                            src="https://public.danke.com.cn/public-20190123-isz_ljR3BG1JKKfa2lXEilpNXgN1NTRV?imageView2/1/w/380/h/285" width="260" height="173"
                            title=""
                            alt="图片"/>

                                    </a>
                <div class="r_lbx_cen">
                    <div class="r_lbx_cena">
                        <span class="location">1</span>
                        <a href="https://www.danke.com/duanzu/1913140756.html" key='0' xiaoqu='万科北宸之光' target="_blank"
                           title="万达广场  万科北宸之光 3室2厅">
                            万达广场  万科北宸之光 3室2厅
                        </a>
                                                    <div class="r_lbx_cena">
                                <div class="sub_img"></div>
                                距5号线大运河站2700米
                            </div>
                                            </div>
                    <div class="r_lbx_cenb">
                        <div class="address_img"></div>
                        建筑面积约12㎡ | 21楼
                        | 3室1卫                          | 朝南
                                                    <i>合</i>
                                            </div>
                    <div class="r_lbx_cenc">
                                                                    </div>
                                    </div>
                <div class="r_lbx_money">
                                            <div class="r_lbx_moneya">
                                                            <span class="ty_b">1890</span> 元/月
                                                    </div>

                                        <a class="lk_more" key='0' xiaoqu='万科北宸之光' href="https://www.danke.com/duanzu/1913140756.html"
                       target="_blank">
                        查看详情
                    </a>
                </div>
            </div>

<div class="r_ls_box">
<div class="r_ls_box">
内容来源于网络如有侵权请私信删除

文章来源: 博客园

原文链接: https://www.cnblogs.com/gufengchen/p/12420798.html

你还没有登录,请先登录注册
  • 还没有人评论,欢迎说说您的想法!