You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
156 lines
5.1 KiB
156 lines
5.1 KiB
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
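Information monitoring script for the Shenzhen Housing and Construction Bureau.

Scrapes entries from the notice/announcement and policy/regulation pages.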
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime, timedelta
|
|
import time
|
|
import re
|
|
|
|
# Configuration: the two list pages this script fetches.
NOTICE_URL = "https://zjj.sz.gov.cn/ztfw/zfbz/tzgg2017/index.html"
POLICY_URL = "https://zjj.sz.gov.cn/ztfw/zfbz/zcfg2017/index.html"

# Keyword filter: substrings looked for in entry titles when keyword
# filtering is enabled.
KEYWORDS = [
    "人才房",
    "公租房",
    "保障房",
    "租赁",
]
|
|
|
|
def get_yesterday(now=None):
    """Return the calendar day before ``now`` as a ``YYYY-MM-DD`` string.

    Args:
        now: Optional ``datetime`` (or ``date``) to count back from. When
            omitted, ``datetime.now()`` is used, which matches the behaviour
            of calling this function with no arguments before the parameter
            existed.

    Returns:
        The previous day formatted with ``%Y-%m-%d``.
    """
    # NOTE(review): the default uses the host's naive local clock. If the host
    # does not run on the same timezone as the monitored site, the computed
    # day may differ from the site's "yesterday" -- confirm the deployment.
    reference = datetime.now() if now is None else now
    return (reference - timedelta(days=1)).strftime("%Y-%m-%d")
|
|
|
|
def fetch_page(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or ``None`` when the request fails or the server
        answers with an HTTP error status (4xx/5xx). Failures are reported on
        stdout instead of being raised.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        # Without this check an error page (404/500/...) would be handed to
        # the parser as if it were the real list page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"获取页面失败: {e}")
        return None

    # Force UTF-8 decoding regardless of the charset requests detected.
    response.encoding = 'utf-8'
    return response.text
|
|
|
|
# A "YYYY-M-D" or "YY-M-D" date token that is not part of a longer digit run.
_DATE_TOKEN = re.compile(r"(?<!\d)(\d{4}|\d{2})-(\d{1,2})-(\d{1,2})(?!\d)")

# Base that relative hrefs are resolved against when no base_url is given.
_DEFAULT_LINK_BASE = 'https://zjj.sz.gov.cn/ztfw/zfbz/'


def _normalize_date(text):
    """Return the first date found in ``text`` as ``YYYY-MM-DD``; ``text`` unchanged if none."""
    match = _DATE_TOKEN.search(text)
    if match is None:
        return text
    year, month, day = match.groups()
    if len(year) == 2:
        # Two-digit years (e.g. "26-01-27") are assumed to be 21st century.
        year = '20' + year
    return f"{year}-{int(month):02d}-{int(day):02d}"


def _absolute_link(link, base_url):
    """Resolve ``link`` against ``base_url``; an empty link is returned unchanged."""
    from urllib.parse import urljoin

    if not link:
        return link
    # urljoin leaves absolute URLs alone, resolves root-relative and
    # protocol-relative hrefs, and normalizes "./" and "../" segments.
    return urljoin(base_url, link)


def parse_notice_page(html, target_date, filter_keywords=True, base_url=None):
    """Extract the entries published on ``target_date`` from a list page.

    The page is expected to contain ``<ul class="ftdt-list">`` whose ``<li>``
    items each hold an ``<a>`` (title and href) and a ``<span>`` (date).

    Args:
        html: Page markup; a falsy value yields an empty list.
        target_date: Date to keep, formatted ``YYYY-MM-DD``.
        filter_keywords: When true, keep only entries whose title contains at
            least one of ``KEYWORDS``.
        base_url: Optional URL that relative hrefs are resolved against.
            Defaults to ``https://zjj.sz.gov.cn/ztfw/zfbz/``.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``date`` and
        ``type`` (always ``'通知公告'`` here). Empty when the list element is
        missing or nothing matches.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    list_elem = soup.find('ul', class_='ftdt-list')
    if list_elem is None:
        return []

    # NOTE(review): relative hrefs may really be relative to the page they
    # were scraped from; pass that page's URL as base_url if so -- confirm
    # against the live markup.
    link_base = base_url or _DEFAULT_LINK_BASE

    results = []
    for item in list_elem.find_all('li'):
        try:
            title_elem = item.find('a')
            if title_elem is None:
                continue
            title = title_elem.get_text(strip=True)

            # The date is read from the first <span> inside the item.
            date_elem = item.find('span')
            date_str = ""
            if date_elem is not None:
                date_str = _normalize_date(date_elem.get_text(strip=True))

            if date_str != target_date:
                continue
            if filter_keywords and not any(keyword in title for keyword in KEYWORDS):
                continue

            results.append({
                'title': title,
                'link': _absolute_link(title_elem.get('href', ''), link_base),
                'date': date_str,
                'type': '通知公告',
            })
        except Exception as e:
            # Best effort: one malformed item must not abort the whole page,
            # but the failure is reported instead of being silently dropped.
            print(f"解析条目失败: {e}")
            continue

    return results
|
|
|
|
def parse_policy_page(html, target_date):
    """Parse the policy/regulation page, keeping every entry for the date.

    Delegates to ``parse_notice_page`` with keyword filtering switched off,
    then relabels each returned entry as a policy item.

    Args:
        html: Page markup (may be falsy).
        target_date: Date to keep, formatted ``YYYY-MM-DD``.

    Returns:
        The list produced by ``parse_notice_page`` with each entry's
        ``type`` set to ``'政策法规'``.
    """
    entries = parse_notice_page(html, target_date, filter_keywords=False)
    # Overwrite the type label in place on each returned entry.
    for entry in entries:
        entry.update({'type': '政策法规'})
    return entries
|
|
|
|
def main():
    """Run one monitoring pass and print a report to stdout.

    Fetches the notice page and the policy page, collects the entries dated
    yesterday (notices are keyword-filtered, policies are not), and prints a
    numbered summary.

    Returns:
        The combined list of result dicts, notices first, then policies.
    """
    rule = "=" * 60
    print(rule)
    print("深圳市住房和建设局信息监控")
    print(rule)

    target_date = get_yesterday()
    print(f"\n目标日期: {target_date}")
    print(f"通知公告关键词: {', '.join(KEYWORDS)}")
    print("政策法规: 不过滤,有就整理")

    collected = []

    # Step 1: notices. A failed fetch (falsy html) skips this step.
    print("\n[1/2] 检查通知公告页面...")
    notice_html = fetch_page(NOTICE_URL)
    if notice_html:
        notices = parse_notice_page(notice_html, target_date)
        collected += notices
        print(f" 找到 {len(notices)} 条相关通知")

    # Step 2: policies, handled the same way.
    print("\n[2/2] 检查政策法规页面...")
    policy_html = fetch_page(POLICY_URL)
    if policy_html:
        policies = parse_policy_page(policy_html, target_date)
        collected += policies
        print(f" 找到 {len(policies)} 条相关政策")

    # Report.
    print("\n" + rule)
    print(f"\n总计找到 {len(collected)} 条相关信息\n")

    if not collected:
        print(f"未找到 {target_date} 发布的相关信息")
        return collected

    for index, entry in enumerate(collected, start=1):
        print(f"[{index}] {entry['type']}")
        print(f" 标题: {entry['title']}")
        print(f" 日期: {entry['date']}")
        print(f" 链接: {entry['link']}")
        print()

    return collected
|
|
|
|
# Run the monitor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|