diff --git a/.github/workflows/report_news/main.py b/.github/workflows/report_news/main.py index a8f62fa..82d6542 100644 --- a/.github/workflows/report_news/main.py +++ b/.github/workflows/report_news/main.py @@ -2,13 +2,17 @@ from hashlib import sha1 import random import os +import re from typing import Iterable from urllib.request import urlopen +from urllib.parse import quote as urlquote +from urllib.error import HTTPError import xml.etree.ElementTree as ET TECH_GRIOT_API_URL = 'https://techgriot.co/feed' NEWS_API_URL = 'https://newsapi.org/v2/top-headlines?sources=techcrunch' NEWS_API_KEY = os.getenv('NEWS_API_KEY') +TERICCABREL_BLOG_SITEMAP_URL = "https://blog.tericcabrel.com/sitemap-posts.xml" REPORT_NEWS_HASHES = 'report_news.hash' MAX_ARTICLES = 10 @@ -58,6 +62,58 @@ def fetch_tech_griot() -> list: return articles +def fetch_tericcabrel_blog() -> list: + """ + fetching articles from tericcabrel blog + """ + + with urlopen(TERICCABREL_BLOG_SITEMAP_URL) as response: + body = response.read() + + xml_raw = ET.fromstring(body) + articles = [] + + for url in xml_raw.findall('{http://www.sitemaps.org/schemas/sitemap/0.9}url')[:MAX_ARTICLES]: + loc = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc') + lastmod = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod') + + if loc is None or lastmod is None: + continue + + link = re.sub(r'^http://', 'https://', urlquote(loc.text, safe=":/")) + pub_date = lastmod.text + + try: + with urlopen(link) as response: + body = response.read().decode() + + body = body.replace('\n', '') + + # Parse the HTML data + title = re.findall(r'