From eb23024b39ae98a118be48bbb0436da21f095ab5 Mon Sep 17 00:00:00 2001 From: Brady Fomegne Date: Thu, 7 Nov 2024 23:35:11 +0100 Subject: [PATCH 1/2] report_news: added tericcabrel blog as source --- .github/workflows/report_news/main.py | 58 ++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/.github/workflows/report_news/main.py b/.github/workflows/report_news/main.py index a8f62fa..0abf9ec 100644 --- a/.github/workflows/report_news/main.py +++ b/.github/workflows/report_news/main.py @@ -2,13 +2,17 @@ from hashlib import sha1 import random import os +import re from typing import Iterable from urllib.request import urlopen +from urllib.parse import quote as urlquote +from urllib.error import HTTPError import xml.etree.ElementTree as ET TECH_GRIOT_API_URL = 'https://techgriot.co/feed' NEWS_API_URL = 'https://newsapi.org/v2/top-headlines?sources=techcrunch' NEWS_API_KEY = os.getenv('NEWS_API_KEY') +TERICCABREL_BLOG_SITEMAP_URL = "https://blog.tericcabrel.com/sitemap-posts.xml" REPORT_NEWS_HASHES = 'report_news.hash' MAX_ARTICLES = 10 @@ -58,6 +62,58 @@ def fetch_tech_griot() -> list: return articles +def fetch_tericcabrel_blog() -> list: + """ + fetching articles from tericcabrel blog + """ + + with urlopen(TERICCABREL_BLOG_SITEMAP_URL) as response: + body = response.read() + + xml_raw = ET.fromstring(body) + articles = [] + + for url in xml_raw.findall('{http://www.sitemaps.org/schemas/sitemap/0.9}url')[-MAX_ARTICLES:]: + loc = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc') + lastmod = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod') + + if loc is None or lastmod is None: + continue + + link = re.sub(r'^http://', 'https://', urlquote(loc.text, safe=":/")) + pub_date = lastmod.text + + try: + with urlopen(link) as response: + body = response.read().decode() + + body = body.replace('\n', '') + + # Parse the HTML data + title = re.findall(r'

(.*?)

', body) + description = re.findall(r'
(.*?)
', body) + + if not title or not description: + print("Error: Unable to parse the article", file=sys.stderr) + continue + + title = title[0] + description = description[0] + # Clean the data + description = re.sub(r"<.*?>", "", description).strip()[:255] + except HTTPError as e: + continue + + articles.append({ + 'title': title, + 'link': link, + 'description': description, + 'pub_date': pub_date, + }) + + return articles + + def hash_it(info: dict) -> str: """ This method takes as input a dictionnary, then dumps it into string @@ -91,7 +147,7 @@ def build_news() -> Iterable: We build news """ result = {} - news = fetch_tech_crunch() + fetch_tech_griot() + news = fetch_tech_crunch() + fetch_tech_griot() + fetch_tericcabrel_blog() # we extract hashes from the file as a dict hashes = extract_hash() # From each iteration, we shuffle From 53040fea23d949ccea89ad64c3b9c0834ad155e0 Mon Sep 17 00:00:00 2001 From: Brady Fomegne Date: Sun, 10 Nov 2024 12:02:47 +0100 Subject: [PATCH 2/2] Fix the split order The articles are listed from latest to oldest --- .github/workflows/report_news/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report_news/main.py b/.github/workflows/report_news/main.py index 0abf9ec..82d6542 100644 --- a/.github/workflows/report_news/main.py +++ b/.github/workflows/report_news/main.py @@ -73,7 +73,7 @@ def fetch_tericcabrel_blog() -> list: xml_raw = ET.fromstring(body) articles = [] - for url in xml_raw.findall('{http://www.sitemaps.org/schemas/sitemap/0.9}url')[-MAX_ARTICLES:]: + for url in xml_raw.findall('{http://www.sitemaps.org/schemas/sitemap/0.9}url')[:MAX_ARTICLES]: loc = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc') lastmod = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')