Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion .github/workflows/report_news/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import os
from typing import Iterable
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import quote as urlquote
import xml.etree.ElementTree as ET
import re

TECH_GRIOT_API_URL = 'https://techgriot.co/feed'
NEWS_API_URL = 'https://newsapi.org/v2/top-headlines?sources=techcrunch'
NEWS_API_KEY = os.getenv('NEWS_API_KEY')
REPORT_NEWS_HASHES = 'report_news.hash'
PEEF_SITEMAP_URL = 'https://peef.dev/sitemap.xml'
MAX_ARTICLES = 10


Expand Down Expand Up @@ -58,6 +62,63 @@ def fetch_tech_griot() -> list:
return articles


def fetch_peef():
"""
Fetching data from peef
"""

url = f'{PEEF_SITEMAP_URL}'

with urlopen(url) as response:
body = response.read()

xml_raw = ET.fromstring(body)

articles = []
for url in xml_raw.findall('{http://www.sitemaps.org/schemas/sitemap/0.9}url')[-MAX_ARTICLES:]:
loc = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you secure each iteration with a try/except?

@pythonbrad pythonbrad Nov 7, 2024

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated it; I think only HTTP errors remain unhandled now.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure why we use this MAX_ARTICLES=10 limit here; can you elaborate?
I left a comment on the other PR regarding what I think we should do here.

lastmod = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')

if loc is None or lastmod is None:
continue

link = re.sub(r'^http://', 'https://', urlquote(loc.text, safe=":/"))
pub_date = lastmod.text

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the purpose of creating this variable?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Sanix-Darker, which variable?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pub_date


# A peef article URL is of the form https://peef.dev/post/<author>/<slug>
if not link.startswith("https://peef.dev/post/") or link.strip("/").count("/") != 5:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh nice, but why not make it very strict? If we only need articles, then the check should be strict, so using AND instead of OR would be better.

That way, every URL must pass both checks: start with https://peef.dev/post/ and contain exactly 5 "/" characters.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@abdounasser202, I think it is doing exactly what you said.

If I am wrong, please give me more details so that I can understand your point.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, you're right. I ran a test, and it does what I was describing,
so it's fine! 👍

continue

author, title = link.split("/")[4:6]
title = title.replace('-', ' ').capitalize()

try:
with urlopen(link) as response:
body = response.read().decode()

# Parse the HTML data
description = re.findall(r"<article.*?>(.*?)</article>", body.replace('\n', ''))
if not description:
print("Error: Unable to parse the article", file=sys.stderr)
continue

description = description[0]
# Clean the data
description = re.sub(r"<.*?>", "", description).strip()[:255]
except HTTPError as e:
continue

articles.append({
'author': author,
'title': title,
'link': link,
'description': description,
'pub_date': pub_date,
})

return articles


def hash_it(info: dict) -> str:
"""
This method takes as input a dictionary, then dumps it into a string
Expand Down Expand Up @@ -91,7 +152,7 @@ def build_news() -> Iterable:
We build news
"""
result = {}
news = fetch_tech_crunch() + fetch_tech_griot()
news = fetch_tech_crunch() + fetch_tech_griot() + fetch_peef()
# we extract hashes from the file as a dict
hashes = extract_hash()
# From each iteration, we shuffle
Expand Down Expand Up @@ -134,3 +195,4 @@ def format_str():
if __name__ == '__main__':
print(format_str())