Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion .github/workflows/report_news/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import os
from typing import Iterable
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import quote as urlquote
import xml.etree.ElementTree as ET
import re

# Feed URL for Tech Griot (presumably consumed by fetch_tech_griot — confirm).
TECH_GRIOT_API_URL = 'https://techgriot.co/feed'
# NewsAPI top-headlines endpoint, restricted to the "techcrunch" source.
NEWS_API_URL = 'https://newsapi.org/v2/top-headlines?sources=techcrunch'
# NewsAPI key read from the environment; None when NEWS_API_KEY is not set.
NEWS_API_KEY = os.getenv('NEWS_API_KEY')
# NOTE(review): presumably the file holding hashes of already-reported news —
# confirm against extract_hash.
REPORT_NEWS_HASHES = 'report_news.hash'
# Sitemap of peef.dev; fetch_peef reads it to discover pages.
PEEF_SITEMAP_URL = 'https://peef.dev/sitemap.xml'
# fetch_peef only inspects the last MAX_ARTICLES <url> entries of the sitemap
# (other uses may exist outside this view).
MAX_ARTICLES = 10


Expand Down Expand Up @@ -58,6 +62,62 @@ def fetch_tech_griot() -> list:
return articles


def fetch_peef() -> list:
    """
    Fetching data from peef

    Reads the peef sitemap, keeps its last ``MAX_ARTICLES`` ``<url>`` entries
    and, for every entry that points to an article, downloads the page to
    extract a short plain-text description.

    Returns:
        A list of dicts with the keys ``author``, ``title``, ``link``,
        ``description`` and ``pub_date``. Entries that cannot be fetched or
        parsed are reported on stderr and skipped. If the sitemap itself
        cannot be fetched or parsed, an empty list is returned so that the
        other news sources can still be reported.
    """
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    post_prefix = 'https://peef.dev/post/'
    # Compiled once: both patterns are reused for every article.
    article_re = re.compile(r"<article.*?>(.*?)</article>")
    tag_re = re.compile(r"<.*?>")

    try:
        with urlopen(PEEF_SITEMAP_URL) as response:
            sitemap = ET.fromstring(response.read())
    except (OSError, ET.ParseError) as e:
        # URLError/HTTPError and socket timeouts are all OSError subclasses.
        print(f"Error: Unable to fetch the peef sitemap: {e}", file=sys.stderr)
        return []

    articles = []
    # The slice is taken before filtering, so fewer than MAX_ARTICLES
    # articles may be returned.
    for entry in sitemap.findall(f'{ns}url')[-MAX_ARTICLES:]:
        loc = entry.find(f'{ns}loc')
        lastmod = entry.find(f'{ns}lastmod')

        # An empty <loc/> has text None, which urlquote cannot handle.
        if loc is None or lastmod is None or not loc.text:
            continue

        link = re.sub(
            r'^http://', 'https://', urlquote(loc.text.strip(), safe=":/")
        )
        if not link.startswith(post_prefix):
            continue

        # Only /post/<username>/<slug> is an article; /post/<slug> is a
        # comment or simple post and must be skipped (unpacking it used to
        # raise ValueError).
        segments = link[len(post_prefix):].split('/')
        if len(segments) < 2 or not segments[0] or not segments[1]:
            continue
        author, slug = segments[:2]
        title = slug.replace('-', ' ').capitalize()

        try:
            with urlopen(link) as response:
                body = response.read().decode()
        except (OSError, UnicodeDecodeError) as e:
            # One unreachable or badly encoded page must not abort the
            # whole report.
            print(f"Error: Unable to fetch {link}: {e}", file=sys.stderr)
            continue

        # Parse the HTML data (newlines removed so '.' can span the article)
        found = article_re.search(body.replace('\n', ''))
        if found is None:
            print("Error: Unable to parse the article", file=sys.stderr)
            continue

        # Clean the data: strip the tags and keep a short excerpt
        description = tag_re.sub("", found.group(1)).strip()[:255]

        articles.append({
            'author': author,
            'title': title,
            'link': link,
            'description': description,
            'pub_date': lastmod.text,
        })

    return articles


def hash_it(info: dict) -> str:
"""
This method takes as input a dictionnary, then dumps it into string
Expand Down Expand Up @@ -91,7 +151,7 @@ def build_news() -> Iterable:
We build news
"""
result = {}
news = fetch_tech_crunch() + fetch_tech_griot()
news = fetch_tech_crunch() + fetch_tech_griot() + fetch_peef()
# we extract hashes from the file as a dict
hashes = extract_hash()
# From each iteration, we shuffle
Expand Down Expand Up @@ -134,3 +194,4 @@ def format_str():
# Script entry point: print the string built by format_str() to stdout.
if __name__ == '__main__':
    print(format_str())