-
Notifications
You must be signed in to change notification settings - Fork 7
patch(report_news): add peef.dev as source
#124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,12 +4,16 @@ | |
| import os | ||
| from typing import Iterable | ||
| from urllib.request import urlopen | ||
| from urllib.error import HTTPError | ||
| from urllib.parse import quote as urlquote | ||
| import xml.etree.ElementTree as ET | ||
| import re | ||
|
|
||
| TECH_GRIOT_API_URL = 'https://techgriot.co/feed' | ||
| NEWS_API_URL = 'https://newsapi.org/v2/top-headlines?sources=techcrunch' | ||
| NEWS_API_KEY = os.getenv('NEWS_API_KEY') | ||
| REPORT_NEWS_HASHES = 'report_news.hash' | ||
| PEEF_SITEMAP_URL = 'https://peef.dev/sitemap.xml' | ||
| MAX_ARTICLES = 10 | ||
|
|
||
|
|
||
|
|
@@ -58,6 +62,63 @@ def fetch_tech_griot() -> list: | |
| return articles | ||
|
|
||
|
|
||
| def fetch_peef(): | ||
| """ | ||
| Fetching data from peef | ||
| """ | ||
|
|
||
| url = f'{PEEF_SITEMAP_URL}' | ||
|
|
||
| with urlopen(url) as response: | ||
| body = response.read() | ||
|
|
||
| xml_raw = ET.fromstring(body) | ||
|
|
||
| articles = [] | ||
| for url in xml_raw.findall('{http://www.sitemaps.org/schemas/sitemap/0.9}url')[-MAX_ARTICLES:]: | ||
| loc = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc') | ||
| lastmod = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod') | ||
|
|
||
| if loc is None or lastmod is None: | ||
| continue | ||
|
|
||
| link = re.sub(r'^http://', 'https://', urlquote(loc.text, safe=":/")) | ||
| pub_date = lastmod.text | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the purpose of creating this variable?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @Sanix-Darker, which variable?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. pub_date |
||
|
|
||
| # A peef article in under the form https://peef.dev/post/<author>/<slug> | ||
| if not link.startswith("https://peef.dev/post/") or link.strip("/").count("/") != 5: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh nice, but why not make it very strict? If we need only articles, then it should be strict, so using AND instead of OR is better. Doing that, every URL should pass both checks: start with https://peef.dev/post/ and have exactly 5 "/".
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @abdounasser202, I think it is doing exactly what you said. If I am wrong, please give me more details so that I can understand your point. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, you're right, I did a test, and it's doing what I was explaining |
||
| continue | ||
|
|
||
| author, title = link.split("/")[4:6] | ||
| title = title.replace('-', ' ').capitalize() | ||
|
|
||
| try: | ||
| with urlopen(link) as response: | ||
| body = response.read().decode() | ||
|
|
||
| # Parse the HTML data | ||
| description = re.findall(r"<article.*?>(.*?)</article>", body.replace('\n', '')) | ||
| if not description: | ||
| print("Error: Unable to parse the article", file=sys.stderr) | ||
| continue | ||
|
|
||
| description = description[0] | ||
| # Clean the data | ||
| description = re.sub(r"<.*?>", "", description).strip()[:255] | ||
| except HTTPError as e: | ||
| continue | ||
|
|
||
| articles.append({ | ||
| 'author': author, | ||
| 'title': title, | ||
| 'link': link, | ||
| 'description': description, | ||
| 'pub_date': pub_date, | ||
| }) | ||
|
|
||
| return articles | ||
|
|
||
|
|
||
| def hash_it(info: dict) -> str: | ||
| """ | ||
| This method takes as input a dictionnary, then dumps it into string | ||
|
|
@@ -91,7 +152,7 @@ def build_news() -> Iterable: | |
| We build news | ||
| """ | ||
| result = {} | ||
| news = fetch_tech_crunch() + fetch_tech_griot() | ||
| news = fetch_tech_crunch() + fetch_tech_griot() + fetch_peef() | ||
| # we extract hashes from the file as a dict | ||
| hashes = extract_hash() | ||
| # From each iteration, we shuffle | ||
|
|
@@ -134,3 +195,4 @@ def format_str(): | |
| if __name__ == '__main__': | ||
| print(format_str()) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you secure each iteration with a try/except?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I updated, I think now only HTTP errors are remaining.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure why we use this
MAX_ARTICLES=10 limit here, can you elaborate? I left a comment on the other PR regarding what I think we should do here,