Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 130 additions & 7 deletions cli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import io
import json
import logging
from concurrent.futures import ThreadPoolExecutor
from enum import Enum

import httpx
import lxml.html
import pandas as pd
import tldextract
import typer
from sklearn.feature_extraction.text import TfidfVectorizer

from apistemic.markets.api import create_markets_api_from_environment
from apistemic.markets.models import CompetitorItem
Expand All @@ -29,6 +34,119 @@ def leadgen():
pass


def fetch_text_nofail(url) -> str | None:
headers = {
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
),
"Accept": (
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
),
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
try:
resp = httpx.get(url, headers=headers, follow_redirects=True, timeout=10.0)
resp.raise_for_status()
return resp.text
except Exception as e:
logging.error(f"Error fetching {url}: {e}")
return None


def clean_html_text(html: str | None) -> str | None:
if not html:
return None
doc = lxml.html.fromstring(html)
# Remove script and style elements
for elem in doc.xpath(".//script | .//style"):
elem.getparent().remove(elem)
return doc.text_content().strip()


@app.command()
def keywords():
items = _fetch_items("linkedin:startupradar", "competitors")
df = items_to_df(items)
df = df.sample(n=20)

# Take top 10 + random sample from remaining for proper TF-IDF corpus
df_top = df.head(10)
df_full = df

df_full["organization.domain"] = (
df_full["organization.website_url"]
.apply(tldextract.extract)
.apply(lambda x: x.top_domain_under_public_suffix)
)

df_full["html"] = df_full["organization.domain"].apply(
lambda domain: fetch_text_nofail(f"https://{domain}")
)

with ThreadPoolExecutor(8) as tpe:
texts = list(
tpe.map(
fetch_text_nofail,
df_full["organization.domain"].apply(lambda x: f"https://{x}"),
)
)
df_full["text"] = texts

# Filter out None texts
df_full_valid = df_full[df_full["text"].notna()]

if len(df_full_valid) == 0:
print("No valid text content found")
return

# Apply TF-IDF on full corpus for proper IDF calculation
vectorizer = TfidfVectorizer(
max_features=10_000,
ngram_range=(1, 2),
min_df=0.1,
max_df=0.5,
stop_words="english",
)

all_texts = df_full_valid["text"].tolist()
tfidf_matrix = vectorizer.fit_transform(all_texts)
feature_names = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(
data=tfidf_matrix.toarray(), columns=feature_names, index=df_full_valid.index
)
print(df_tfidf)

print(df_tfidf.loc[df_top.index].mean(axis=0).sort_values(ascending=False).head(20))

exit()
# Get overall top keywords from top 10 competitors only
top_10_matrix = tfidf_matrix[df_top.index]
overall_scores = top_10_matrix.sum(axis=0).A1
top_indices = overall_scores.argsort()[-20:][::-1]

print("\n=== Top 20 Keywords from Top Competitors ===")
for idx in top_indices:
print(f"{feature_names[idx]}: {overall_scores[idx]:.3f}")

# Get top keywords per top 10 companies
print("\n=== Top Keywords per Top Competitor ===")
for df_idx in df_top.index:
if df_idx in df_full_valid.index:
row = df_full_valid.loc[df_idx]
scores = tfidf_matrix[df_full_valid.index.get_loc(df_idx)].toarray()[0]
top_company_indices = scores.argsort()[-5:][::-1]
print(f"\n{row['organization.name']} ({row['organization.domain']}):")
for j in top_company_indices:
if scores[j] > 0:
print(f" - {feature_names[j]}: {scores[j]:.3f}")


@app.command()
def fetch(
endpoint: Endpoint = typer.Argument(..., help="Type of data to fetch"),
Expand Down Expand Up @@ -67,13 +185,7 @@ def _fetch_items(
return items


def _display_items(items: list[CompetitorItem], format: OutputFormat):
"""Display the data in the requested format."""
# early return for non-list formats
if format == OutputFormat.json:
typer.echo(json.dumps([item.model_dump() for item in items], indent=2))
return

def items_to_df(items: list[CompetitorItem]) -> pd.DataFrame:
# all other formats are list-based
items_data = [item.model_dump() for item in items]
df = pd.json_normalize(items_data)
Expand All @@ -88,6 +200,17 @@ def _display_items(items: list[CompetitorItem], format: OutputFormat):
if col in df.columns:
df[col] = df[col].astype("Int64")

return df


def _display_items(items: list[CompetitorItem], format: OutputFormat):
"""Display the data in the requested format."""
# early return for non-list formats
if format == OutputFormat.json:
typer.echo(json.dumps([item.model_dump() for item in items], indent=2))
return

df = items_to_df(items)
if format == OutputFormat.csv:
typer.echo(df.to_csv(index=False))
elif format == OutputFormat.parquet:
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"httpx>=0.28.1",
"lxml>=6.0.0",
"pandas>=2.3.1",
"pydantic>=2.11.7",
"scikit-learn>=1.7.1",
"tldextract>=5.3.0",
"typer>=0.16.1",
]

Expand Down
Loading