abhisheks008 · Reaper-ai · May 31, 2026
diff --git a/Named Entity Recognition using NLP/Dataset/label.json b/Named Entity Recognition using NLP/Dataset/label.json
@@ -0,0 +1 @@
+{"O": 0, "B-CARDINAL": 1, "B-DATE": 2, "I-DATE": 3, "B-PERSON": 4, "I-PERSON": 5, "B-NORP": 6, "B-GPE": 7, "I-GPE": 8, "B-LAW": 9, "I-LAW": 10, "B-ORG": 11, "I-ORG": 12, "B-PERCENT": 13, "I-PERCENT": 14, "B-ORDINAL": 15, "B-MONEY": 16, "I-MONEY": 17, "B-WORK_OF_ART": 18, "I-WORK_OF_ART": 19, "B-FAC": 20, "B-TIME": 21, "I-CARDINAL": 22, "B-LOC": 23, "B-QUANTITY": 24, "I-QUANTITY": 25, "I-NORP": 26, "I-LOC": 27, "B-PRODUCT": 28, "I-TIME": 29, "B-EVENT": 30, "I-EVENT": 31, "I-FAC": 32, "B-LANGUAGE": 33, "I-PRODUCT": 34, "I-ORDINAL": 35, "I-LANGUAGE": 36}
diff --git a/Named Entity Recognition using NLP/Dataset/test.json b/Named Entity Recognition using NLP/Dataset/test.json
diff --git a/Named Entity Recognition using NLP/Dataset/train00.json b/Named Entity Recognition using NLP/Dataset/train00.json
diff --git a/Named Entity Recognition using NLP/Dataset/train01.json b/Named Entity Recognition using NLP/Dataset/train01.json
diff --git a/Named Entity Recognition using NLP/Dataset/train02.json b/Named Entity Recognition using NLP/Dataset/train02.json
diff --git a/Named Entity Recognition using NLP/Dataset/train03.json b/Named Entity Recognition using NLP/Dataset/train03.json
diff --git a/Named Entity Recognition using NLP/Dataset/valid.json b/Named Entity Recognition using NLP/Dataset/valid.json
diff --git a/Named Entity Recognition using NLP/Images/entity_span_length_distribution.png b/Named Entity Recognition using NLP/Images/entity_span_length_distribution.png
diff --git a/Named Entity Recognition using NLP/Images/label_distribution_top25.png b/Named Entity Recognition using NLP/Images/label_distribution_top25.png
diff --git a/Named Entity Recognition using NLP/Images/ner_final_metrics.png b/Named Entity Recognition using NLP/Images/ner_final_metrics.png
diff --git a/Named Entity Recognition using NLP/Images/ner_training_metrics.png b/Named Entity Recognition using NLP/Images/ner_training_metrics.png
diff --git a/Named Entity Recognition using NLP/Images/sentence_length_distribution.png b/Named Entity Recognition using NLP/Images/sentence_length_distribution.png
diff --git a/Named Entity Recognition using NLP/Images/top_entities.png b/Named Entity Recognition using NLP/Images/top_entities.png
diff --git a/Named Entity Recognition using NLP/Models/named_entity_recofnition_eda.ipynb b/Named Entity Recognition using NLP/Models/named_entity_recofnition_eda.ipynb
diff --git a/Named Entity Recognition using NLP/Models/named_entity_recofnition_model.ipynb b/Named Entity Recognition using NLP/Models/named_entity_recofnition_model.ipynb
diff --git a/Named Entity Recognition using NLP/README.md b/Named Entity Recognition using NLP/README.md
@@ -0,0 +1,63 @@
+## **PROJECT TITLE**
+
+Named Entity Recognition Using NLP (OntoNotes 5)
+
+### 🎯 **Goal**
+
+Build a named entity recognition system from scratch, explore the dataset, and provide an interactive Streamlit demo.
+
+### 🧵 **Dataset**
+
+OntoNotes 5.0 (LDC2013T19). Obtain via the Linguistic Data Consortium (LDC).
+
+### 🧾 **Description**
+
+This project trains a BiLSTM tagger for NER using the OntoNotes 5 JSONL dataset. It includes EDA plots, model training with per-epoch checkpoints, and a Streamlit app for inference.
+
+### 🧮 **What I have done!**
+
+- Loaded OntoNotes 5 JSONL splits and label mapping.
+- Ran EDA to understand label distribution, sentence length, entity span length, and top entity forms.
+- Built a token vocabulary from the training split.
+- Trained a BiLSTM tagger with padded batches.
+- Saved per-epoch model weights to the Streamlit folder.
+- Generated training metrics and final metrics images.
+- Built a Streamlit app for quick inference.
+
+### 🚀 **Models Implemented**
+
+- BiLSTM tagger (from scratch) for sequence labeling: simple, fast, and effective baseline for NER without relying on pretrained transformers.
+
+### 📚 **Libraries Needed**
+
+- torch
+- numpy
+- pandas
+- matplotlib
+- streamlit
+
+### 📊 **Exploratory Data Analysis Results**
+
+The plots below summarize the EDA results: label distribution, sentence length, entity span length, and the most frequent entity surface forms.
+
+![Label Distribution](Images/label_distribution_top25.png)
+
+![Sentence Length Distribution](Images/sentence_length_distribution.png)
+
+![Entity Span Length Distribution](Images/entity_span_length_distribution.png)
+
+![Top Entity Surface Forms](Images/top_entities.png)
+
+### 📈 **Performance of the Models based on the Accuracy Scores**
+
+- BiLSTM tagger: ![Results](Images/ner_final_metrics.png)
+
+
+### 📢 **Conclusion**
+
+The BiLSTM baseline provides a solid starting point for OntoNotes 5 NER. Use the final metrics image to report test accuracy after training.
+
+
+Gaurav Upreti<br>
+[GitHub](https://github.com/Reaper-ai) | [LinkedIn](https://www.linkedin.com/in/gaurav-upreti-488348312/)<br>
+Contribution under GSSoC 2026
diff --git a/Named Entity Recognition using NLP/Streamlit/app.py b/Named Entity Recognition using NLP/Streamlit/app.py
@@ -0,0 +1,137 @@
+import json
+import os
+from glob import glob
+
+import streamlit as st
+import torch
+from torch import nn
+
+# Project root: the parent of the directory that contains this script.
+BASE_DIR = os.path.dirname(os.path.dirname(__file__))
+# Folder holding label.json and the train*.json files read by the loaders.
+DATASET_DIR = os.path.join(BASE_DIR, "Dataset")
+# Saved model weights, stored next to this script.
+MODEL_PATH = os.path.join(os.path.dirname(__file__), "ner_model.pt")
+
+st.set_page_config(page_title="NER Demo", layout="centered")
+
+st.title("Named Entity Recognition Demo")
+st.caption("BiLSTM tagger trained on OntoNotes 5 (simple whitespace tokenization).")
+
+# Display names for entity types, keyed by the part of a BIO tag after the
+# "B-"/"I-" prefix; used by format_label.
+HUMAN_LABELS = {
+    "PERSON": "Person",
+    "ORG": "Organization",
+    "GPE": "Geo-Political Entity",
+    "NORP": "Nationality/Religious/Political Group",
+    "FAC": "Facility",
+    "LOC": "Location",
+    "PRODUCT": "Product",
+    "EVENT": "Event",
+    "WORK_OF_ART": "Work of Art",
+    "LAW": "Law",
+    "LANGUAGE": "Language",
+    "DATE": "Date",
+    "TIME": "Time",
+    "MONEY": "Money",
+    "PERCENT": "Percent",
+    "QUANTITY": "Quantity",
+    "ORDINAL": "Ordinal",
+    "CARDINAL": "Cardinal",
+}
+
+def format_label(label):
+    """Turn a BIO tag such as ``B-ORG`` into display text like ``Organization (B)``.
+
+    ``"O"`` and any tag without a hyphen are returned unchanged. Entity types
+    that are missing from ``HUMAN_LABELS`` keep their raw name.
+    """
+    # Covers both the outside tag "O" and any label with no BIO prefix.
+    if label == "O" or "-" not in label:
+        return label
+    prefix, _, entity = label.partition("-")
+    return f"{HUMAN_LABELS.get(entity, entity)} ({prefix})"
+
+@st.cache_resource
+def load_label_map():
+    """Read ``label.json`` from the dataset folder and return an id -> label dict.
+
+    The file stores label -> id pairs; the mapping is inverted here so that
+    predicted ids can be looked up directly.
+    """
+    with open(os.path.join(DATASET_DIR, "label.json"), "r", encoding="utf-8") as handle:
+        label_ids = json.load(handle)
+    return {idx: name for name, idx in label_ids.items()}
+
+@st.cache_resource
+def build_vocab():
+    """Build the token -> id lookup from the ``train*.json`` files in the dataset folder.
+
+    Every non-blank line of a training file is parsed as a JSON object and the
+    entries of its ``"tokens"`` list are counted. Ids 0 and 1 go to ``<PAD>``
+    and ``<UNK>``; the remaining tokens follow in descending frequency order.
+    """
+    counts = {}
+    pattern = os.path.join(DATASET_DIR, "train*.json")
+    for train_path in sorted(glob(pattern)):
+        if not os.path.exists(train_path):
+            continue
+        with open(train_path, "r", encoding="utf-8") as handle:
+            # Strip each line first, then skip the ones that were blank.
+            for record in (raw.strip() for raw in handle):
+                if record:
+                    for tok in json.loads(record)["tokens"]:
+                        counts[tok] = counts.get(tok, 0) + 1
+
+    # The sort is stable, so equally frequent tokens keep first-seen order.
+    ordered = sorted(counts, key=counts.get, reverse=True)
+    return {tok: idx for idx, tok in enumerate(["<PAD>", "<UNK>"] + ordered)}
+
+class BiLSTMTagger(nn.Module):
+    """Token tagger: embedding -> one bidirectional LSTM layer -> dropout -> linear."""
+
+    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels, pad_id):
+        super().__init__()
+        self.embedding = nn.Embedding(
+            num_embeddings=vocab_size,
+            embedding_dim=embed_dim,
+            padding_idx=pad_id,
+        )
+        self.lstm = nn.LSTM(
+            input_size=embed_dim,
+            hidden_size=hidden_dim,
+            num_layers=1,
+            batch_first=True,
+            bidirectional=True,
+        )
+        self.dropout = nn.Dropout(p=0.2)
+        # The LSTM is bidirectional, so its output feature size is 2 * hidden_dim.
+        self.classifier = nn.Linear(2 * hidden_dim, num_labels)
+
+    def forward(self, input_ids):
+        """Return label logits for every position of the given token-id tensor."""
+        embedded = self.embedding(input_ids)
+        lstm_out = self.lstm(embedded)[0]
+        return self.classifier(self.dropout(lstm_out))
+
+@st.cache_resource
+def load_model(token_to_id, id_to_label):
+    """Build a ``BiLSTMTagger`` sized to the vocab and label map, then load its weights.
+
+    The state dict is read from ``MODEL_PATH`` onto the CPU and the model is
+    returned in eval mode.
+    """
+    tagger = BiLSTMTagger(
+        vocab_size=len(token_to_id),
+        embed_dim=128,
+        hidden_dim=256,
+        num_labels=len(id_to_label),
+        pad_id=token_to_id["<PAD>"],
+    )
+    weights = torch.load(MODEL_PATH, map_location="cpu")
+    tagger.load_state_dict(weights)
+    tagger.eval()
+    return tagger
+
+# Load the label map, vocabulary and model; each loader is wrapped in
+# st.cache_resource.
+id_to_label = load_label_map()
+token_to_id = build_vocab()
+model = load_model(token_to_id, id_to_label)
+
+text = st.text_area("Enter text", "Barack Obama visited New York in 2012.")
+
+if st.button("Tag Entities"):
+    # Whitespace tokenization, as stated in the page caption.
+    tokens = text.strip().split()
+    if not tokens:
+        st.warning("Please enter some text.")
+    else:
+        # Tokens missing from the vocabulary fall back to the <UNK> id.
+        unk_id = token_to_id.get("<UNK>")
+        input_ids = torch.tensor([[token_to_id.get(t, unk_id) for t in tokens]])
+        with torch.no_grad():
+            logits = model(input_ids)
+            # Highest-scoring label per token; squeeze drops the batch axis of size 1.
+            pred_ids = logits.argmax(-1).squeeze(0).tolist()
+        pred_labels = [format_label(id_to_label.get(i, "O")) for i in pred_ids]
+
+        st.subheader("Predictions")
+        # `use_container_width` is deprecated in current Streamlit releases;
+        # width="stretch" is the documented replacement.
+        st.dataframe({"token": tokens, "label": pred_labels}, width="stretch")
+
+st.sidebar.markdown("**Model weights**")
+st.sidebar.code(MODEL_PATH)
diff --git a/Named Entity Recognition using NLP/Streamlit/ner_model.pt b/Named Entity Recognition using NLP/Streamlit/ner_model.pt
diff --git a/Named Entity Recognition using NLP/pyproject.toml b/Named Entity Recognition using NLP/pyproject.toml
@@ -0,0 +1,14 @@
+[project]
+name = "named-entity-recognition-using-nlp"
+version = "0.1.0"
+description = "BiLSTM named entity recognition on OntoNotes 5 with EDA notebooks and a Streamlit demo"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "ipykernel>=7.2.0",
+    "matplotlib>=3.10.9",
+    "pandas>=3.0.3",
+    "scikit-learn>=1.8.0",
+    "streamlit>=1.58.0",
+    "torch>=2.12.0",
+]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"O": 0, "B-CARDINAL": 1, "B-DATE": 2, "I-DATE": 3, "B-PERSON": 4, "I-PERSON": 5, "B-NORP": 6, "B-GPE": 7, "I-GPE": 8, "B-LAW": 9, "I-LAW": 10, "B-ORG": 11, "I-ORG": 12, "B-PERCENT": 13, "I-PERCENT": 14, "B-ORDINAL": 15, "B-MONEY": 16, "I-MONEY": 17, "B-WORK_OF_ART": 18, "I-WORK_OF_ART": 19, "B-FAC": 20, "B-TIME": 21, "I-CARDINAL": 22, "B-LOC": 23, "B-QUANTITY": 24, "I-QUANTITY": 25, "I-NORP": 26, "I-LOC": 27, "B-PRODUCT": 28, "I-TIME": 29, "B-EVENT": 30, "I-EVENT": 31, "I-FAC": 32, "B-LANGUAGE": 33, "I-PRODUCT": 34, "I-ORDINAL": 35, "I-LANGUAGE": 36}