# ---- Build dict --------------------------------------------------------- return { "n_tokens": n_tokens, "n_chars": n_chars, "avg_token_len": avg_token_len, "upper_ratio": upper_ratio, "digit_ratio": digit_ratio, "stop_ratio": stop_ratio, "has_action_verb": int(has_action), "has_suspicious_kw": int(has_suspicious), "hyphen_cnt": hyphen_cnt, "ellipsis": int(ellipsis), "numeric_pattern": int(numeric_pattern), "domain_present": int(bool(domain)), "registered_domain": registered, "tld": tld, "subdomain_cnt": subdomain_cnt,
# ---- Textual cues ------------------------------------------------------- upper_ratio = sum(c.isupper() for c in subject) / max(n_chars, 1) digit_ratio = sum(c.isdigit() for c in subject) / max(n_chars, 1) avg_token_len = np.mean([len(t) for t in tokens]) if tokens else 0 has_action = any(v in subject.lower() for v in "download","click","open","update","verify") has_suspicious = any(v in subject.lower() for v in suspicious_word_list) stop_ratio = sum(t.lower() in stop_words for t in tokens) / max(n_tokens, 1) hyphen_cnt = subject.count('-') ellipsis = subject.endswith('...') numeric_pattern = bool(re.search(r'\b\d3,\s*-\s*\d4\b', subject)) Download WORK - 840 -2024- Bengla -www.mazabd.click...
def extract_features(subject: str) -> dict: # ---- Basic tokenisation ------------------------------------------------- tokens = re.split(r'\s+', subject.strip()) n_tokens = len(tokens) n_chars = len(subject) | |15| Domain Reputation Score | Public blacklists
# Dummy placeholders for reputation / age (replace with real API calls) domain_age_days = 9999 # e.g., today - creation_date domain_risk = 0 # 0 = clean, 1 = flagged | Encode TLD as categorical (one‑hot) or assign
suspicious_word_list = "download","click","open","update","verify","invoice","account", "password","login","security","confirm"
def entropy(s): """Shannon entropy of a string.""" probs = np.bincount(list(s.encode())) / len(s) probs = probs[probs > 0] return -np.sum(probs * np.log2(probs))
| # | Feature | Why it matters | Extraction | |---|---------|----------------|------------| |13| | www.mazabd.click is a new TLD ( .click ) frequently used by malicious actors. | tldextract.extract(subject).registered_domain | |14| Domain Age (in days) | Newly registered domains are riskier. | Use WHOIS API → creation_date → today - creation_date . | |15| Domain Reputation Score | Public blacklists (VirusTotal, Google Safe Browsing) give a numeric trust rating. | Query API → reputation_score . | |16| Top‑Level‑Domain (TLD) Popularity | .click , .xyz , .top are over‑represented in phishing. | Encode TLD as categorical (one‑hot) or assign risk weight (e.g., .com =0, others=1). | |17| Number of Sub‑domains | More sub‑domains → higher chance of URL‑shortening or obfuscation. | subject_url.count('.') - 1 . | |18| Presence of Hyphens in Domain | Hyphens are often used to mimic legitimate names ( mazabd ). | '−' in domain (boolean). | |19| URL Length | Very long URLs are suspicious. | len(url) | |20| URL Entropy | Randomly generated strings boost entropy. | Same entropy formula as above, applied to url . | |21| IDN / Punycode | Internationalised domain names can hide malicious domains. | url.startswith('xn--') . | |22| SSL Certificate Validity | Self‑signed or expired certs are a warning sign (if you later fetch the URL). | Use ssl / requests to check cert.notAfter . | |23| IP‑Address in URL | Direct IP links are uncommon in legitimate business mail. | re.search(r'\b\d1,3(?:\.\d1,3)3\b', url) . | 3. Structural / Formatting Features | # | Feature | Why it matters | Extraction | |---|---------|----------------|------------| |24| Number of Hyphens ( - ) | Overuse of hyphens often separates “spammy” tokens. | subject.count('-') | |25| Pattern of Numeric Tokens | The sequence 840 -2024 is a “number‑dash‑year” pattern typical of fake invoice titles. | Regex: r'\b\d3,\s*-\s*\d4\b' → boolean | |26| Presence of Ellipsis ( ... ) | Indicates truncation; spammers often hide the rest of a malicious URL. | subject.endswith('...') | |27| Bracket/Parentheses Ratio | Unbalanced punctuation is a heuristic for malformed messages. | subject.count('(') != subject.count(')') | |28| Whitespace Anomalies (multiple spaces, tabs) | Spam generators sometimes add extra spaces to bypass simple filters. | re.search(r'\s2,', subject) | |29| Encoding Flags (e.g., =?UTF-8?B?…?= ) | MIME‑encoded subjects can hide malicious strings. | Detect with email.header.decode_header . | |30| Subject Prefix / Tag Count | Tags like [Urgent] , [Notice] can be abused. | re.findall(r'\[.*?\]', subject) → count. | 4. Aggregated / Meta‑Features You can combine the raw values into risk scores :