Dev Goyal committed on
Commit
012bcc4
·
1 Parent(s): 3d3ba3f

refactor: replace Alpha Vantage with Financial Modeling Prep (FMP) for earnings transcript ingestion

Browse files
.env.example CHANGED
@@ -18,8 +18,8 @@ LANGSMITH_PROJECT=<YOUR_PROJECT_NAME>
18
  # Optional: verbose LangChain stdout (noisy; off by default)
19
  # LANGCHAIN_DEBUG=true
20
 
21
- # Earnings-call pipeline (Alpha Vantage free tier; falls back to SEC 8-K)
22
- # Get a free key at https://www.alphavantage.co/support/#api-key
23
- ALPHA_VANTAGE_API_KEY=demo
24
 
25
  # HTTP API: uvicorn api:app --host 0.0.0.0 --port 8000
 
18
  # Optional: verbose LangChain stdout (noisy; off by default)
19
  # LANGCHAIN_DEBUG=true
20
 
21
+ # Earnings-call pipeline: Financial Modeling Prep (FMP) API key (falls back to SEC 8-K)
22
+ FMP_API_KEY=<YOUR_API_KEY>
23
+
24
 
25
  # HTTP API: uvicorn api:app --host 0.0.0.0 --port 8000
Dockerfile CHANGED
@@ -28,7 +28,7 @@ ENV PYTHONPATH=/app
28
  RUN python scripts/ingest.py --tickers AAPL MSFT TSLA GOOGL NVDA
29
 
30
  # Ingest SEC 8-K / earnings call data for demo tickers
31
- RUN python scripts/ingest_earnings_calls.py --tickers AAPL MSFT --quarters Q4-2024 Q1-2025
32
 
33
  # ── Supervisord config (runs both services) ─────────────────────────────────
34
  COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
 
28
  RUN python scripts/ingest.py --tickers AAPL MSFT TSLA GOOGL NVDA
29
 
30
  # Ingest SEC 8-K / earnings call data for demo tickers
31
+ RUN python scripts/ingest_earnings_calls.py --tickers AAPL MSFT GOOGL NVDA TSLA --quarters Q2-2025 Q1-2025 Q3-2025 Q4-2025 Q1-2026
32
 
33
  # ── Supervisord config (runs both services) ─────────────────────────────────
34
  COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
core/config.py CHANGED
@@ -16,5 +16,5 @@ class Settings(BaseSettings):
16
  openai_temperature: float = 0.0
17
 
18
  # Earnings-call pipeline
19
- alpha_vantage_api_key: str = ""
20
  earnings_chroma_path: str = "./chroma_db"
 
16
  openai_temperature: float = 0.0
17
 
18
  # Earnings-call pipeline
19
+ fmp_api_key: str = ""
20
  earnings_chroma_path: str = "./chroma_db"
core/earnings_tools.py CHANGED
@@ -1,12 +1,16 @@
1
  """
2
  Earnings-call ingest + inference tools.
3
 
4
- Ingest layer – fetch transcript (Alpha Vantage → SEC 8-K fallback),
5
  normalize into Prepared Remarks / Q&A segments,
6
  extract keyword counts, and embed into ChromaDB.
7
 
8
- Inference layer – LangGraph @tool functions for retrieval,
9
  sentiment divergence, and keyword trend analysis.
 
 
 
 
10
  """
11
 
12
  import json
@@ -70,7 +74,7 @@ def parse_quarter(quarter_str: str) -> tuple[int, int]:
70
 
71
 
72
  def _quarter_to_month(q: int) -> str:
73
- """Map fiscal quarter to approximate month for Alpha Vantage API."""
74
  return {1: "03", 2: "06", 3: "09", 4: "12"}[q]
75
 
76
 
@@ -78,46 +82,53 @@ def _quarter_to_month(q: int) -> str:
78
  # Transcript fetchers
79
  # ---------------------------------------------------------------------------
80
 
81
- def fetch_transcript_alpha_vantage(
82
  ticker: str, quarter: int, year: int, api_key: str
83
  ) -> Optional[str]:
84
  """
85
- Try the Alpha Vantage EARNINGS_CALL_TRANSCRIPT endpoint.
86
- Returns raw transcript text or None on failure (premium-only).
 
 
 
 
 
 
 
 
 
 
 
 
87
  """
88
  if not api_key:
89
  return None
90
  url = (
91
- "https://www.alphavantage.co/query"
92
- f"?function=EARNINGS_CALL_TRANSCRIPT"
93
- f"&symbol={ticker}"
94
- f"&quarter={year}Q{quarter}"
95
- f"&apikey={api_key}"
96
  )
97
  try:
98
- print(f"[Earnings Ingest] Trying Alpha Vantage for {ticker} Q{quarter}-{year}...")
99
  resp = requests.get(url, timeout=30)
100
  resp.raise_for_status()
101
  data = resp.json()
102
- # Alpha Vantage returns a list of transcript segments on success
103
- if isinstance(data, dict) and "transcript" in data:
104
- segments = data["transcript"]
105
- lines = []
106
- for seg in segments:
107
- speaker = seg.get("speaker", "Unknown")
108
- text = seg.get("content", "")
109
- lines.append(f"{speaker}: {text}")
110
- full = "\n".join(lines)
111
- if len(full) > 200:
112
- print(f"[Earnings Ingest] Alpha Vantage returned transcript ({len(full)} chars).")
113
- return full
114
- # Premium-required or empty response
115
- info = data.get("Information") or data.get("Note") or ""
116
- if info:
117
- print(f"[Earnings Ingest] Alpha Vantage: {info[:120]}")
118
  return None
119
  except Exception as e:
120
- print(f"[Earnings Ingest] Alpha Vantage failed: {e}")
121
  return None
122
 
123
 
@@ -210,7 +221,7 @@ def normalize_transcript(
210
  "ticker": ..., "quarter": ..., "year": ...,
211
  "prepared_remarks": str,
212
  "qa_session": str,
213
- "source": "alpha_vantage" | "sec_8k",
214
  }
215
  """
216
  text_lower = raw_text.lower()
@@ -225,7 +236,7 @@ def normalize_transcript(
225
  prepared = raw_text[:split_pos].strip()
226
  qa = raw_text[split_pos:].strip()
227
  else:
228
- # Could not find Q&A boundary — treat entire text as prepared remarks
229
  prepared = raw_text.strip()
230
  qa = ""
231
 
@@ -310,7 +321,7 @@ def ingest_earnings_call(
310
  ) -> str:
311
  """
312
  Full ingest pipeline for one ticker/quarter pair.
313
- Returns a status string: 'success', 'partial', or 'failed'.
314
  """
315
  ticker = ticker.upper()
316
  collection_dir = os.path.join(chroma_path, f"{ticker}_earnings")
@@ -322,9 +333,9 @@ def ingest_earnings_call(
322
  print(f"[Earnings Ingest] Q{quarter}-{year} for {ticker} already ingested. Skipping.")
323
  return "exists"
324
 
325
- # 1. Fetch transcript
326
- raw_text = fetch_transcript_alpha_vantage(ticker, quarter, year, api_key)
327
- source = "alpha_vantage" if raw_text else None
328
 
329
  if not raw_text:
330
  raw_text = fetch_transcript_sec_8k(ticker, quarter, year)
@@ -372,8 +383,8 @@ def ingest_earnings_call(
372
  docs.extend(splitter.split_documents([qa_doc]))
373
 
374
  if not docs:
375
- _save_metadata(chroma_path, ticker, quarter, year, keywords, "partial")
376
- return "partial"
377
 
378
  print(f"[Earnings Ingest] Embedding {len(docs)} chunks into {collection_dir}...")
379
  embeddings = get_cached_embeddings()
@@ -383,9 +394,10 @@ def ingest_earnings_call(
383
  persist_directory=collection_dir,
384
  )
385
 
386
- status = "success" if segments["qa_session"] else "partial"
 
387
  _save_metadata(chroma_path, ticker, quarter, year, keywords, status)
388
- print(f"[Earnings Ingest] {ticker} Q{quarter}-{year} ingested ({status}).")
389
  return status
390
 
391
 
@@ -420,7 +432,7 @@ def search_earnings_call(ticker: str, query: str) -> str:
420
  results = db.similarity_search(query, k=3)
421
 
422
  if not results:
423
- return f"No earnings-call matches found for '{query}' on {ticker}."
424
 
425
  output_parts = [f"EARNINGS CALL SEARCH RESULTS FOR {ticker.upper()} — '{query}':\n"]
426
  total_chars = 0
@@ -444,6 +456,8 @@ def get_earnings_sentiment_divergence(ticker: str) -> str:
444
  Retrieves evidence from both Prepared Remarks and Q&A sections of the
445
  most recent earnings call for a ticker. Use this to analyze whether
446
  management tone differs between the scripted portion and live Q&A.
 
 
447
  CRITICAL: The ticker's earnings data must already be ingested.
448
  """
449
  try:
@@ -461,31 +475,37 @@ def get_earnings_sentiment_divergence(ticker: str) -> str:
461
  filter={"section": "Q&A Session"},
462
  )
463
 
464
- output = f"SENTIMENT DIVERGENCE EVIDENCE FOR {ticker.upper()}:\n\n"
465
 
466
- output += "=== PREPARED REMARKS (scripted management commentary) ===\n"
467
  if pr_results:
468
  for doc in pr_results:
469
  output += doc.page_content[:600] + "\n---\n"
470
  else:
471
- output += "(No Prepared Remarks data found.)\n"
 
 
 
472
 
473
- output += "\n=== Q&A SESSION (live analyst questions & management responses) ===\n"
474
  if qa_results:
 
475
  for doc in qa_results:
476
  output += doc.page_content[:600] + "\n---\n"
477
- else:
478
- output += "(No Q&A Session data found — transcript may not have contained a Q&A segment.)\n"
 
 
 
 
 
 
 
 
479
 
480
- output += (
481
- "\nINSTRUCTION: Compare the tone, confidence, and specificity between "
482
- "Prepared Remarks and Q&A. Note any divergence where management was more "
483
- "cautious, evasive, or forthcoming in one section vs the other."
484
- )
485
  return output
486
 
487
  except Exception as e:
488
- return f"Error retrieving divergence data: {e}"
489
 
490
 
491
  @tool
@@ -547,4 +567,4 @@ def get_earnings_keyword_trends(ticker: str) -> str:
547
  return header + "\n".join(rows)
548
 
549
  except Exception as e:
550
- return f"Error loading keyword trends: {e}"
 
1
  """
2
  Earnings-call ingest + inference tools.
3
 
4
+ Ingest layer - fetch transcript (Financial Modeling Prep → SEC 8-K fallback),
5
  normalize into Prepared Remarks / Q&A segments,
6
  extract keyword counts, and embed into ChromaDB.
7
 
8
+ Inference layer - LangGraph @tool functions for retrieval,
9
  sentiment divergence, and keyword trend analysis.
10
+
11
+ Primary API: Financial Modeling Prep (FMP) — free tier, 250 req/day.
12
+ Sign up: https://financialmodelingprep.com/developer/docs
13
+ Endpoint: GET /api/v3/earning_call_transcript/{symbol}?year=YYYY&quarter=N&apikey=KEY
14
  """
15
 
16
  import json
 
74
 
75
 
76
  def _quarter_to_month(q: int) -> str:
77
+ """Map fiscal quarter to approximate month — used by the SEC 8-K fallback."""
78
  return {1: "03", 2: "06", 3: "09", 4: "12"}[q]
79
 
80
 
 
82
  # Transcript fetchers
83
  # ---------------------------------------------------------------------------
84
 
85
+ def fetch_transcript_fmp(
86
  ticker: str, quarter: int, year: int, api_key: str
87
  ) -> Optional[str]:
88
  """
89
+ Fetch an earnings-call transcript from Financial Modeling Prep (FMP).
90
+
91
+ Free tier: 250 requests / day — no premium required.
92
+ Sign up: https://financialmodelingprep.com/developer/docs
93
+
94
+ Endpoint:
95
+ GET https://financialmodelingprep.com/api/v3/earning_call_transcript/{symbol}
96
+ ?year=YYYY&quarter=N&apikey=KEY
97
+
98
+ Response schema (list, first element used):
99
+ [{"symbol": "AAPL", "quarter": 1, "year": 2025,
100
+ "date": "2025-01-30 00:00:00", "content": "<full transcript>"}]
101
+
102
+ Returns the full transcript string or None on failure.
103
  """
104
  if not api_key:
105
  return None
106
  url = (
107
+ f"https://financialmodelingprep.com/api/v3/earning_call_transcript/{ticker.upper()}"
108
+ f"?year={year}&quarter={quarter}&apikey={api_key}"
 
 
 
109
  )
110
  try:
111
+ print(f"[Earnings Ingest] Trying FMP for {ticker} Q{quarter}-{year}...")
112
  resp = requests.get(url, timeout=30)
113
  resp.raise_for_status()
114
  data = resp.json()
115
+
116
+ # FMP returns a list; first element holds the transcript
117
+ if isinstance(data, list) and data:
118
+ content = data[0].get("content", "")
119
+ if len(content) > 200:
120
+ print(f"[Earnings Ingest] FMP returned transcript ({len(content)} chars).")
121
+ return content
122
+ print(f"[Earnings Ingest] FMP returned empty/short content for {ticker} Q{quarter}-{year}.")
123
+ return None
124
+
125
+ # Error object returned (e.g. invalid key or no data for this quarter)
126
+ if isinstance(data, dict):
127
+ msg = data.get("Error Message") or data.get("message") or str(data)
128
+ print(f"[Earnings Ingest] FMP error: {msg[:120]}")
 
 
129
  return None
130
  except Exception as e:
131
+ print(f"[Earnings Ingest] FMP fetch failed: {e}")
132
  return None
133
 
134
 
 
221
  "ticker": ..., "quarter": ..., "year": ...,
222
  "prepared_remarks": str,
223
  "qa_session": str,
224
+ "source": "fmp" | "sec_8k",
225
  }
226
  """
227
  text_lower = raw_text.lower()
 
236
  prepared = raw_text[:split_pos].strip()
237
  qa = raw_text[split_pos:].strip()
238
  else:
239
+ # SEC 8-K filings don't contain a Q&A section — treat entire text as prepared remarks
240
  prepared = raw_text.strip()
241
  qa = ""
242
 
 
321
  ) -> str:
322
  """
323
  Full ingest pipeline for one ticker/quarter pair.
324
+ Returns a status string: 'success', 'exists', or 'failed'.
325
  """
326
  ticker = ticker.upper()
327
  collection_dir = os.path.join(chroma_path, f"{ticker}_earnings")
 
333
  print(f"[Earnings Ingest] Q{quarter}-{year} for {ticker} already ingested. Skipping.")
334
  return "exists"
335
 
336
+ # 1. Fetch transcript: FMP (free) → SEC 8-K fallback
337
+ raw_text = fetch_transcript_fmp(ticker, quarter, year, api_key)
338
+ source = "fmp" if raw_text else None
339
 
340
  if not raw_text:
341
  raw_text = fetch_transcript_sec_8k(ticker, quarter, year)
 
383
  docs.extend(splitter.split_documents([qa_doc]))
384
 
385
  if not docs:
386
+ _save_metadata(chroma_path, ticker, quarter, year, keywords, "failed")
387
+ return "failed"
388
 
389
  print(f"[Earnings Ingest] Embedding {len(docs)} chunks into {collection_dir}...")
390
  embeddings = get_cached_embeddings()
 
394
  persist_directory=collection_dir,
395
  )
396
 
397
+ # SEC 8-K filings often lack a Q&A section — this is a successful fallback
398
+ status = "success"
399
  _save_metadata(chroma_path, ticker, quarter, year, keywords, status)
400
+ print(f"[Earnings Ingest] {ticker} Q{quarter}-{year} ingested ({status}, source={source}).")
401
  return status
402
 
403
 
 
432
  results = db.similarity_search(query, k=3)
433
 
434
  if not results:
435
+ return f"No earnings data matched '{query}' for {ticker}. Try broadening your search terms."
436
 
437
  output_parts = [f"EARNINGS CALL SEARCH RESULTS FOR {ticker.upper()} — '{query}':\n"]
438
  total_chars = 0
 
456
  Retrieves evidence from both Prepared Remarks and Q&A sections of the
457
  most recent earnings call for a ticker. Use this to analyze whether
458
  management tone differs between the scripted portion and live Q&A.
459
+ When only prepared remarks are available (e.g. from an SEC 8-K filing),
460
+ performs a single-section tone analysis instead.
461
  CRITICAL: The ticker's earnings data must already be ingested.
462
  """
463
  try:
 
475
  filter={"section": "Q&A Session"},
476
  )
477
 
478
+ output = f"EARNINGS TONE ANALYSIS FOR {ticker.upper()}:\n\n"
479
 
480
+ output += "=== MANAGEMENT COMMENTARY ===\n"
481
  if pr_results:
482
  for doc in pr_results:
483
  output += doc.page_content[:600] + "\n---\n"
484
  else:
485
+ # Fallback: search without section filter
486
+ fallback = db.similarity_search("management outlook guidance performance", k=3)
487
+ for doc in fallback:
488
+ output += doc.page_content[:600] + "\n---\n"
489
 
 
490
  if qa_results:
491
+ output += "\n=== ANALYST Q&A ===\n"
492
  for doc in qa_results:
493
  output += doc.page_content[:600] + "\n---\n"
494
+ output += (
495
+ "\nINSTRUCTION: Compare the tone, confidence, and specificity between "
496
+ "the Management Commentary and Analyst Q&A sections. Note any divergence "
497
+ "where management was more cautious, evasive, or forthcoming under questioning."
498
+ )
499
+ output += (
500
+ "\nINSTRUCTION: Analyze the tone, confidence, and specificity of the "
501
+ "management commentary above. (Note: Only management commentary was found, typical of SEC 8-K filings). "
502
+ "Identify forward-looking statements, hedging language, areas of emphasis, and any notable risks or opportunities mentioned."
503
+ )
504
 
 
 
 
 
 
505
  return output
506
 
507
  except Exception as e:
508
+ return f"Error retrieving tone analysis data: {e}"
509
 
510
 
511
  @tool
 
567
  return header + "\n".join(rows)
568
 
569
  except Exception as e:
570
+ return f"Error loading keyword trends: {e}"
core/graph_builder.py CHANGED
@@ -225,9 +225,10 @@ Write the memo using this structure and markdown headings:
225
  Bullet points. Use ONLY numbers, metrics, and quotes that appear in the specialist outputs. If a section had no data, say "No quantitative/fundamental/sentiment data provided" as appropriate.
226
 
227
  ## Earnings Call Insights
228
- If Earnings_Agent data is present, summarize:
229
- - Sentiment divergence between Prepared Remarks and Q&A (was management more cautious or bullish in live Q&A vs. scripted remarks?).
230
- - Notable keyword/entity trends across quarters (e.g., increasing mentions of "AI", declining mentions of "headwinds").
 
231
  If no earnings data was provided, omit this section entirely.
232
 
233
  ## Risks, Sentiment, and Context
@@ -330,8 +331,10 @@ def build_financial_graph(llm):
330
  "3. get_earnings_keyword_trends: Track keyword frequency changes across quarters.\n\n"
331
  "CRITICAL RULES:\n"
332
  "- You MUST call at least one tool. Do NOT answer from memory.\n"
333
- "- If a tool returns an error about missing data, report that the earnings data for that "
 
334
  "ticker/quarter has not been ingested and suggest running the ingest script.\n"
 
335
  "- After the tool returns, write a clear, evidence-backed analysis. Bold key findings.\n"
336
  "- Do NOT add conversational filler. Do NOT ask follow-up questions."
337
  ),
 
225
  Bullet points. Use ONLY numbers, metrics, and quotes that appear in the specialist outputs. If a section had no data, say "No quantitative/fundamental/sentiment data provided" as appropriate.
226
 
227
  ## Earnings Call Insights
228
+ If Earnings_Agent data is present, summarize management's key messages and guidance.
229
+ - If both Prepared Remarks and Q&A are present, analyze any sentiment divergence (e.g., was management more cautious in live Q&A?).
230
+ - If only Prepared Remarks are available (typical for SEC-8 / 8-K filings), focus the analysis on the tone and specificity of the management commentary.
231
+ - Note any notable keyword/entity trends across quarters (e.g., AI mentions).
232
  If no earnings data was provided, omit this section entirely.
233
 
234
  ## Risks, Sentiment, and Context
 
331
  "3. get_earnings_keyword_trends: Track keyword frequency changes across quarters.\n\n"
332
  "CRITICAL RULES:\n"
333
  "- You MUST call at least one tool. Do NOT answer from memory.\n"
334
+ "- SEC filings (Form 8-K / SEC-8) are a valid source. They typically only contain Prepared Remarks and LACK a Q&A session. This is common and NOT a failure of the data.\n"
335
+ "- If a tool returns an error about missing data (e.g., no filings found), report that the earnings data for that "
336
  "ticker/quarter has not been ingested and suggest running the ingest script.\n"
337
+ "- If Q&A is missing, simply perform your analysis on the available management commentary.\n"
338
  "- After the tool returns, write a clear, evidence-backed analysis. Bold key findings.\n"
339
  "- Do NOT add conversational filler. Do NOT ask follow-up questions."
340
  ),
scripts/ingest_earnings_calls.py CHANGED
@@ -7,7 +7,7 @@ Usage:
7
  python scripts/ingest_earnings_calls.py --tickers TSLA --quarters Q1-2025
8
 
9
  Data sources (tried in order):
10
- 1. Alpha Vantage EARNINGS_CALL_TRANSCRIPT (requires premium key)
11
  2. SEC EDGAR 8-K filings (free, always available)
12
  """
13
 
@@ -45,7 +45,7 @@ def main():
45
  args = parser.parse_args()
46
 
47
  settings = Settings()
48
- api_key = settings.alpha_vantage_api_key or os.getenv("ALPHA_VANTAGE_API_KEY", "")
49
  chroma_path = settings.earnings_chroma_path
50
 
51
  os.makedirs(chroma_path, exist_ok=True)
@@ -89,9 +89,13 @@ def main():
89
  print("INGEST SUMMARY")
90
  print(f"{'=' * 50}")
91
  for r in results:
92
- icon = {"success": "✅", "partial": "⚠️", "failed": "❌", "exists": "⏭️", "error": "💥"}.get(
93
- r["status"], "❓"
94
- )
 
 
 
 
95
  print(f" {icon} {r['ticker']} {r['quarter']}: {r['status']}")
96
 
97
  failed = [r for r in results if r["status"] in ("failed", "error")]
 
7
  python scripts/ingest_earnings_calls.py --tickers TSLA --quarters Q1-2025
8
 
9
  Data sources (tried in order):
10
+ 1. Financial Modeling Prep (FMP) (free tier, 250 req/day)
11
  2. SEC EDGAR 8-K filings (free, always available)
12
  """
13
 
 
45
  args = parser.parse_args()
46
 
47
  settings = Settings()
48
+ api_key = settings.fmp_api_key or os.getenv("FMP_API_KEY", "")
49
  chroma_path = settings.earnings_chroma_path
50
 
51
  os.makedirs(chroma_path, exist_ok=True)
 
89
  print("INGEST SUMMARY")
90
  print(f"{'=' * 50}")
91
  for r in results:
92
+ icon = {
93
+ "success": "✅",
94
+ "partial": "🟡",
95
+ "failed": "❌",
96
+ "exists": "⏭️",
97
+ "error": "💥",
98
+ }.get(r["status"], "❓")
99
  print(f" {icon} {r['ticker']} {r['quarter']}: {r['status']}")
100
 
101
  failed = [r for r in results if r["status"] in ("failed", "error")]