Spaces:

codey-lab
/

Check-Git

Sleeping

App Files Files Community

Check-Git / src /streamlit_app.py

Alibrown

Update src/streamlit_app.py

f1986b0 verified about 1 month ago

raw

history blame contribute delete

14 kB

	# ============================================
	# LICENSE & SOURCE
	# ============================================
	# Licensed under MIT + ESOL v1.1
	# Source: https://github.com/VolkanSah/Check-Git-ML-Repo-Analyzer
	#
	# What is Open Source? For scammers & newcomers:
	# Learn what Open Source IS and what it IS NOT:
	# https://github.com/Wall-of-Shames/What-is-Open-Source
	# ============================================
	import streamlit as st
	import requests
	import re
	import os
	import tempfile
	from typing import Dict, List, Tuple
	import json
	from huggingface_hub import InferenceClient

	# ============================================
	# STREAMLIT PERMISSION HACK by VolkanSah :D
	# ============================================
	# Point Streamlit at a scratch directory under the system temp dir
	# (presumably because the default home is not writable on the host -
	# confirm against the deployment environment).
	TEMP_STREAMLIT_HOME = os.path.join(tempfile.gettempdir(), "st_config_workaround")
	CONFIG_PATH = os.path.join(TEMP_STREAMLIT_HOME, "config.toml")

	os.makedirs(TEMP_STREAMLIT_HOME, exist_ok=True)

	# Redirect Streamlit's home and switch usage statistics off via environment.
	os.environ.update({
	    "STREAMLIT_HOME": TEMP_STREAMLIT_HOME,
	    "STREAMLIT_GATHER_USAGE_STATS": "false",
	})

	# Write a minimal config once; an existing file is left untouched.
	if not os.path.exists(CONFIG_PATH):
	    with open(CONFIG_PATH, "w") as config_file:
	        config_file.write("[browser]\ngatherUsageStats = false\n")

	# ============================================
	# LLM-POWERED ANALYZER
	# ============================================

	class MLRepoAnalyzerLLM:
	    """Classify a GitHub repository as real ML training code or an API wrapper.

	    The analyzer downloads up to ``MAX_FILES`` Python files of a public
	    repository, scores them with regular-expression heuristics and, when a
	    Hugging Face token is supplied, additionally asks a chat model to
	    classify each file. Both signals are merged into a single verdict by
	    ``classify_repo``.
	    """

	    # Maximum number of source characters embedded in a single LLM prompt.
	    MAX_SNIPPET_CHARS = 2000
	    # Chat model requested through the Hugging Face Inference client.
	    LLM_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
	    # Upper bound on the number of Python files downloaded per repository.
	    MAX_FILES = 10

	    def __init__(self, hf_token: str = None):
	        """Store the token and create the inference client when one is given.

	        Args:
	            hf_token: Hugging Face API token, or ``None`` for pattern-only mode.
	        """
	        self.hf_token = hf_token
	        # Always define the attribute so later access cannot raise
	        # AttributeError when no token was provided.
	        self.client = InferenceClient(token=hf_token) if hf_token else None

	        # Fallback patterns (used when no token is available).
	        # NOTE(review): the last two patterns use a single-character '.'
	        # before 'api'; possibly '.*' was intended - confirm upstream.
	        self.fake_indicators = [
	            r'openai\.', r'anthropic\.', r'cohere\.',
	            r'replicate\.', r'api\.mistral', r'groq\.',
	            r'requests\.post.api', r'urllib.api'
	        ]
	        self.legit_indicators = [
	            r'torch\.optim', r'loss\.backward\(\)', r'model\.train\(\)',
	            r'optimizer\.step\(\)', r'tf\.keras\.optimizers',
	            r'from\s+transformers\s+import\s+Trainer',
	            r'accelerator\.backward', r'DeepSpeed',
	            r'torch\.nn\.Module', r'forward\(self'
	        ]

	    def extract_repo_info(self, url: str) -> Tuple[str, str, str]:
	        """Extract ``(owner, repo, branch)`` from a GitHub URL.

	        The branch is taken from a ``/tree/<branch>`` suffix and defaults to
	        ``'main'``. Only the first path segment after ``/tree/`` is used, so
	        branch names containing ``/`` are not supported.

	        Raises:
	            ValueError: if the URL does not contain ``github.com/<owner>/<repo>``.
	        """
	        # Exclude whitespace, '?' and '#' so query strings, fragments and
	        # pasted trailing blanks never leak into the repo or branch name.
	        pattern = r'github\.com/([^/\s?#]+)/([^/\s?#]+)(?:/tree/([^/\s?#]+))?'
	        match = re.search(pattern, url)
	        if not match:
	            raise ValueError("Invalid GitHub URL")
	        owner, repo = match.group(1), match.group(2)
	        # Strip only a trailing '.git'; a blanket replace would corrupt
	        # names such as 'user.github.io'.
	        if repo.endswith('.git'):
	            repo = repo[:-len('.git')]
	        if not repo:
	            raise ValueError("Invalid GitHub URL")
	        branch = match.group(3) or 'main'
	        return owner, repo, branch

	    def fetch_repo_tree(self, owner: str, repo: str, branch: str) -> List[Dict]:
	        """Fetch the recursive file tree of a branch via the GitHub API.

	        Returns:
	            The ``tree`` list of the JSON response (empty list if absent).

	        Raises:
	            RuntimeError: if the API does not answer with HTTP 200.
	        """
	        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
	        response = requests.get(api_url, timeout=10)
	        if response.status_code != 200:
	            raise RuntimeError(f"GitHub API error: {response.status_code}")
	        return response.json().get('tree', [])

	    def fetch_file_content(self, owner: str, repo: str, branch: str, path: str) -> str:
	        """Fetch a raw file; return an empty string on any non-200 response."""
	        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
	        response = requests.get(raw_url, timeout=10)
	        return response.text if response.status_code == 200 else ""

	    def _build_prompt(self, code_snippet: str, filename: str) -> str:
	        """Build the classification prompt for one source file."""
	        # Truncate the code here rather than inside the prompt text so that
	        # no stray comment ends up in what the model reads.
	        snippet = code_snippet[:self.MAX_SNIPPET_CHARS]
	        return "\n".join([
	            f"Analyze this Python file from a machine learning repository: {filename}",
	            "",
	            "Code snippet:",
	            "```python",
	            snippet,
	            "```",
	            "",
	            "Determine if this is:",
	            "1. REAL ML TRAINING CODE (contains actual model training, backprop, optimizers)",
	            "2. API WRAPPER (just calls external APIs like OpenAI, Anthropic, etc.)",
	            "3. UNCLEAR",
	            "",
	            "Respond in JSON format:",
	            "{",
	            '    "classification": "REAL_TRAINING|API_WRAPPER|UNCLEAR",',
	            '    "confidence": 0-100,',
	            '    "reasoning": "brief explanation",',
	            '    "key_indicators": ["indicator1", "indicator2"]',
	            "}",
	        ])

	    @staticmethod
	    def _extract_json(text: str) -> Dict:
	        """Parse the JSON object contained in an LLM reply.

	        Accepts a fenced code block (with or without a ``json`` tag), a bare
	        JSON object, or an object surrounded by explanatory prose.

	        Raises:
	            ValueError: if no JSON object can be decoded
	                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
	        """
	        text = text or ''
	        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
	        if fenced:
	            candidate = fenced.group(1)
	        else:
	            # Fall back to the outermost brace pair in the reply.
	            start, end = text.find('{'), text.rfind('}')
	            candidate = text[start:end + 1] if start != -1 and end > start else text
	        parsed = json.loads(candidate)
	        if not isinstance(parsed, dict):
	            raise ValueError("LLM response is not a JSON object")
	        return parsed

	    def analyze_with_llm(self, code_snippet: str, filename: str) -> Dict:
	        """Classify one file with the Hugging Face Inference API.

	        Returns:
	            The decoded JSON object of the model's answer, or ``None`` when
	            no token is configured or the request/decoding fails (a
	            Streamlit warning is shown in the failure case).
	        """
	        if not self.hf_token or self.client is None:
	            return None

	        prompt = self._build_prompt(code_snippet, filename)

	        try:
	            response = self.client.chat_completion(
	                messages=[{"role": "user", "content": prompt}],
	                model=self.LLM_MODEL,
	                max_tokens=500,
	                temperature=0.1
	            )
	            return self._extract_json(response.choices[0].message.content)
	        except Exception as e:
	            # Best effort: one failing file must not abort the whole run.
	            st.warning(f"LLM analysis failed for {filename}: {e}")
	            return None

	    def analyze_file_structure(self, files: List[Dict]) -> Dict:
	        """Derive quick structural hints from the repository file list."""
	        paths = [entry['path'] for entry in files]
	        py_paths = [path for path in paths if path.endswith('.py')]

	        return {
	            'has_train_script': any('train' in path.lower() for path in py_paths),
	            'has_model_files': any('model' in path.lower() for path in py_paths),
	            'has_config': any(path.endswith(('.yaml', '.yml', '.json', '.toml')) for path in paths),
	            'has_requirements': any('requirements' in path or 'pyproject.toml' in path for path in paths),
	            'python_file_count': len(py_paths)
	        }

	    def analyze_with_patterns(self, content: str) -> Tuple[int, int]:
	        """Score content with the fallback patterns.

	        Returns:
	            ``(fake_score, legit_score)``: 5 points per matching fake
	            indicator and 10 points per matching legit indicator.
	        """
	        fake_score = sum(5 for pattern in self.fake_indicators if re.search(pattern, content, re.IGNORECASE))
	        legit_score = sum(10 for pattern in self.legit_indicators if re.search(pattern, content, re.IGNORECASE))
	        return fake_score, legit_score

	    def classify_repo(self, url: str, use_llm: bool = True) -> Dict:
	        """Analyze a repository and return the combined verdict.

	        Returns:
	            On success a dict with the keys ``verdict``, ``confidence``,
	            ``score``, ``structure``, ``llm_results``, ``pattern_scores``
	            and ``repo_info``; on any failure ``{'error': <message>}``.
	        """
	        try:
	            owner, repo, branch = self.extract_repo_info(url)
	            files = self.fetch_repo_tree(owner, repo, branch)

	            structure = self.analyze_file_structure(files)
	            py_files = [f for f in files if f['path'].endswith('.py')][:self.MAX_FILES]

	            llm_results = []
	            pattern_fake_score = 0
	            pattern_legit_score = 0

	            for file_info in py_files:
	                content = self.fetch_file_content(owner, repo, branch, file_info['path'])
	                if not content:
	                    continue

	                # LLM analysis (only when requested and a token is available)
	                if use_llm and self.hf_token:
	                    llm_result = self.analyze_with_llm(content, file_info['path'])
	                    if llm_result:
	                        llm_results.append({
	                            'file': file_info['path'],
	                            'result': llm_result
	                        })

	                # Pattern scores are always collected, also alongside the LLM
	                fake, legit = self.analyze_with_patterns(content)
	                pattern_fake_score += fake
	                pattern_legit_score += legit

	            total_score = pattern_legit_score - pattern_fake_score
	            if llm_results:
	                # .get() keeps a reply without a 'classification' key from
	                # aborting the whole analysis with a KeyError.
	                labels = [r['result'].get('classification') for r in llm_results]
	                llm_real_count = labels.count('REAL_TRAINING')
	                llm_fake_count = labels.count('API_WRAPPER')
	                # Each LLM vote weighs 30 points, more than any single pattern
	                total_score += (llm_real_count * 30) - (llm_fake_count * 30)

	            # Verdict
	            if total_score > 30:
	                verdict = "✅ LEGIT - Real ML Training Code"
	                confidence = "High"
	            elif total_score > 0:
	                verdict = "⚠️ MIXED - Contains some training code"
	                confidence = "Medium"
	            else:
	                verdict = "❌ FAKE - API Wrapper / No Real Training"
	                confidence = "High"

	            return {
	                'verdict': verdict,
	                'confidence': confidence,
	                'score': total_score,
	                'structure': structure,
	                'llm_results': llm_results,
	                'pattern_scores': {
	                    'fake': pattern_fake_score,
	                    'legit': pattern_legit_score
	                },
	                'repo_info': f"{owner}/{repo}@{branch}"
	            }

	        except Exception as e:
	            # Top-level boundary: the UI expects an error dict, not a raise.
	            return {'error': str(e)}

	# ============================================
	# STREAMLIT UI
	# ============================================

	# Page chrome: browser tab title/icon, wide layout, headline and tagline.
	st.set_page_config(
	    page_title="ML Repo Detector 🔍",
	    page_icon="🤖",
	    layout="wide",
	)

	st.title("🤖 ML Training Repo Analyzer (LLM-Powered)")
	st.markdown("AI-powered detection of fake ML repos using your HuggingFace token")

	# Sidebar: optional token entry, LLM toggle and a short explanation.
	with st.sidebar:
	    st.markdown("### 🔑 HuggingFace Setup")
	    hf_token = st.text_input(
	        "HF Token (optional)",
	        type="password",
	        help="Get your free token at https://huggingface.co/settings/tokens",
	    )

	    # The toggle defaults to on when a token is present and is locked
	    # (disabled) when there is none.
	    use_llm = st.checkbox(
	        "Use LLM Analysis",
	        value=bool(hf_token),
	        disabled=not hf_token,
	        help="Requires HF token. Uses Qwen2.5-Coder for deep analysis",
	    )

	    st.markdown("---")
	    st.markdown("### 🛠️ Models Used")
	    if not use_llm:
	        st.info("📊 Pattern Matching Only")
	    else:
	        st.success("✅ Qwen2.5-Coder-32B (Free)")

	    st.markdown("---")
	    st.markdown("### 💡 How it works")
	    st.markdown("""
	With LLM:
	- Deep code understanding
	- Context-aware analysis
	- Higher accuracy

	Without LLM:
	- Pattern matching
	- Regex-based detection
	- Still pretty good!
	""")

	# Main interface: build the analyzer, read the repository URL and render
	# the verdict, per-file LLM results, pattern scores and structure hints.
	analyzer = MLRepoAnalyzerLLM(hf_token=hf_token if hf_token else None)

	# Strip surrounding whitespace: a pasted trailing blank would otherwise end
	# up inside the parsed repository name and make the GitHub request fail.
	repo_url = (st.text_input(
	    "GitHub Repository URL",
	    placeholder="https://github.com/username/repo",
	    help="Enter a public GitHub repository URL"
	) or "").strip()

	col1, col2 = st.columns([1, 4])
	with col1:
	    analyze_btn = st.button("🚀 Analyze", type="primary", use_container_width=True)

	if analyze_btn:
	    if not repo_url:
	        st.error("Enter a GitHub URL!")
	    else:
	        with st.spinner("🔍 Analyzing repository..." + (" (using LLM)" if use_llm else " (pattern matching)")):
	            result = analyzer.classify_repo(repo_url, use_llm=use_llm and bool(hf_token))

	        if 'error' in result:
	            st.error(f"❌ Error: {result['error']}")
	        else:
	            # Verdict
	            st.markdown("---")
	            col1, col2, col3 = st.columns([3, 1, 1])
	            with col1:
	                st.markdown(f"## {result['verdict']}")
	            with col2:
	                st.metric("Confidence", result['confidence'])
	            with col3:
	                st.metric("Score", result['score'])

	            # LLM results (at most the first five files are shown)
	            if result.get('llm_results'):
	                st.markdown("### 🤖 LLM Analysis Results")
	                for llm_res in result['llm_results'][:5]:
	                    with st.expander(f"📄 {llm_res['file']}"):
	                        res = llm_res['result']

	                        col1, col2 = st.columns(2)
	                        with col1:
	                            classification = res.get('classification', 'UNKNOWN')
	                            if classification == 'REAL_TRAINING':
	                                st.success(f"✅ {classification}")
	                            elif classification == 'API_WRAPPER':
	                                st.error(f"❌ {classification}")
	                            else:
	                                st.warning(f"⚠️ {classification}")

	                        with col2:
	                            st.metric("Confidence", f"{res.get('confidence', 0)}%")

	                        st.markdown(f"Reasoning: {res.get('reasoning', 'N/A')}")

	                        indicators = res.get('key_indicators')
	                        if indicators:
	                            # The model may answer with one string instead
	                            # of a list; wrap it so it is not rendered
	                            # character by character.
	                            if isinstance(indicators, str):
	                                indicators = [indicators]
	                            st.markdown("Key Indicators:")
	                            for indicator in indicators:
	                                st.markdown(f"- {indicator}")

	            # Pattern analysis (fallback / additional signal)
	            st.markdown("### 📊 Pattern Analysis")
	            col1, col2 = st.columns(2)
	            with col1:
	                st.metric("Legit Patterns", result['pattern_scores']['legit'])
	            with col2:
	                st.metric("Fake Patterns", result['pattern_scores']['fake'])

	            # Structure
	            st.markdown("### 📁 Repository Structure")
	            struct = result['structure']
	            cols = st.columns(4)
	            with cols[0]:
	                st.metric("Python Files", struct['python_file_count'])
	            with cols[1]:
	                st.write("✅" if struct['has_train_script'] else "❌", "train.py")
	            with cols[2]:
	                st.write("✅" if struct['has_model_files'] else "❌", "model files")
	            with cols[3]:
	                st.write("✅" if struct['has_config'] else "❌", "configs")

	# Footer
	st.markdown("---")
	st.markdown("💡 Your HF token = your quota. No data stored. Analysis runs on HF's free inference API.")