Spaces:
Running
Running
| from __future__ import annotations | |
| from typing import Any | |
| from urllib.parse import urlparse | |
| def _user_fields(user: dict[str, Any] | None) -> dict[str, Any]: | |
| user = user or {} | |
| return { | |
| "author_login": user.get("login"), | |
| "author_id": user.get("id"), | |
| "author_node_id": user.get("node_id"), | |
| "author_type": user.get("type"), | |
| "author_site_admin": user.get("site_admin"), | |
| } | |
| def _labels(labels: list[dict[str, Any]] | None) -> list[str]: | |
| return [ | |
| name | |
| for label in labels or [] | |
| if isinstance(label, dict) and isinstance((name := label.get("name")), str) and name | |
| ] | |
| def _assignees(users: list[dict[str, Any]] | None) -> list[str]: | |
| return [ | |
| login | |
| for user in users or [] | |
| if isinstance(user, dict) and isinstance((login := user.get("login")), str) and login | |
| ] | |
| def issue_url_to_number(issue_url: str | None) -> int | None: | |
| if not issue_url: | |
| return None | |
| path = urlparse(issue_url).path.rstrip("/") | |
| tail = path.rsplit("/", 1)[-1] | |
| try: | |
| return int(tail) | |
| except ValueError: | |
| return None | |
def normalize_issue(
    repo: str, item: dict[str, Any], snapshot_id: str, extracted_at: str
) -> dict[str, Any]:
    """Flatten one GitHub issue API object into a single normalized row."""
    get = item.get
    milestone = get("milestone") or {}
    row = {
        "repo": repo,
        "github_id": get("id"),
        "github_node_id": get("node_id"),
        "number": get("number"),
        "html_url": get("html_url"),
        "api_url": get("url"),
        "title": get("title"),
        "body": get("body"),
        "state": get("state"),
        "state_reason": get("state_reason"),
        "locked": get("locked"),
        "comments_count": get("comments"),
        "labels": _labels(get("labels")),
        "assignees": _assignees(get("assignees")),
        "created_at": get("created_at"),
        "updated_at": get("updated_at"),
        "closed_at": get("closed_at"),
        "author_association": get("author_association"),
        "milestone_title": milestone.get("title"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
    }
    # Author columns come from the flattened user object.
    row.update(_user_fields(get("user")))
    return row
def normalize_pull_request(
    repo: str,
    issue_stub: dict[str, Any],
    pr_detail: dict[str, Any],
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Merge an issue-list stub and a full PR detail payload into one row.

    Issue-level fields are read from the stub; id/node_id/author_association
    prefer the PR detail but fall back to the stub. Merge status, diff stats,
    and branch info come only from the detail payload.
    """
    stub = issue_stub.get
    detail = pr_detail.get
    head = detail("head") or {}
    base = detail("base") or {}
    row = {
        "repo": repo,
        "github_id": detail("id") or stub("id"),
        "github_node_id": detail("node_id") or stub("node_id"),
        "number": stub("number"),
        "html_url": stub("html_url"),
        "api_url": stub("url"),
        "title": stub("title"),
        "body": stub("body"),
        "state": stub("state"),
        "state_reason": stub("state_reason"),
        "locked": stub("locked"),
        "comments_count": stub("comments"),
        "labels": _labels(stub("labels")),
        "assignees": _assignees(stub("assignees")),
        "created_at": stub("created_at"),
        "updated_at": stub("updated_at"),
        "closed_at": stub("closed_at"),
        "author_association": stub("author_association") or detail("author_association"),
        "merged_at": detail("merged_at"),
        "merge_commit_sha": detail("merge_commit_sha"),
        "merged": detail("merged"),
        "mergeable": detail("mergeable"),
        "mergeable_state": detail("mergeable_state"),
        "draft": detail("draft"),
        "additions": detail("additions"),
        "deletions": detail("deletions"),
        "changed_files": detail("changed_files"),
        "commits": detail("commits"),
        "review_comments_count": detail("review_comments"),
        "maintainer_can_modify": detail("maintainer_can_modify"),
        "head_ref": head.get("ref"),
        "head_sha": head.get("sha"),
        "head_repo_full_name": (head.get("repo") or {}).get("full_name"),
        "base_ref": base.get("ref"),
        "base_sha": base.get("sha"),
        "base_repo_full_name": (base.get("repo") or {}).get("full_name"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
    }
    # Author columns are taken from the stub's user, matching normalize_issue.
    row.update(_user_fields(stub("user")))
    return row
def normalize_comment(
    repo: str,
    item: dict[str, Any],
    parent_kind: str,
    parent_number: int | None,
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Flatten an issue/PR comment payload into a normalized row."""
    row: dict[str, Any] = {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "parent_kind": parent_kind,
        "parent_number": parent_number,
    }
    # Columns copied straight from the API object (output key, source key).
    for dest, src in (
        ("html_url", "html_url"),
        ("api_url", "url"),
        ("issue_api_url", "issue_url"),
        ("body", "body"),
        ("created_at", "created_at"),
        ("updated_at", "updated_at"),
        ("author_association", "author_association"),
    ):
        row[dest] = item.get(src)
    row["snapshot_id"] = snapshot_id
    row["extracted_at"] = extracted_at
    row.update(_user_fields(item.get("user")))
    return row
def normalize_review(
    repo: str, pr_number: int, item: dict[str, Any], snapshot_id: str, extracted_at: str
) -> dict[str, Any]:
    """Flatten a pull-request review payload into a normalized row."""
    row: dict[str, Any] = {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "pull_request_number": pr_number,
    }
    # Columns copied straight from the API object (output key, source key).
    for dest, src in (
        ("html_url", "html_url"),
        ("api_url", "url"),
        ("body", "body"),
        ("state", "state"),
        ("submitted_at", "submitted_at"),
        ("commit_id", "commit_id"),
        ("author_association", "author_association"),
    ):
        row[dest] = item.get(src)
    row["snapshot_id"] = snapshot_id
    row["extracted_at"] = extracted_at
    row.update(_user_fields(item.get("user")))
    return row
def normalize_review_comment(
    repo: str, pr_number: int, item: dict[str, Any], snapshot_id: str, extracted_at: str
) -> dict[str, Any]:
    """Flatten a PR review (inline diff) comment payload into a normalized row."""
    row: dict[str, Any] = {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "pull_request_number": pr_number,
        "review_id": item.get("pull_request_review_id"),
    }
    # Columns copied straight from the API object (output key, source key).
    for dest, src in (
        ("html_url", "html_url"),
        ("api_url", "url"),
        ("pull_request_api_url", "pull_request_url"),
        ("body", "body"),
        ("path", "path"),
        ("commit_id", "commit_id"),
        ("original_commit_id", "original_commit_id"),
        ("position", "position"),
        ("original_position", "original_position"),
        ("line", "line"),
        ("start_line", "start_line"),
        ("side", "side"),
        ("start_side", "start_side"),
        ("subject_type", "subject_type"),
        ("created_at", "created_at"),
        ("updated_at", "updated_at"),
        ("author_association", "author_association"),
    ):
        row[dest] = item.get(src)
    row["snapshot_id"] = snapshot_id
    row["extracted_at"] = extracted_at
    row.update(_user_fields(item.get("user")))
    return row
def normalize_pr_file(
    repo: str,
    pr_number: int,
    item: dict[str, Any],
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Normalize one changed-file entry from a PR's files listing."""
    row: dict[str, Any] = {"repo": repo, "pull_request_number": pr_number}
    # These keys pass through unchanged from the API object.
    for field in (
        "sha",
        "filename",
        "status",
        "additions",
        "deletions",
        "changes",
        "blob_url",
        "raw_url",
        "contents_url",
        "previous_filename",
        "patch",
    ):
        row[field] = item.get(field)
    row["snapshot_id"] = snapshot_id
    row["extracted_at"] = extracted_at
    return row
| def normalize_pr_diff( | |
| repo: str, | |
| pr_number: int, | |
| html_url: str | None, | |
| api_url: str | None, | |
| diff: str, | |
| snapshot_id: str, | |
| extracted_at: str, | |
| ) -> dict[str, Any]: | |
| return { | |
| "repo": repo, | |
| "pull_request_number": pr_number, | |
| "html_url": html_url, | |
| "api_url": api_url, | |
| "diff": diff, | |
| "snapshot_id": snapshot_id, | |
| "extracted_at": extracted_at, | |
| } | |
def normalize_timeline_event(
    repo: str,
    number: int,
    parent_kind: str,
    item: dict[str, Any],
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Flatten a timeline event for an issue or PR into a normalized row.

    Nested objects (``actor``, ``label``, ``source.issue``) are optional and
    degrade to ``None`` columns when absent.
    """
    actor = item.get("actor") or {}
    label = item.get("label") or {}
    src_issue = (item.get("source") or {}).get("issue") or {}
    return {
        "repo": repo,
        "parent_kind": parent_kind,
        "parent_number": number,
        "event": item.get("event"),
        "created_at": item.get("created_at"),
        "actor_login": actor.get("login"),
        "source_issue_number": src_issue.get("number"),
        "source_issue_title": src_issue.get("title"),
        "source_issue_url": src_issue.get("html_url"),
        "commit_id": item.get("commit_id"),
        "label_name": label.get("name"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
    }