from __future__ import annotations

from typing import Any
from urllib.parse import urlparse


def _user_fields(user: dict[str, Any] | None) -> dict[str, Any]:
    """Flatten a user payload into ``author_*`` columns.

    A missing (``None``/empty) user yields the same keys, all set to ``None``,
    so every record carries an identical set of author columns.
    """
    user = user or {}
    return {
        "author_login": user.get("login"),
        "author_id": user.get("id"),
        "author_node_id": user.get("node_id"),
        "author_type": user.get("type"),
        "author_site_admin": user.get("site_admin"),
    }


def _labels(labels: list[dict[str, Any]] | None) -> list[str]:
    """Return the label names found in *labels*, in their original order.

    Entries that are not dicts, or whose ``name`` is missing, not a string,
    or empty, are skipped. ``None`` is treated as an empty list.
    """
    return [
        name
        for label in labels or []
        if isinstance(label, dict)
        and isinstance((name := label.get("name")), str)
        and name
    ]


def _assignees(users: list[dict[str, Any]] | None) -> list[str]:
    """Return the logins found in *users*, in their original order.

    Entries that are not dicts, or whose ``login`` is missing, not a string,
    or empty, are skipped. ``None`` is treated as an empty list.
    """
    return [
        login
        for user in users or []
        if isinstance(user, dict)
        and isinstance((login := user.get("login")), str)
        and login
    ]


def issue_url_to_number(issue_url: str | None) -> int | None:
    """Extract the trailing numeric path segment of *issue_url*.

    Query string and fragment are ignored (only the parsed path is used) and
    trailing slashes are dropped before the last segment is taken.

    Returns:
        The last path segment as an ``int``, or ``None`` when the URL is
        empty/``None`` or the segment is not a plain run of ASCII digits.
    """
    if not issue_url:
        return None

    path = urlparse(issue_url).path.rstrip("/")
    # Surrounding whitespace is tolerated, as a bare int() call would.
    tail = path.rsplit("/", 1)[-1].strip()

    # int() alone is too permissive for an issue number: it accepts signs
    # ("-5", "+7"), underscore separators ("1_000") and non-ASCII digits.
    if not (tail.isascii() and tail.isdigit()):
        return None

    try:
        return int(tail)
    except ValueError:
        # Kept as a safety net: int() can still reject a digit string
        # (e.g. the interpreter's limit on very long integer strings).
        return None


def normalize_issue(
    repo: str, item: dict[str, Any], snapshot_id: str, extracted_at: str
) -> dict[str, Any]:
    """Build a flat issue record from an issue payload.

    Args:
        repo: Repository identifier stored verbatim in the record.
        item: Issue payload; absent keys become ``None`` in the record.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of issue columns followed by the ``author_*`` columns derived
        from ``item["user"]``.
    """
    return {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "number": item.get("number"),
        "html_url": item.get("html_url"),
        "api_url": item.get("url"),
        "title": item.get("title"),
        "body": item.get("body"),
        "state": item.get("state"),
        "state_reason": item.get("state_reason"),
        "locked": item.get("locked"),
        "comments_count": item.get("comments"),
        "labels": _labels(item.get("labels")),
        "assignees": _assignees(item.get("assignees")),
        "created_at": item.get("created_at"),
        "updated_at": item.get("updated_at"),
        "closed_at": item.get("closed_at"),
        "author_association": item.get("author_association"),
        # A null/missing milestone yields a None title instead of raising.
        "milestone_title": (item.get("milestone") or {}).get("title"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
        **_user_fields(item.get("user")),
    }


def normalize_pull_request(
    repo: str,
    issue_stub: dict[str, Any],
    pr_detail: dict[str, Any],
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Build a flat pull-request record from two payloads.

    Issue-level columns (number, title, state, labels, ...) are read from
    *issue_stub*; merge/diff/branch columns are read from *pr_detail*.
    ``github_id`` and ``github_node_id`` prefer *pr_detail* and fall back to
    *issue_stub*; ``author_association`` prefers *issue_stub* and falls back
    to *pr_detail*. Fallback uses ``or``, so any falsy value triggers it.

    Args:
        repo: Repository identifier stored verbatim in the record.
        issue_stub: Issue-shaped payload for the pull request.
        pr_detail: Pull-request detail payload.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of pull-request columns followed by the ``author_*`` columns
        derived from ``issue_stub["user"]``.
    """
    # Null/missing head or base collapse to {} so lookups below yield None.
    head = pr_detail.get("head") or {}
    base = pr_detail.get("base") or {}
    return {
        "repo": repo,
        "github_id": pr_detail.get("id") or issue_stub.get("id"),
        "github_node_id": pr_detail.get("node_id") or issue_stub.get("node_id"),
        "number": issue_stub.get("number"),
        "html_url": issue_stub.get("html_url"),
        "api_url": issue_stub.get("url"),
        "title": issue_stub.get("title"),
        "body": issue_stub.get("body"),
        "state": issue_stub.get("state"),
        "state_reason": issue_stub.get("state_reason"),
        "locked": issue_stub.get("locked"),
        "comments_count": issue_stub.get("comments"),
        "labels": _labels(issue_stub.get("labels")),
        "assignees": _assignees(issue_stub.get("assignees")),
        "created_at": issue_stub.get("created_at"),
        "updated_at": issue_stub.get("updated_at"),
        "closed_at": issue_stub.get("closed_at"),
        "author_association": issue_stub.get("author_association")
        or pr_detail.get("author_association"),
        "merged_at": pr_detail.get("merged_at"),
        "merge_commit_sha": pr_detail.get("merge_commit_sha"),
        "merged": pr_detail.get("merged"),
        "mergeable": pr_detail.get("mergeable"),
        "mergeable_state": pr_detail.get("mergeable_state"),
        "draft": pr_detail.get("draft"),
        "additions": pr_detail.get("additions"),
        "deletions": pr_detail.get("deletions"),
        "changed_files": pr_detail.get("changed_files"),
        "commits": pr_detail.get("commits"),
        "review_comments_count": pr_detail.get("review_comments"),
        "maintainer_can_modify": pr_detail.get("maintainer_can_modify"),
        "head_ref": head.get("ref"),
        "head_sha": head.get("sha"),
        "head_repo_full_name": (head.get("repo") or {}).get("full_name"),
        "base_ref": base.get("ref"),
        "base_sha": base.get("sha"),
        "base_repo_full_name": (base.get("repo") or {}).get("full_name"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
        **_user_fields(issue_stub.get("user")),
    }


def normalize_comment(
    repo: str,
    item: dict[str, Any],
    parent_kind: str,
    parent_number: int | None,
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Build a flat comment record from a comment payload.

    Args:
        repo: Repository identifier stored verbatim in the record.
        item: Comment payload; absent keys become ``None`` in the record.
        parent_kind: Kind of the parent the comment belongs to, stored verbatim.
        parent_number: Number of the parent, or ``None`` when unknown.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of comment columns followed by the ``author_*`` columns
        derived from ``item["user"]``.
    """
    return {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "parent_kind": parent_kind,
        "parent_number": parent_number,
        "html_url": item.get("html_url"),
        "api_url": item.get("url"),
        "issue_api_url": item.get("issue_url"),
        "body": item.get("body"),
        "created_at": item.get("created_at"),
        "updated_at": item.get("updated_at"),
        "author_association": item.get("author_association"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
        **_user_fields(item.get("user")),
    }


def normalize_review(
    repo: str, pr_number: int, item: dict[str, Any], snapshot_id: str, extracted_at: str
) -> dict[str, Any]:
    """Build a flat review record from a pull-request review payload.

    Args:
        repo: Repository identifier stored verbatim in the record.
        pr_number: Number of the pull request the review belongs to.
        item: Review payload; absent keys become ``None`` in the record.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of review columns followed by the ``author_*`` columns
        derived from ``item["user"]``.
    """
    return {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "pull_request_number": pr_number,
        "html_url": item.get("html_url"),
        "api_url": item.get("url"),
        "body": item.get("body"),
        "state": item.get("state"),
        "submitted_at": item.get("submitted_at"),
        "commit_id": item.get("commit_id"),
        "author_association": item.get("author_association"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
        **_user_fields(item.get("user")),
    }


def normalize_review_comment(
    repo: str, pr_number: int, item: dict[str, Any], snapshot_id: str, extracted_at: str
) -> dict[str, Any]:
    """Build a flat record from a pull-request review-comment payload.

    Args:
        repo: Repository identifier stored verbatim in the record.
        pr_number: Number of the pull request the comment belongs to.
        item: Review-comment payload; absent keys become ``None``.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of review-comment columns followed by the ``author_*`` columns
        derived from ``item["user"]``.
    """
    return {
        "repo": repo,
        "github_id": item.get("id"),
        "github_node_id": item.get("node_id"),
        "pull_request_number": pr_number,
        "review_id": item.get("pull_request_review_id"),
        "html_url": item.get("html_url"),
        "api_url": item.get("url"),
        "pull_request_api_url": item.get("pull_request_url"),
        "body": item.get("body"),
        "path": item.get("path"),
        "commit_id": item.get("commit_id"),
        "original_commit_id": item.get("original_commit_id"),
        "position": item.get("position"),
        "original_position": item.get("original_position"),
        "line": item.get("line"),
        "start_line": item.get("start_line"),
        "side": item.get("side"),
        "start_side": item.get("start_side"),
        "subject_type": item.get("subject_type"),
        "created_at": item.get("created_at"),
        "updated_at": item.get("updated_at"),
        "author_association": item.get("author_association"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
        **_user_fields(item.get("user")),
    }


def normalize_pr_file(
    repo: str,
    pr_number: int,
    item: dict[str, Any],
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Build a flat record for one file changed by a pull request.

    Args:
        repo: Repository identifier stored verbatim in the record.
        pr_number: Number of the pull request the file belongs to.
        item: File payload; absent keys become ``None`` in the record.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of file columns (no author columns are attached).
    """
    return {
        "repo": repo,
        "pull_request_number": pr_number,
        "sha": item.get("sha"),
        "filename": item.get("filename"),
        "status": item.get("status"),
        "additions": item.get("additions"),
        "deletions": item.get("deletions"),
        "changes": item.get("changes"),
        "blob_url": item.get("blob_url"),
        "raw_url": item.get("raw_url"),
        "contents_url": item.get("contents_url"),
        "previous_filename": item.get("previous_filename"),
        "patch": item.get("patch"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
    }


def normalize_pr_diff(
    repo: str,
    pr_number: int,
    html_url: str | None,
    api_url: str | None,
    diff: str,
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Wrap a pull request's diff text and identifying fields in a record.

    All arguments are stored verbatim under the corresponding keys; nothing
    is parsed or validated.
    """
    return {
        "repo": repo,
        "pull_request_number": pr_number,
        "html_url": html_url,
        "api_url": api_url,
        "diff": diff,
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
    }


def normalize_timeline_event(
    repo: str,
    number: int,
    parent_kind: str,
    item: dict[str, Any],
    snapshot_id: str,
    extracted_at: str,
) -> dict[str, Any]:
    """Build a flat record from a timeline event payload.

    Nested ``actor``, ``label`` and ``source.issue`` objects are optional;
    when one is null or missing, the columns derived from it are ``None``.

    Args:
        repo: Repository identifier stored verbatim in the record.
        number: Number of the parent, stored as ``parent_number``.
        parent_kind: Kind of the parent the event belongs to, stored verbatim.
        item: Event payload; absent keys become ``None`` in the record.
        snapshot_id: Snapshot identifier stored verbatim.
        extracted_at: Extraction timestamp stored verbatim.

    Returns:
        A dict of timeline-event columns.
    """
    source = item.get("source") or {}
    issue = source.get("issue") or {}
    return {
        "repo": repo,
        "parent_kind": parent_kind,
        "parent_number": number,
        "event": item.get("event"),
        "created_at": item.get("created_at"),
        "actor_login": (item.get("actor") or {}).get("login"),
        "source_issue_number": issue.get("number"),
        "source_issue_title": issue.get("title"),
        "source_issue_url": issue.get("html_url"),
        "commit_id": item.get("commit_id"),
        "label_name": (item.get("label") or {}).get("name"),
        "snapshot_id": snapshot_id,
        "extracted_at": extracted_at,
    }