diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..04be9f36b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,23 @@ +.git +__pycache__ +*.py[oc] +.venv +.env +.envrc +.ruff_cache +.mypy_cache +.pytest_cache +.claude +.coverage +.DS_Store +build +dist +wheels +*.egg-info +docs +site +.github +.qdrant_code_embeddings +CLAUDE.md +AGENTS.md +PROJECT.md diff --git a/.env.example b/.env.example index dc518b501..9e449e031 100644 --- a/.env.example +++ b/.env.example @@ -60,6 +60,17 @@ # CYPHER_MODEL=gemini-2.5-flash # CYPHER_API_KEY=your-google-api-key +# Example 6: LiteLLM with custom provider +# ORCHESTRATOR_PROVIDER=litellm_proxy +# ORCHESTRATOR_MODEL=gpt-oss:120b +# ORCHESTRATOR_ENDPOINT=http://litellm:4000/v1 +# ORCHESTRATOR_API_KEY=sk-your-litellm-key + +# CYPHER_PROVIDER=litellm_proxy +# CYPHER_MODEL=openrouter/gpt-oss:120b +# CYPHER_ENDPOINT=http://litellm:4000/v1 +# CYPHER_API_KEY=sk-your-litellm-key + # Thinking budget for reasoning models (optional) # ORCHESTRATOR_THINKING_BUDGET=10000 # CYPHER_THINKING_BUDGET=5000 @@ -68,9 +79,20 @@ MEMGRAPH_HOST=localhost MEMGRAPH_PORT=7687 MEMGRAPH_HTTP_PORT=7444 +# Memgraph authentication credentials +# Leave MEMGRAPH_USERNAME empty (or omit it) if your Memgraph instance doesn't require authentication +# If authentication is enabled, provide both username and password +# Common defaults: username=neo4j, password=password (or your custom credentials) +MEMGRAPH_USERNAME= +MEMGRAPH_PASSWORD= LAB_PORT=3000 MEMGRAPH_BATCH_SIZE=1000 +# Qdrant settings +# Leave QDRANT_URL unset to use local file mode (only suitable below ~20k embeddings) +# For larger codebases, run the bundled docker-compose service and point at it: +# QDRANT_URL=http://localhost:6333 + # Repository settings TARGET_REPO_PATH=. 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..49ff9c712 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @vitali87 diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index d5f29c336..163b5ae21 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -3,5 +3,4 @@ github: vitali87 buy_me_a_coffee: vitali87 -# Uncomment and add username when you set up Patreon: -# patreon: YOUR_USERNAME +patreon: vitali87 diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 4b6f8f59b..2c5488f8e 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,8 @@ -blank_issues_enabled: false +blank_issues_enabled: true contact_links: - - name: 💬 Discussions - url: https://github.com/vitali87/code-graph-rag/discussions - about: Ask questions and discuss ideas with the community - name: 📚 Documentation - url: https://github.com/vitali87/code-graph-rag#readme + url: https://codeberg.org/vitali87/code-graph-rag about: Read the documentation and setup guides - name: 🎓 MCP Server Setup - url: https://github.com/vitali87/code-graph-rag/blob/main/docs/claude-code-setup.md + url: https://codeberg.org/vitali87/code-graph-rag/src/branch/main/docs/claude-code-setup.md about: Setup Code-Graph-RAG as an MCP server with Claude Code diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml index 0f84c3651..f3dbfcce1 100644 --- a/.github/ISSUE_TEMPLATE/documentation.yml +++ b/.github/ISSUE_TEMPLATE/documentation.yml @@ -31,7 +31,7 @@ body: attributes: label: URL (if applicable) description: Link to the documentation page - placeholder: "https://github.com/vitali87/code-graph-rag/blob/main/..." + placeholder: "https://codeberg.org/vitali87/code-graph-rag/src/branch/main/..." 
- type: textarea id: current-state diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml index 47d83bcd9..40150201c 100644 --- a/.github/ISSUE_TEMPLATE/question.yml +++ b/.github/ISSUE_TEMPLATE/question.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - Thank you for your question! For general discussions or open-ended questions, consider using [GitHub Discussions](https://github.com/vitali87/code-graph-rag/discussions). + Thank you for your question! Please keep questions concrete; for broader topics, prefer opening an [issue](https://codeberg.org/vitali87/code-graph-rag/issues) with the `question` label. - type: textarea id: question diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..a075b29ee --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,16 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "docker" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..8dc054f6c --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,38 @@ +## Summary + + + +- + +## Type of Change + + + +- [ ] Bug fix +- [ ] New feature +- [ ] Performance improvement +- [ ] Refactoring (no functional changes) +- [ ] Documentation +- [ ] CI/CD or tooling +- [ ] Dependencies + +## Related Issues + + + +## Test Plan + + + +- [ ] Unit tests pass (`make test-parallel` or `uv run pytest -n auto -m "not integration"`) +- [ ] New tests added +- [ ] Integration tests pass (`make test-integration`, requires Docker) +- [ ] Manual testing (describe below) + +## Checklist + +- [ ] PR title follows [Conventional Commits](https://www.conventionalcommits.org/) format +- [ ] All pre-commit checks pass (`make pre-commit`) 
+- [ ] No hardcoded strings in non-config/non-constants files +- [ ] No `# type: ignore`, `cast()`, `Any`, or `object` type hints +- [ ] No new comments or docstrings (code should be self-documenting) diff --git a/.github/workflows/build-binaries.yml b/.github/workflows/build-binaries.yml index c548d82ea..315cfa45a 100644 --- a/.github/workflows/build-binaries.yml +++ b/.github/workflows/build-binaries.yml @@ -8,10 +8,14 @@ on: release: types: [created] +permissions: read-all + jobs: build: name: Build ${{ matrix.platform }}-${{ matrix.arch }} runs-on: ${{ matrix.os }} + permissions: + contents: write timeout-minutes: 30 strategy: fail-fast: false @@ -32,18 +36,18 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 submodules: recursive - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" @@ -66,7 +70,7 @@ jobs: fi - name: Upload binary artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: code-graph-rag-${{ matrix.platform }}-${{ matrix.arch }} path: dist/code-graph-rag-* @@ -75,7 +79,39 @@ jobs: - name: Upload to release if: startsWith(github.ref, 'refs/tags/v') - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2 with: files: dist/code-graph-rag-* fail_on_unmatched_files: true + + sign-release: + name: Sign Release Artifacts + if: startsWith(github.ref, 'refs/tags/v') + needs: build + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + steps: + - name: Install cosign + uses: 
sigstore/cosign-installer@ba7bc0a3fef59531c69a25acd34668d6d3fe6f22 # v4.1.0 + + - name: Download all artifacts + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + with: + path: artifacts + merge-multiple: true + + - name: Sign artifacts + shell: bash + run: | + for f in artifacts/*; do + [ -f "$f" ] || continue + cosign sign-blob --yes --bundle "${f}.sigstore.json" "$f" + done + + - name: Upload signatures to release + uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2 + with: + files: artifacts/*.sigstore.json + fail_on_unmatched_files: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 43b0cc8db..a7742b439 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,8 @@ on: branches: [main, master, develop] workflow_dispatch: +permissions: read-all + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true @@ -19,16 +21,16 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -51,16 +53,16 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -75,7 +77,7 @@ jobs: test-unit: name: Unit 
Tests (${{ matrix.os }}) runs-on: ${{ matrix.os }} - timeout-minutes: 15 + timeout-minutes: 20 strategy: fail-fast: false matrix: @@ -83,19 +85,19 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive fetch-depth: 0 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -103,13 +105,19 @@ jobs: run: | uv sync --extra treesitter-full --extra test --extra semantic --group dev - - name: Run unit tests (parallel) + - name: Run unit tests (parallel, with coverage) + if: matrix.os == 'macos-latest' run: | uv run pytest -n auto -m "not integration" --tb=short --cov=codebase_rag --cov-report=xml --cov-report=term + - name: Run unit tests (parallel, no coverage) + if: matrix.os != 'macos-latest' + run: | + uv run pytest -n auto -m "not integration" --tb=short + - name: Upload coverage to Codecov - if: always() && secrets.CODECOV_TOKEN != '' - uses: codecov/codecov-action@v4 + if: always() && matrix.os == 'macos-latest' + uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad # v5.5.3 with: files: ./coverage.xml flags: unit-${{ matrix.os }} @@ -123,7 +131,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive fetch-depth: 0 @@ -133,7 +141,7 @@ jobs: docker run -d --name memgraph -p 7687:7687 memgraph/memgraph-platform:latest echo "Waiting for Memgraph to start..." 
for i in {1..30}; do - if docker exec memgraph echo "SELECT 1;" 2>/dev/null; then + if docker exec memgraph mgconsole --no-history -c "RETURN 1;" 2>/dev/null; then echo "Memgraph is ready!" break fi @@ -142,13 +150,13 @@ jobs: done - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -164,8 +172,8 @@ jobs: uv run pytest -m "integration" -v --tb=short --cov=codebase_rag --cov-report=xml --cov-report=term - name: Upload coverage to Codecov - if: always() && secrets.CODECOV_TOKEN != '' - uses: codecov/codecov-action@v4 + if: always() + uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad # v5.5.3 with: files: ./coverage.xml flags: integration-ubuntu-latest @@ -187,7 +195,7 @@ jobs: steps: - name: Check PR title format - uses: amannn/action-semantic-pull-request@v5 + uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 # v6.1.1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index ecd3732f3..6c0c48ebf 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -10,6 +10,8 @@ on: - "*.py" - "pyproject.toml" +permissions: read-all + jobs: claude-review: name: AI Code Review @@ -26,13 +28,13 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 1 - name: Run Claude Code Review id: claude-review - uses: anthropics/claude-code-action@beta + uses: anthropics/claude-code-action@28f83620103c48a57093dcc2837eec89e036bb9f # beta with: claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN 
}} diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 000000000..853e4df66 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,62 @@ +name: Docker Publish + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +permissions: read-all + +jobs: + build-and-push: + runs-on: ubuntu-latest + timeout-minutes: 60 + permissions: + contents: read + packages: write + attestations: write + id-token: write + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3 + + - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 + + - uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # v5 + id: meta + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha + + - uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7.0.0 + id: push + with: + context: . 
+ platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - uses: actions/attest-build-provenance@96b4a1ef7235a096b17240c259729fdd70c83d45 # v2 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..912c8eb02 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,58 @@ +name: Deploy Documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + - "mkdocs.yml" + # (H) Rebuilds periodically so the GitHub repo widget (version, stars, forks) + # stays current; MkDocs Material fetches these stats at build time. + schedule: + - cron: "0 */6 * * *" + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 + with: + enable-cache: true + cache-dependency-glob: "uv.lock" + + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install dependencies + run: uv sync --group docs + + - name: Build site + run: uv run mkdocs build --strict + + - uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0 + with: + path: site + + deploy: + needs: build + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - id: deployment + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4 diff --git a/.github/workflows/label-sync.yml b/.github/workflows/label-sync.yml index ec787447e..40cc0e2c0 
100644 --- a/.github/workflows/label-sync.yml +++ b/.github/workflows/label-sync.yml @@ -9,9 +9,10 @@ on: - ".github/workflows/label-sync.yml" workflow_dispatch: schedule: - # Run weekly on Mondays at 00:00 UTC to ensure labels stay in sync - cron: "0 0 * * 1" +permissions: read-all + jobs: sync-labels: name: Sync Repository Labels @@ -22,10 +23,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Sync labels - uses: micnncim/action-label-syncer@v1 + uses: micnncim/action-label-syncer@3abd5ab72fda571e69fffd97bd4e0033dd5f495c # v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/osv-scanner.yml b/.github/workflows/osv-scanner.yml new file mode 100644 index 000000000..5ac2a0a24 --- /dev/null +++ b/.github/workflows/osv-scanner.yml @@ -0,0 +1,50 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# A sample workflow which sets up periodic OSV-Scanner scanning for vulnerabilities, +# in addition to a PR check which fails if new vulnerabilities are introduced. 
+# +# For more examples and options, including how to ignore specific vulnerabilities, +# see https://google.github.io/osv-scanner/github-action/ + +name: OSV-Scanner + +on: + pull_request: + branches: [ "main" ] + merge_group: + branches: [ "main" ] + schedule: + - cron: '29 2 * * 4' + push: + branches: [ "main" ] + +permissions: read-all + +jobs: + scan-scheduled: + if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }} + uses: google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@c5996e0193a3df57d695c1b8a1dec2a4c62e8730 # v2.3.3 + permissions: + actions: read + security-events: write + contents: read + with: + scan-args: |- + -r + --skip-git + ./ + scan-pr: + if: ${{ github.event_name == 'pull_request' || github.event_name == 'merge_group' }} + uses: google/osv-scanner-action/.github/workflows/osv-scanner-reusable-pr.yml@c5996e0193a3df57d695c1b8a1dec2a4c62e8730 # v2.3.3 + permissions: + actions: read + security-events: write + contents: read + with: + scan-args: |- + -r + --skip-git + ./ diff --git a/.github/workflows/poor-quality-management.yml b/.github/workflows/poor-quality-management.yml index df73ada89..657a86dae 100644 --- a/.github/workflows/poor-quality-management.yml +++ b/.github/workflows/poor-quality-management.yml @@ -4,9 +4,11 @@ on: pull_request_target: types: [labeled] schedule: - - cron: "0 9 * * *" # Daily at 9 AM UTC + - cron: "0 9 * * *" workflow_dispatch: +permissions: read-all + jobs: notify-poor-quality: name: Notify Poor Quality PR @@ -19,7 +21,7 @@ jobs: steps: - name: Add warning comment - uses: actions/github-script@v7 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | const message = `⚠️ **This PR has been marked as poor-quality.** @@ -73,7 +75,7 @@ jobs: steps: - name: Close PRs with poor-quality label older than 7 days - uses: actions/github-script@v7 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | const 
LABEL_NAME = 'poor-quality'; diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 000000000..1201a3a14 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,38 @@ +name: Publish to PyPI + +on: + release: + types: [published] + +permissions: read-all + +jobs: + publish: + name: Publish to PyPI + runs-on: ubuntu-latest + timeout-minutes: 10 + environment: pypi + permissions: + id-token: write + contents: read + + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 + with: + enable-cache: true + cache-dependency-glob: "uv.lock" + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Build package + run: uv build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1 diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 000000000..08b117574 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,78 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '32 23 * * 2' + push: + branches: [ "main" ] + +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + # `publish_results: true` only works when run from the default branch. conditional can be removed if disabled. + if: github.event.repository.default_branch == github.ref_name || github.event_name == 'pull_request' + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # (Optional) Uncomment file_mode if you have a .gitattributes with files marked export-ignore + # file_mode: git + + # Upload the results as artifacts (optional). 
Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard (optional). + # Commenting out will disable upload of results to your repo's Code Scanning dashboard + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@38697555549f1db7851b81482ff19f1fa5c4fedc # v3 + with: + sarif_file: results.sarif diff --git a/.github/workflows/sonarcloud.yml b/.github/workflows/sonarcloud.yml new file mode 100644 index 000000000..123b16f0a --- /dev/null +++ b/.github/workflows/sonarcloud.yml @@ -0,0 +1,45 @@ +name: SonarCloud + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + sonarcloud: + name: SonarCloud Analysis + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + + - name: Install uv + uses: astral-sh/setup-uv@38f3f104447c67c051c4a08e39b64a148898af3a # v4 + with: + enable-cache: true + cache-dependency-glob: "uv.lock" + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install dependencies + run: uv sync --extra treesitter-full --extra test --extra semantic --group dev + + - name: Run tests with coverage + run: uv run pytest -n auto -m "not integration" --tb=short --cov=codebase_rag --cov-report=xml + + - name: SonarCloud Scan + uses: SonarSource/sonarqube-scan-action@fd88b7d7ccbaefd23d8f36f73b59db7a3d246602 # v6 + env: + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git 
a/.github/workflows/split-score.yml b/.github/workflows/split-score.yml new file mode 100644 index 000000000..7c65ac2e2 --- /dev/null +++ b/.github/workflows/split-score.yml @@ -0,0 +1,22 @@ +name: PR Split Score + +on: + pull_request: + branches: [main] + +permissions: + contents: read + pull-requests: write + +jobs: + score: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + + - name: pr-split score + uses: vitali87/pr-split@v1.0.0 + with: + max-loc: "400" diff --git a/.github/workflows/version-bump.yml b/.github/workflows/version-bump.yml index 0940adcad..596a01ccd 100644 --- a/.github/workflows/version-bump.yml +++ b/.github/workflows/version-bump.yml @@ -16,6 +16,8 @@ on: - minor - major +permissions: read-all + jobs: bump-version: name: Auto Version Bump @@ -26,7 +28,7 @@ jobs: contents: write steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 2 token: ${{ secrets.GITHUB_TOKEN }} @@ -90,12 +92,17 @@ jobs: run: | sed -i 's/^version = ".*"/version = "${{ steps.bump_version.outputs.new }}"/' pyproject.toml + - name: Update server.json + if: steps.check_manual.outputs.skip == 'false' + run: | + sed -i 's/"version": "[^"]*"/"version": "${{ steps.bump_version.outputs.new }}"/g' server.json + - name: Commit version bump if: steps.check_manual.outputs.skip == 'false' run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - git add pyproject.toml + git add pyproject.toml server.json git commit -m "chore: bump version to ${{ steps.bump_version.outputs.new }}" git push diff --git a/.gitignore b/.gitignore index 4b6211856..c44ce990d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,19 @@ PROJECT.md .DS_Store .pypi_cache.json .omc +site/ + +# Eval harness scratch workspace (regenerated each run); result files are committed +evals/results/l3_workspace/ +# Rust 
oracle build artifacts (the source + Cargo.lock are committed) +evals/oracles/rs_oracle/target/ +# TypeScript oracle deps (the source + package-lock.json are committed) +evals/oracles/ts_oracle/node_modules/ +# Java oracle compiled classes (the source is committed) +evals/oracles/java_oracle/*.class +# Lua oracle deps (the source + package-lock.json are committed) +evals/oracles/lua_oracle/node_modules/ +# PHP oracle deps (the source + package-lock.json are committed) +evals/oracles/php_oracle/node_modules/ +.cgr-hash-cache.json +.cgr-dir-mtimes.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92a09727a..12a7db5f0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,23 +5,24 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + args: [--unsafe] - id: check-toml - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.2 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - exclude: ^codec/schema_pb2\.(py|pyi)$ + exclude: ^(codec/schema_pb2\.(py|pyi)$|benchmarks/|optimize/) - id: ruff-format - exclude: ^codec/schema_pb2\.(py|pyi)$ + exclude: ^(codec/schema_pb2\.(py|pyi)$|benchmarks/|optimize/) - repo: local hooks: - id: ty name: ty check - entry: uv run ty check --exclude codebase_rag/tests/ + entry: uv run ty check --exclude codebase_rag/tests/ --exclude benchmarks/ --exclude optimize/ --exclude codec/ --exclude grammars/ --exclude query_modules/ language: system types: [python] - exclude: ^codec/.*_pb2\.py$ + exclude: ^(codec/.*_pb2\.py$|benchmarks/|optimize/|grammars/|query_modules/) pass_filenames: false - repo: local hooks: @@ -30,7 +31,7 @@ repos: entry: uv run python scripts/check_no_docs.py language: system types: [python] - exclude: ^codec/schema_pb2\.py$ + exclude: ^(codec/schema_pb2\.py$|benchmarks/|optimize/) - repo: local hooks: - id: generate-readme @@ -45,7 +46,7 @@ repos: - id: bandit args: ["-c", "pyproject.toml", "--severity-level", "high"] additional_dependencies: 
["bandit[toml]"] - exclude: ^(codebase_rag/tests/|scripts/) + exclude: ^(codebase_rag/tests/|scripts/|benchmarks/|optimize/) - repo: https://github.com/compilerla/conventional-pre-commit rev: v4.2.0 hooks: diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..9b47f9561 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible 
for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +eheva87@gmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. 
No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cfc7c6d05..7d955c589 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Thank you for your interest in contributing to Code Graph RAG! We welcome contri ## Getting Started -1. 
**Browse Issues**: Check out our [GitHub Issues](https://github.com/vitali87/code-graph-rag/issues) to find tasks that need work +1. **Browse Issues**: Check out our [issue tracker](https://codeberg.org/vitali87/code-graph-rag/issues) to find tasks that need work - Look for issues labeled `good first issue` for beginner-friendly tasks - Issues labeled `help wanted` are open for community contributions 2. **Pick an Issue**: Choose an issue that interests you and matches your skill level diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..e965de91d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,51 @@ +FROM ghcr.io/astral-sh/uv:0.10@sha256:72ab0aeb448090480ccabb99fb5f52b0dc3c71923bffb5e2e26517a1c27b7fec AS uv + +FROM python:3.14-slim@sha256:fb83750094b46fd6b8adaa80f66e2302ecbe45d513f6cece637a841e1025b4ca AS builder + +COPY --from=uv /uv /uvx /bin/ + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + cmake build-essential libssl-dev zlib1g-dev libzstd-dev && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY pyproject.toml uv.lock ./ +RUN uv sync --frozen --no-dev --extra treesitter-full --no-install-project --no-binary-package pymgclient + +COPY . . 
+RUN uv sync --frozen --no-dev --extra treesitter-full --no-binary-package pymgclient + +FROM python:3.14-slim@sha256:fb83750094b46fd6b8adaa80f66e2302ecbe45d513f6cece637a841e1025b4ca + +RUN apt-get update && \ + apt-get install -y --no-install-recommends ripgrep libssl3 zlib1g libzstd1 && \ + rm -rf /var/lib/apt/lists/* + +RUN useradd --create-home appuser +USER appuser +WORKDIR /app + +COPY --from=builder --chown=appuser:appuser /app/.venv /app/.venv +COPY --from=builder --chown=appuser:appuser /app/codebase_rag /app/codebase_rag +COPY --from=builder --chown=appuser:appuser /app/codec /app/codec +COPY --from=builder --chown=appuser:appuser /app/cgr /app/cgr +COPY --from=builder --chown=appuser:appuser /app/pyproject.toml /app/pyproject.toml + +ENV PATH="/app/.venv/bin:$PATH" + +COPY --chmod=755 <<'EOF' /app/entrypoint.sh +#!/bin/sh +ARCH=$(uname -m) +case "$ARCH" in + x86_64) LIBDIR="/lib/x86_64-linux-gnu" ;; + aarch64) LIBDIR="/lib/aarch64-linux-gnu" ;; + *) LIBDIR="/lib" ;; +esac +export LD_PRELOAD="$LIBDIR/libz.so.1:$LIBDIR/libzstd.so.1" +exec code-graph-rag "$@" +EOF + +ENTRYPOINT ["/app/entrypoint.sh"] +CMD ["mcp-server"] diff --git a/LICENSE b/LICENSE index fd189113e..4765780e7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) [2025] [Vitali Avagyan] +Copyright (c) 2025 Vitali Avagyan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index 10c757dac..d8fa492d8 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help all install dev test test-parallel test-integration test-all test-parallel-all clean python build-grammars watch readme lint format typecheck check pre-commit +.PHONY: help all install dev test test-parallel test-integration test-all test-parallel-all clean python build-grammars watch readme lint format typecheck check pre-commit release PYTHON := uv run @@ -77,6 +77,9 @@ 
typecheck: ## Run type checking with ty check: lint typecheck test ## Run all checks: lint, typecheck, test +release: ## Build, verify, and publish the current pyproject version to PyPI, then tag and create a GitHub Release + ./scripts/release.sh + pre-commit: ## Run all pre-commit checks locally (comprehensive test before commit) @echo "Running pre-commit checks..." @echo "1. Formatting code..." diff --git a/PYPI_README.md b/PYPI_README.md new file mode 100644 index 000000000..a1dd20c0b --- /dev/null +++ b/PYPI_README.md @@ -0,0 +1,160 @@ +# Code-Graph-RAG + +A graph-based RAG system that parses multi-language codebases with Tree-sitter, builds knowledge graphs in Memgraph, and enables natural language querying, editing, and optimization. + +## Install + +```bash +pip install code-graph-rag +``` + +With all Tree-sitter grammars (Python, JS, TS, Rust, Go, Java, Scala, C++, Lua): + +```bash +pip install 'code-graph-rag[treesitter-full]' +``` + +With semantic code search (UniXcoder embeddings): + +```bash +pip install 'code-graph-rag[semantic]' +``` + +### Prerequisites + +- Python 3.12+ +- Docker (for Memgraph) +- `cmake` (for building pymgclient) +- `ripgrep` (`rg`) (for shell command text searching) + +## CLI Quick Start + +The package installs a `cgr` command. + +**Start Memgraph, parse a repo, and query it:** + +```bash +docker compose up -d # start Memgraph +cgr start --repo-path ./my-project \ + --update-graph --clean # parse & launch interactive chat +``` + +**Index to protobuf for offline use:** + +```bash +cgr index -o ./index-output --repo-path ./my-project +``` + +**Export knowledge graph to JSON:** + +```bash +cgr export -o graph.json +``` + +**AI-guided optimization:** + +```bash +cgr optimize python --repo-path ./my-project +``` + +**Run as an MCP server (for Claude Code):** + +```bash +cgr mcp-server +``` + +**Check your setup:** + +```bash +cgr doctor +``` + +## Python SDK + +The `cgr` package provides short imports for programmatic use. 
+ +### Load and query an exported graph + +```python +from cgr import load_graph + +graph = load_graph("graph.json") +print(graph.summary()) + +functions = graph.find_nodes_by_label("Function") +for fn in functions[:5]: + rels = graph.get_relationships_for_node(fn.node_id) + print(f"{fn.properties['name']}: {len(rels)} relationships") +``` + +### Query Memgraph with Cypher + +```python +from cgr import MemgraphIngestor + +with MemgraphIngestor(host="localhost", port=7687) as db: + rows = db.fetch_all("MATCH (f:Function) RETURN f.name LIMIT 10") + for row in rows: + print(row) +``` + +### Generate Cypher from natural language + +```python +import asyncio +from cgr import CypherGenerator + +async def main(): + gen = CypherGenerator() + cypher = await gen.generate("Find all classes that inherit from BaseModel") + print(cypher) + +asyncio.run(main()) +``` + +### Semantic code search + +Requires the `semantic` extra. + +```python +from cgr import embed_code + +embedding = embed_code("def authenticate(user, password): ...") +print(f"Embedding dimension: {len(embedding)}") +``` + +### Configuration + +```python +from cgr import settings + +settings.set_orchestrator("openai", "gpt-4o", api_key="sk-...") +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` + +## Environment Variables + +Configure via `.env` or environment variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `MEMGRAPH_HOST` | `localhost` | Memgraph hostname | +| `MEMGRAPH_PORT` | `7687` | Memgraph port | +| `ORCHESTRATOR_PROVIDER` | | Provider: `google`, `openai`, `ollama` | +| `ORCHESTRATOR_MODEL` | | Model ID (e.g. `gpt-4o`, `gemini-2.5-pro`) | +| `ORCHESTRATOR_API_KEY` | | API key for the provider (not needed for `ollama`) | +| `CYPHER_PROVIDER` | | Provider for Cypher generation | +| `CYPHER_MODEL` | | Model ID for Cypher generation (e.g. 
`codellama`, `gpt-4o-mini`) | +| `CYPHER_API_KEY` | | API key for Cypher provider (not needed for `ollama`) | +| `TARGET_REPO_PATH` | `.` | Default repository path | + +## Documentation + +Full documentation, architecture details, and contribution guide: +[docs.code-graph-rag.com](https://docs.code-graph-rag.com) + +## License + +MIT + + diff --git a/README.md b/README.md index 5ef87d4e0..46d6e6526 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,54 @@
+ + Code-Graph-RAG Logo

+ + + Enterprise Support + + PyPI Downloads + + +

@@ -35,8 +63,9 @@ An accurate Retrieval-Augmented Generation (RAG) system that analyzes multi-lang ## Latest News 🔥 -- **[NEW]** **MCP Server Integration**: Code-Graph-RAG now works as an MCP server with Claude Code! Query and edit your codebase using natural language directly from Claude Code. [Setup Guide](docs/claude-code-setup.md) -- [2025/10/21] **Semantic Code Search**: Added intent-based code search using UniXcoder embeddings. Find functions by describing what they do (e.g., "error handling functions", "authentication code") rather than by exact names. +- **PHP Language Support**: Full PHP language support added — classes, interfaces, traits, enums, namespaces, PHP 8 attributes, and call graph analysis. Contributed by [@rs-ipps](https://github.com/rs-ipps). +- **C Language Support**: Full C language support added — functions, structs, unions, enums, preprocessor includes, and call graph analysis. Contributed by [@dj0nes](https://github.com/dj0nes). +- **Visualise any GitHub repo instantly!** Just change `github.com` to `gitcgr.com` in any repo URL — that's it, only 3 letters! Get an interactive graph of the entire codebase structure. 
Try it now: [gitcgr.com](https://gitcgr.com) ## 🚀 Features @@ -45,16 +74,16 @@ An accurate Retrieval-Augmented Generation (RAG) system that analyzes multi-lang | Language | Status | Extensions | Functions | Classes/Structs | Modules | Package Detection | Additional Features | |--------|------|----------|---------|---------------|-------|-----------------|-------------------| +| C | Fully Supported | .c | ✓ | ✓ | ✓ | ✓ | Functions, structs, unions, enums, preprocessor includes | | C++ | Fully Supported | .cpp, .h, .hpp, .cc, .cxx, .hxx, .hh, .ixx, .cppm, .ccm | ✓ | ✓ | ✓ | ✓ | Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces | | Java | Fully Supported | .java | ✓ | ✓ | ✓ | - | Generics, annotations, modern features (records/sealed classes), concurrency, reflection | | JavaScript | Fully Supported | .js, .jsx | ✓ | ✓ | ✓ | - | ES6 modules, CommonJS, prototype methods, object methods, arrow functions | | Lua | Fully Supported | .lua | ✓ | - | ✓ | - | Local/global functions, metatables, closures, coroutines | +| PHP | Fully Supported | .php | ✓ | ✓ | ✓ | - | Classes, interfaces, traits, enums, namespaces, PHP 8 attributes | | Python | Fully Supported | .py | ✓ | ✓ | ✓ | ✓ | Type inference, decorators, nested functions | | Rust | Fully Supported | .rs | ✓ | ✓ | ✓ | ✓ | impl blocks, associated functions | | TypeScript | Fully Supported | .ts, .tsx | ✓ | ✓ | ✓ | - | Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules | -| C# | In Development | .cs | ✓ | ✓ | ✓ | - | Classes, interfaces, generics (planned) | | Go | In Development | .go | ✓ | ✓ | ✓ | - | Methods, type declarations | -| PHP | In Development | .php | ✓ | ✓ | ✓ | - | Classes, functions, namespaces | | Scala | In Development | .scala, .sc | ✓ | ✓ | ✓ | - | Case classes, objects | - **🌳 Tree-sitter Parsing**: Uses Tree-sitter for robust, language-agnostic AST parsing @@ -111,9 +140,54 @@ sudo dnf install ripgrep ## 🛠️ Installation +### System-wide 
install (recommended for end users) + +`cgr` is published to PyPI and can be installed system-wide so it works from any +target repo without activating a project virtualenv. Install with the +`treesitter-full` (all languages) and `semantic` (vector search) extras: + +```bash +# with uv (recommended) +uv tool install "code-graph-rag[treesitter-full,semantic]" + +# or with pipx +pipx install "code-graph-rag[treesitter-full,semantic]" +``` + +For a Python-only install, omit the extras. For local development from a clone, +use `uv tool install --editable "/path/to/code-graph-rag[treesitter-full,semantic]"`. + +After install, `cgr` is on PATH. From any repository, run: + ```bash -git clone https://github.com/vitali87/code-graph-rag.git +cd ~/path/to/some-target-repo +cgr daemon up # one-time: start the shared memgraph + qdrant stack +cgr start # auto-sync the current repo and drop into the agent +``` + +`cgr start` defaults `--repo-path` to the current directory and auto-syncs the +graph incrementally on entry. Pass `--no-sync` to skip the sync, or +`--no-start-stack` if memgraph/qdrant already run elsewhere. + +Useful subcommands: + +| Command | Purpose | +|---|---| +| `cgr daemon up/down/status/restart/logs` | Manage the shared docker stack | +| `cgr stop` | Alias for `cgr daemon down` | +| `cgr status` | Show stack state + per-project last-sync timestamp | +| `cgr workspace create/list/show/delete` | Manage named bundles of repos | +| `cgr workspace add-repo / remove-repo` | Edit a workspace's repo set | +| `cgr start --workspace mono` | Open the agent over every project in the workspace | +| `cgr start --projects a,b,c` | Scope agent queries to the listed projects | +Indexed data persists across `cgr daemon down` thanks to named memgraph + qdrant +volumes (`memgraph_data`, `memgraph_log`, `qdrant_storage`). 
+ +### Local development install + +```bash +git clone https://codeberg.org/vitali87/code-graph-rag.git cd code-graph-rag ``` @@ -218,9 +292,20 @@ ollama pull llama3.2 4. **Start Memgraph database**: ```bash -docker-compose up -d +docker compose up -d +``` + +5. **Verify installation**: +```bash +# If installed from PyPI: +cgr --help + +# If running from source: +uv run cgr --help ``` +> **Note**: When running from source (cloned repo), prefix all `cgr` commands below with `uv run`, e.g., `uv run cgr start ...` + ## 🛠️ Makefile Commands Use the Makefile for common development tasks: @@ -246,6 +331,7 @@ Use the Makefile for common development tasks: | `make format` | Run ruff format | | `make typecheck` | Run type checking with ty | | `make check` | Run all checks: lint, typecheck, test | +| `make release` | Build, verify, and publish the current pyproject version to PyPI, then tag and create a GitHub Release | | `make pre-commit` | Run all pre-commit checks locally (comprehensive test before commit) | @@ -284,12 +370,23 @@ The system automatically detects and processes files for all supported languages ### Step 2: Query the Codebase +**Interactive mode:** + Start the interactive RAG CLI: ```bash cgr start --repo-path /path/to/your/repo ``` +**Non-interactive mode (single query):** + +Run a single query and exit, with output sent to stdout (useful for scripting): + +```bash +python -m codebase_rag.main start --repo-path /path/to/your/repo \ + --ask-agent "What functions call UserService.create_user?" +``` + ### Step 2.5: Real-Time Graph Updates (Optional) For active development, you can keep your knowledge graph automatically synchronized with code changes using the realtime updater. This is particularly useful when you're actively modifying code and want the AI assistant to always work with the latest codebase structure. 
@@ -454,7 +551,7 @@ cgr optimize javascript --repo-path /path/to/frontend \ ``` **Supported Languages for Optimization:** -All supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `cpp` +All supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `c`, `cpp` **How It Works:** 1. **Analysis Phase**: The agent analyzes your codebase structure using the knowledge graph @@ -532,13 +629,16 @@ claude mcp add --transport stdio code-graph-rag \ | `list_projects` | List all indexed projects in the knowledge graph database. Returns a list of project names that have been indexed. | | `delete_project` | Delete a specific project from the knowledge graph database. This removes all nodes associated with the project while preserving other projects. Use list_projects first to see available projects. | | `wipe_database` | WARNING: Completely wipe the entire database, removing ALL indexed projects. This cannot be undone. Use delete_project for removing individual projects. | -| `index_repository` | Parse and ingest the repository into the Memgraph knowledge graph. This builds a comprehensive graph of functions, classes, dependencies, and relationships. Note: This preserves other projects - only the current project is re-indexed. | -| `query_code_graph` | Query the codebase knowledge graph using natural language. Ask questions like 'What functions call UserService.create_user?' or 'Show me all classes that implement the Repository interface'. | +| `index_repository` | WARNING: Clears all data for the current project including its embeddings. Parse and ingest the repository into the Memgraph knowledge graph. Use update_repository for incremental updates. Only use when explicitly requested. | +| `update_repository` | Update the repository in the Memgraph knowledge graph without clearing existing data. Use this for incremental updates. 
| +| `query_code_graph` | Query the codebase knowledge graph using natural language. Use semantic_search unless you know the exact names of classes/functions you are searching for. Ask questions like 'What functions call UserService.create_user?' or 'Show me all classes that implement the Repository interface'. | | `get_code_snippet` | Retrieve source code for a function, class, or method by its qualified name. Returns the source code, file path, line numbers, and docstring. | | `surgical_replace_code` | Surgically replace an exact code block in a file using diff-match-patch. Only modifies the exact target block, leaving the rest unchanged. | | `read_file` | Read the contents of a file from the project. Supports pagination for large files. | | `write_file` | Write content to a file, creating it if it doesn't exist. | | `list_directory` | List contents of a directory in the project. | +| `semantic_search` | Performs a semantic search for functions based on a natural language query describing their purpose, returning a list of potential matches with similarity scores. Requires the 'semantic' extra to be installed. | +| `ask_agent` | Ask the Code Graph RAG agent a question about the codebase. Uses the full RAG pipeline to analyze the code graph and provide a detailed answer. Use this for general questions about architecture, functionality, and code relationships. 
| ### Example Usage @@ -561,35 +661,35 @@ The knowledge graph uses the following node types and relationships: | Label | Properties | |-----|----------| | Project | `{name: string}` | -| Package | `{qualified_name: string, name: string, path: string}` | -| Folder | `{path: string, name: string}` | -| File | `{path: string, name: string, extension: string}` | -| Module | `{qualified_name: string, name: string, path: string}` | -| Class | `{qualified_name: string, name: string, decorators: list[string]}` | -| Function | `{qualified_name: string, name: string, decorators: list[string]}` | -| Method | `{qualified_name: string, name: string, decorators: list[string]}` | -| Interface | `{qualified_name: string, name: string}` | -| Enum | `{qualified_name: string, name: string}` | +| Package | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| Folder | `{path: string, name: string, absolute_path: string}` | +| File | `{path: string, name: string, extension: string, absolute_path: string}` | +| Module | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| Class | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` | +| Function | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` | +| Method | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` | +| Interface | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| Enum | `{qualified_name: string, name: string, path: string, absolute_path: string}` | | Type | `{qualified_name: string, name: string}` | | Union | `{qualified_name: string, name: string}` | -| ModuleInterface | `{qualified_name: string, name: string, path: string}` | -| ModuleImplementation | `{qualified_name: string, name: string, path: string, implements_module: string}` | +| ModuleInterface | `{qualified_name: 
string, name: string, path: string, absolute_path: string}` | +| ModuleImplementation | `{qualified_name: string, name: string, path: string, absolute_path: string, implements_module: string}` | | ExternalPackage | `{name: string, version_spec: string}` | ### Language-Specific Mappings +- **C**: `enum_specifier`, `function_definition`, `struct_specifier`, `union_specifier` - **C++**: `class_specifier`, `declaration`, `enum_specifier`, `field_declaration`, `function_definition`, `lambda_expression`, `struct_specifier`, `template_declaration`, `union_specifier` - **Java**: `annotation_type_declaration`, `class_declaration`, `constructor_declaration`, `enum_declaration`, `interface_declaration`, `method_declaration`, `record_declaration` - **JavaScript**: `arrow_function`, `class`, `class_declaration`, `function_declaration`, `function_expression`, `generator_function_declaration`, `method_definition` - **Lua**: `function_declaration`, `function_definition` +- **PHP**: `anonymous_function`, `arrow_function`, `class_declaration`, `enum_declaration`, `function_definition`, `interface_declaration`, `method_declaration`, `trait_declaration` - **Python**: `class_definition`, `function_definition` - **Rust**: `closure_expression`, `enum_item`, `function_item`, `function_signature_item`, `impl_item`, `struct_item`, `trait_item`, `type_item`, `union_item` - **TypeScript**: `abstract_class_declaration`, `arrow_function`, `class`, `class_declaration`, `enum_declaration`, `function_declaration`, `function_expression`, `function_signature`, `generator_function_declaration`, `interface_declaration`, `internal_module`, `method_definition`, `type_alias_declaration` -- **C#**: `anonymous_method_expression`, `class_declaration`, `constructor_declaration`, `destructor_declaration`, `enum_declaration`, `function_pointer_type`, `interface_declaration`, `lambda_expression`, `local_function_statement`, `method_declaration`, `struct_declaration` -- **Go**: `function_declaration`, 
`method_declaration`, `type_declaration` -- **PHP**: `anonymous_function`, `arrow_function`, `class_declaration`, `enum_declaration`, `function_definition`, `function_static_declaration`, `interface_declaration`, `trait_declaration` +- **Go**: `function_declaration`, `method_declaration`, `type_alias`, `type_spec` - **Scala**: `class_definition`, `function_declaration`, `function_definition`, `object_definition`, `trait_definition` @@ -614,6 +714,7 @@ The knowledge graph uses the following node types and relationships: | ModuleImplementation | IMPLEMENTS | ModuleInterface | | Project | DEPENDS_ON_EXTERNAL | ExternalPackage | | Function, Method | CALLS | Function, Method | +| Module, Function, Method | INSTANTIATES | Class | ## 🔧 Configuration @@ -679,6 +780,7 @@ my_build_output - **pydantic-settings**: Settings management using Pydantic - **pymgclient**: Memgraph database adapter for Python language - **python-dotenv**: Read key-value pairs from a .env file and set them as environment variables +- **tiktoken**: tiktoken is a fast BPE tokeniser for use with OpenAI's models - **toml**: Python Library for Tom's Obvious, Minimal Language - **tree-sitter-python**: Python grammar for tree-sitter - **tree-sitter**: Python bindings to the Tree-sitter parsing library @@ -691,6 +793,7 @@ my_build_output - **protobuf** - **defusedxml**: XML bomb protection for Python stdlib modules - **huggingface-hub**: Client library to download and publish models, datasets and other repos on the huggingface.co hub +- **griffe**: Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API. ## 🤖 Agentic Workflow & Tools @@ -705,11 +808,10 @@ The agent has access to a suite of tools to understand and interact with the cod | Tool | Description | |----|-----------| | `query_graph` | Query the codebase knowledge graph using natural language questions. 
Ask in plain English about classes, functions, methods, dependencies, or code structure. Examples: 'Find all functions that call each other', 'What classes are in the user module', 'Show me functions with the longest call chains'. | -| `read_file` | Reads the content of text-based files. For documents like PDFs or images, use the 'analyze_document' tool instead. | +| `read_file` | Reads the content of text-based files. Images and PDFs the user references are attached inline; read them directly. | | `create_file` | Creates a new file with content. IMPORTANT: Check file existence first! Overwrites completely WITHOUT showing diff. Use only for new files, not existing file modifications. | | `replace_code` | Surgically replaces specific code blocks in files. Requires exact target code and replacement. Only modifies the specified block, leaving rest of file unchanged. True surgical patching. | | `list_directory` | Lists the contents of a directory to explore the codebase. | -| `analyze_document` | Analyzes documents (PDFs, images) to answer questions about their content. | | `execute_shell` | Executes shell commands from allowlist. Read-only commands run without approval; write operations require user confirmation. | | `semantic_search` | Performs a semantic search for functions based on a natural language query describing their purpose, returning a list of potential matches with similarity scores. | | `get_function_source` | Retrieves the source code for a specific function or method using its internal node ID, typically obtained from a semantic search result. | @@ -735,7 +837,7 @@ Code-Graph-RAG makes it easy to add support for any language that has a Tree-sit > **⚠️ Recommendation**: While you can add languages yourself, we recommend waiting for official full support to ensure optimal parsing quality, comprehensive feature coverage, and robust integration. The languages marked as "In Development" above will receive dedicated optimization and testing. 
-> **💡 Request Support**: If you want a specific language to be officially supported, please [submit an issue](https://github.com/vitali87/code-graph-rag/issues) with your language request. +> **💡 Request Support**: If you want a specific language to be officially supported, please [submit an issue](https://codeberg.org/vitali87/code-graph-rag/issues) with your language request. #### Quick Start: Add a Language @@ -887,3 +989,7 @@ We also offer custom development, integration consulting, technical support cont ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=vitali87/code-graph-rag&type=Date)](https://www.star-history.com/#vitali87/code-graph-rag&Date) + +## Fork History + +[![Fork History Chart](https://fork-history.site/svg?repos=vitali87/code-graph-rag)](https://fork-history.site/#vitali87/code-graph-rag) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..66299ab1b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,46 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| ------- | ------------------ | +| 0.0.x | :white_check_mark: | + +As the project is in early development (pre 1.0), only the latest release receives security updates. Please ensure you are running the most recent version before reporting a vulnerability. + +## Reporting a Vulnerability + +**Please do not report security vulnerabilities through public issues, pull requests, or any other public channels.** + +Instead, please open a [confidential issue](https://codeberg.org/vitali87/code-graph-rag/issues/new) on Codeberg (tick "this issue is confidential" before submitting). This ensures the details remain confidential until a fix is available. 
+ +When reporting, please include: + +- A description of the vulnerability and its potential impact +- Steps to reproduce or a proof of concept +- The version(s) affected +- Any suggested fix, if available + +## What to Expect + +- **Acknowledgement** within 72 hours of your report +- **Status update** within 7 days with an initial assessment +- **Resolution target** of 30 days for confirmed vulnerabilities, though critical issues will be prioritized for faster turnaround + +If the vulnerability is accepted, we will work on a fix, coordinate disclosure with you, and credit you in the release notes (unless you prefer to remain anonymous). + +If the vulnerability is declined, we will provide a clear explanation of why. + +## Scope + +This policy applies to the `code-graph-rag` Python package and its official repository. Third-party dependencies are outside the direct scope of this policy, though we use Dependabot to monitor and update them. + +## Security Measures in This Project + +- **Dependency scanning**: Dependabot is enabled for automated dependency updates +- **Secret scanning**: GitHub secret scanning is active on this repository +- **Branch protection**: The `main` branch requires pull request reviews before merging + +## Preferred Languages + +We accept security reports in English. 
diff --git a/benchmarks/bench_ast_cache.py b/benchmarks/bench_ast_cache.py new file mode 100644 index 000000000..b1e3e65d9 --- /dev/null +++ b/benchmarks/bench_ast_cache.py @@ -0,0 +1,134 @@ +import statistics +import sys +import time +from collections import OrderedDict +from pathlib import Path + +WARMUP_RUNS = 3 +BENCH_RUNS = 50 + + +class MockNode: + __slots__ = ("data",) + + def __init__(self, size: int) -> None: + self.data = b"\x00" * size + + +def bench_ordered_dict_insert(count: int, item_size: int) -> float: + start = time.perf_counter() + cache: OrderedDict[Path, tuple[MockNode, str]] = OrderedDict() + for i in range(count): + key = Path(f"/fake/path/module_{i}.py") + cache[key] = (MockNode(item_size), "python") + return time.perf_counter() - start + + +def bench_ordered_dict_lookup(cache: OrderedDict, keys: list[Path]) -> float: + start = time.perf_counter() + for key in keys: + _ = key in cache + return time.perf_counter() - start + + +def bench_ordered_dict_access_lru(cache: OrderedDict, keys: list[Path]) -> float: + start = time.perf_counter() + for key in keys: + if key in cache: + cache.move_to_end(key) + _ = cache[key] + return time.perf_counter() - start + + +def bench_ordered_dict_eviction(count: int, max_size: int, item_size: int) -> float: + start = time.perf_counter() + cache: OrderedDict[Path, tuple[MockNode, str]] = OrderedDict() + for i in range(count): + key = Path(f"/fake/path/module_{i}.py") + cache[key] = (MockNode(item_size), "python") + while len(cache) > max_size: + cache.popitem(last=False) + return time.perf_counter() - start + + +def bench_getsizeof_overhead(cache: OrderedDict) -> float: + start = time.perf_counter() + _ = sum(sys.getsizeof(v) for v in cache.values()) + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": 
statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<45} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 115) + for r in results: + print( + f"{r['name']:<45} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (500, 1024), + (2000, 4096), + (5000, 8192), + ] + + for count, item_size in configs: + print(f"\n{'='*115}") + print(f"BoundedASTCache Benchmark (entries={count}, item_size={item_size}B)") + print(f"{'='*115}") + + results = [] + + r = run_benchmark(f"insert ({count})", bench_ordered_dict_insert, count, item_size) + results.append(r) + + cache: OrderedDict[Path, tuple[MockNode, str]] = OrderedDict() + keys: list[Path] = [] + for i in range(count): + key = Path(f"/fake/path/module_{i}.py") + keys.append(key) + cache[key] = (MockNode(item_size), "python") + + r = run_benchmark(f"lookup ({count})", bench_ordered_dict_lookup, cache, keys) + results.append(r) + + r = run_benchmark(f"access+LRU ({count})", bench_ordered_dict_access_lru, cache, keys) + results.append(r) + + max_size = count // 2 + r = run_benchmark( + f"insert+evict (max={max_size})", + bench_ordered_dict_eviction, count, max_size, item_size, + ) + results.append(r) + + r = run_benchmark(f"getsizeof scan ({count})", bench_getsizeof_overhead, cache) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_dropin_replacements.py b/benchmarks/bench_dropin_replacements.py new file mode 100644 index 000000000..ee4eb0b0a --- /dev/null +++ 
b/benchmarks/bench_dropin_replacements.py @@ -0,0 +1,267 @@ +import hashlib +import json +import os +import statistics +import tempfile +import time +from pathlib import Path + +try: + import blake3 + import orjson +except ImportError as e: + print(f"SKIP bench_dropin_replacements: {e}") + print("Install with: uv pip install blake3 orjson") + raise SystemExit(0) + +WARMUP_RUNS = 3 +BENCH_RUNS = 30 + + +def generate_graph_data(num_nodes: int, num_rels: int) -> dict: + nodes = [] + for i in range(num_nodes): + nodes.append({ + "node_id": i, + "labels": ["Function" if i % 3 == 0 else "Class" if i % 3 == 1 else "Module"], + "properties": { + "qualified_name": f"project.module{i // 100}.Class{i // 10}.method{i}", + "name": f"method{i}", + "start_line": i * 10, + "end_line": i * 10 + 9, + "docstring": f"Method {i} documentation string with some content" if i % 5 == 0 else None, + "decorators": ["staticmethod"] if i % 7 == 0 else [], + "is_exported": i % 4 == 0, + }, + }) + + rels = [] + for i in range(num_rels): + rels.append({ + "from_id": i % num_nodes, + "to_id": (i * 7 + 3) % num_nodes, + "type": "CALLS" if i % 3 == 0 else "DEFINES" if i % 3 == 1 else "IMPORTS", + "properties": {"weight": i % 10} if i % 5 == 0 else {}, + }) + + return { + "nodes": nodes, + "relationships": rels, + "metadata": { + "total_nodes": num_nodes, + "total_relationships": num_rels, + "exported_at": "2026-03-14T10:00:00+00:00", + }, + } + + +def generate_snippets(count: int, avg_length: int = 200) -> list[str]: + import random + import string + random.seed(42) + snippets = [] + for _ in range(count): + length = avg_length + random.randint(-50, 50) + snippet = "".join(random.choices(string.ascii_letters + string.digits + " \n\t", k=length)) + snippets.append(snippet) + return snippets + + +def create_test_files(directory: str, count: int, avg_size_kb: int) -> list[Path]: + paths = [] + for i in range(count): + path = Path(directory) / f"file_{i}.py" + content = os.urandom(avg_size_kb * 1024) + 
path.write_bytes(content) + paths.append(path) + return paths + + +def bench_json_dumps(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data) + return time.perf_counter() - start + + +def bench_orjson_dumps(data: dict) -> float: + start = time.perf_counter() + _ = orjson.dumps(data) + return time.perf_counter() - start + + +def bench_json_dumps_indent(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data, indent=2, ensure_ascii=False) + return time.perf_counter() - start + + +def bench_orjson_dumps_indent(data: dict) -> float: + start = time.perf_counter() + _ = orjson.dumps(data, option=orjson.OPT_INDENT_2) + return time.perf_counter() - start + + +def bench_json_loads(json_bytes: bytes) -> float: + start = time.perf_counter() + _ = json.loads(json_bytes) + return time.perf_counter() - start + + +def bench_orjson_loads(json_bytes: bytes) -> float: + start = time.perf_counter() + _ = orjson.loads(json_bytes) + return time.perf_counter() - start + + +def bench_sha256_hashing(snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = hashlib.sha256(s.encode()).hexdigest() + return time.perf_counter() - start + + +def bench_blake3_hashing(snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = blake3.blake3(s.encode()).hexdigest() + return time.perf_counter() - start + + +def bench_sha256_file(files: list[Path]) -> float: + start = time.perf_counter() + for f in files: + hasher = hashlib.sha256() + with f.open("rb") as fh: + while chunk := fh.read(8192): + hasher.update(chunk) + _ = hasher.hexdigest() + return time.perf_counter() - start + + +def bench_blake3_file(files: list[Path]) -> float: + start = time.perf_counter() + for f in files: + hasher = blake3.blake3() + with f.open("rb") as fh: + while chunk := fh.read(8192): + hasher.update(chunk) + _ = hasher.hexdigest() + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, 
float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<50} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 120) + for r in results: + print( + f"{r['name']:<50} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def print_comparison(baseline: dict[str, float], optimized: dict[str, float]) -> None: + speedup = baseline["median_ms"] / optimized["median_ms"] if optimized["median_ms"] > 0 else float("inf") + print(f" -> Speedup: {speedup:.1f}x (median)") + + +def main() -> None: + print("=" * 120) + print("DROP-IN REPLACEMENT BENCHMARKS: Python stdlib vs Rust-backed alternatives") + print("=" * 120) + + # --- JSON Serialization --- + for num_nodes, num_rels in [(1000, 2000), (5000, 10000), (20000, 50000)]: + print(f"\n{'='*120}") + print(f"JSON Serialization: stdlib json vs orjson (nodes={num_nodes}, rels={num_rels})") + print(f"{'='*120}") + + data = generate_graph_data(num_nodes, num_rels) + json_bytes = json.dumps(data).encode() + orjson_bytes = orjson.dumps(data) + print(f"Data size: {len(json_bytes) / 1024:.1f} KB") + + results = [] + + r1 = run_benchmark(f"json.dumps compact ({num_nodes}n)", bench_json_dumps, data) + results.append(r1) + r2 = run_benchmark(f"orjson.dumps compact ({num_nodes}n)", bench_orjson_dumps, data) + results.append(r2) + + r3 = run_benchmark(f"json.dumps indented ({num_nodes}n)", bench_json_dumps_indent, data) + 
results.append(r3) + r4 = run_benchmark(f"orjson.dumps indented ({num_nodes}n)", bench_orjson_dumps_indent, data) + results.append(r4) + + r5 = run_benchmark(f"json.loads ({num_nodes}n)", bench_json_loads, json_bytes) + results.append(r5) + r6 = run_benchmark(f"orjson.loads ({num_nodes}n)", bench_orjson_loads, orjson_bytes) + results.append(r6) + + print_results(results) + + print("\nSpeedups:") + print(f" dumps compact: {r1['median_ms'] / r2['median_ms']:.1f}x") + print(f" dumps indented: {r3['median_ms'] / r4['median_ms']:.1f}x") + print(f" loads: {r5['median_ms'] / r6['median_ms']:.1f}x") + + # --- Hashing: SHA256 vs BLAKE3 --- + print(f"\n\n{'='*120}") + print("Hashing: hashlib.sha256 vs blake3 (snippet hashing for EmbeddingCache)") + print(f"{'='*120}") + + for size in [500, 2000, 10000]: + snippets = generate_snippets(size) + print(f"\n--- Snippet count: {size} ---") + + results = [] + r1 = run_benchmark(f"hashlib.sha256 ({size} snippets)", bench_sha256_hashing, snippets) + results.append(r1) + r2 = run_benchmark(f"blake3 ({size} snippets)", bench_blake3_hashing, snippets) + results.append(r2) + + print_results(results) + print(f" Speedup: {r1['median_ms'] / r2['median_ms']:.1f}x") + + # --- File Hashing --- + print(f"\n\n{'='*120}") + print("File Hashing: SHA256 vs BLAKE3 (incremental build file change detection)") + print(f"{'='*120}") + + for file_count, avg_size_kb in [(50, 5), (200, 10), (500, 20)]: + with tempfile.TemporaryDirectory() as tmpdir: + files = create_test_files(tmpdir, file_count, avg_size_kb) + total_mb = sum(f.stat().st_size for f in files) / (1024 * 1024) + print(f"\n--- Files: {file_count}, Total: {total_mb:.1f} MB ---") + + results = [] + r1 = run_benchmark(f"sha256 ({file_count}f, {avg_size_kb}KB avg)", bench_sha256_file, files) + results.append(r1) + r2 = run_benchmark(f"blake3 ({file_count}f, {avg_size_kb}KB avg)", bench_blake3_file, files) + results.append(r2) + + print_results(results) + print(f" Speedup: {r1['median_ms'] / 
r2['median_ms']:.1f}x") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_embedding_cache.py b/benchmarks/bench_embedding_cache.py new file mode 100644 index 000000000..b63e93338 --- /dev/null +++ b/benchmarks/bench_embedding_cache.py @@ -0,0 +1,130 @@ +import hashlib +import random +import statistics +import string +import time + +from codebase_rag.embedder import EmbeddingCache + +WARMUP_RUNS = 3 +BENCH_RUNS = 50 +EMBEDDING_DIM = 768 + + +def generate_snippets(count: int, avg_length: int = 200) -> list[str]: + snippets = [] + for i in range(count): + length = avg_length + random.randint(-50, 50) + snippet = "".join(random.choices(string.ascii_letters + string.digits + " \n\t", k=length)) + snippets.append(snippet) + return snippets + + +def generate_embedding() -> list[float]: + return [random.random() for _ in range(EMBEDDING_DIM)] + + +def bench_sha256_hashing(snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = hashlib.sha256(s.encode()).hexdigest() + return time.perf_counter() - start + + +def bench_cache_put(cache: EmbeddingCache, snippets: list[str], embeddings: list[list[float]]) -> float: + start = time.perf_counter() + for s, e in zip(snippets, embeddings): + cache.put(s, e) + return time.perf_counter() - start + + +def bench_cache_get_hit(cache: EmbeddingCache, snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = cache.get(s) + return time.perf_counter() - start + + +def bench_cache_get_miss(cache: EmbeddingCache, miss_snippets: list[str]) -> float: + start = time.perf_counter() + for s in miss_snippets: + _ = cache.get(s) + return time.perf_counter() - start + + +def bench_cache_get_many(cache: EmbeddingCache, snippets: list[str]) -> float: + start = time.perf_counter() + _ = cache.get_many(snippets) + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] 
+ for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<40} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 110) + for r in results: + print( + f"{r['name']:<40} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + random.seed(42) + + sizes = [500, 2000, 10000] + + for size in sizes: + print(f"\n{'='*110}") + print(f"EmbeddingCache Benchmark (n={size})") + print(f"{'='*110}") + + snippets = generate_snippets(size) + embeddings = [generate_embedding() for _ in range(size)] + miss_snippets = generate_snippets(size, avg_length=300) + + results = [] + + r = run_benchmark(f"sha256 hashing ({size})", bench_sha256_hashing, snippets) + results.append(r) + + cache = EmbeddingCache() + r = run_benchmark(f"cache.put ({size})", bench_cache_put, cache, snippets, embeddings) + results.append(r) + + cache = EmbeddingCache() + cache.put_many(snippets, embeddings) + + r = run_benchmark(f"cache.get hit ({size})", bench_cache_get_hit, cache, snippets) + results.append(r) + + r = run_benchmark(f"cache.get miss ({size})", bench_cache_get_miss, cache, miss_snippets) + results.append(r) + + r = run_benchmark(f"cache.get_many ({size})", bench_cache_get_many, cache, snippets) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_file_hashing.py b/benchmarks/bench_file_hashing.py new file mode 100644 index 000000000..3be76059b --- /dev/null +++ 
b/benchmarks/bench_file_hashing.py @@ -0,0 +1,138 @@ +import hashlib +import os +import statistics +import tempfile +import time +from pathlib import Path + +WARMUP_RUNS = 3 +BENCH_RUNS = 30 + + +def create_test_files(directory: str, count: int, avg_size_kb: int) -> list[Path]: + paths = [] + for i in range(count): + path = Path(directory) / f"file_{i}.py" + content = os.urandom(avg_size_kb * 1024) + path.write_bytes(content) + paths.append(path) + return paths + + +def hash_file_sha256(filepath: Path) -> str: + hasher = hashlib.sha256() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def hash_file_sha256_large_buffer(filepath: Path) -> str: + hasher = hashlib.sha256() + with filepath.open("rb") as f: + while chunk := f.read(65536): + hasher.update(chunk) + return hasher.hexdigest() + + +def hash_file_sha256_mmap(filepath: Path) -> str: + import mmap + hasher = hashlib.sha256() + with filepath.open("rb") as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + hasher.update(mm) + return hasher.hexdigest() + + +def hash_file_md5(filepath: Path) -> str: + hasher = hashlib.md5() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def hash_file_blake2b(filepath: Path) -> str: + hasher = hashlib.blake2b() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def bench_hash_files(files: list[Path], hash_func) -> float: + start = time.perf_counter() + for f in files: + _ = hash_func(f) + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": 
statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<45} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 115) + for r in results: + print( + f"{r['name']:<45} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (50, 5), + (200, 10), + (500, 20), + ] + + for file_count, avg_size_kb in configs: + print(f"\n{'='*115}") + print(f"File Hashing Benchmark (files={file_count}, avg_size={avg_size_kb}KB)") + print(f"{'='*115}") + + with tempfile.TemporaryDirectory() as tmpdir: + files = create_test_files(tmpdir, file_count, avg_size_kb) + total_mb = sum(f.stat().st_size for f in files) / (1024 * 1024) + print(f"Total data: {total_mb:.1f} MB") + + results = [] + + r = run_benchmark(f"sha256 8KB buf ({file_count}f)", bench_hash_files, files, hash_file_sha256) + results.append(r) + + r = run_benchmark(f"sha256 64KB buf ({file_count}f)", bench_hash_files, files, hash_file_sha256_large_buffer) + results.append(r) + + r = run_benchmark(f"sha256 mmap ({file_count}f)", bench_hash_files, files, hash_file_sha256_mmap) + results.append(r) + + r = run_benchmark(f"md5 ({file_count}f)", bench_hash_files, files, hash_file_md5) + results.append(r) + + r = run_benchmark(f"blake2b ({file_count}f)", bench_hash_files, files, hash_file_blake2b) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_find_ending_with_fix.py b/benchmarks/bench_find_ending_with_fix.py new file mode 100644 index 000000000..c9ef01cae --- /dev/null +++ b/benchmarks/bench_find_ending_with_fix.py @@ -0,0 +1,218 @@ +import statistics +import time +from 
collections import defaultdict + +from codebase_rag.graph_updater import FunctionRegistryTrie +from codebase_rag.types_defs import NodeType, SimpleNameLookup + +WARMUP_RUNS = 3 +BENCH_RUNS = 30 + + +def generate_realistic_registry(count: int) -> tuple[list[str], list[str]]: + modules = ["codebase_rag", "utils", "parsers", "services", "tools", "models"] + submodules = ["core", "api", "handlers", "helpers", "base", "factory"] + classes = ["Handler", "Manager", "Factory", "Builder", "Processor", "Resolver", + "Analyzer", "Extractor", "Generator", "Validator"] + methods = ["process", "handle", "create", "build", "resolve", "validate", + "execute", "parse", "extract", "transform", "analyze", "generate", + "find", "get", "set", "update", "delete", "check"] + + qualified_names = [] + for i in range(count): + mod = modules[i % len(modules)] + sub = submodules[(i // len(modules)) % len(submodules)] + cls = classes[(i // (len(modules) * len(submodules))) % len(classes)] + meth = methods[(i // (len(modules) * len(submodules) * len(classes))) % len(methods)] + qualified_names.append(f"{mod}.{sub}.{cls}.method_{i}.{meth}") + + lookup_suffixes = methods + [f"method_{i}" for i in range(0, count, count // 20)] + return qualified_names, lookup_suffixes + + +def bench_linear_scan_endswith(entries: dict[str, NodeType], suffix: str) -> float: + start = time.perf_counter() + _ = [qn for qn in entries.keys() if qn.endswith(f".{suffix}")] + return time.perf_counter() - start + + +def bench_indexed_lookup(lookup: SimpleNameLookup, suffix: str) -> float: + start = time.perf_counter() + _ = list(lookup.get(suffix, set())) + return time.perf_counter() - start + + +def bench_trie_find_ending_with_index_hit( + trie: FunctionRegistryTrie, suffixes: list[str], indexed_suffixes: set[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + if suffix in indexed_suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def 
bench_trie_find_ending_with_index_miss( + trie: FunctionRegistryTrie, suffixes: list[str], indexed_suffixes: set[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + if suffix not in indexed_suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def bench_trie_find_ending_with_all( + trie: FunctionRegistryTrie, suffixes: list[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def bench_linear_scan_batch(entries: dict[str, NodeType], suffixes: list[str]) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = [qn for qn in entries.keys() if qn.endswith(f".{suffix}")] + return time.perf_counter() - start + + +def bench_indexed_lookup_batch(lookup: SimpleNameLookup, suffixes: list[str]) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = list(lookup.get(suffix, set())) + return time.perf_counter() - start + + +def bench_full_suffix_index_batch( + suffix_index: dict[str, set[str]], suffixes: list[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = list(suffix_index.get(suffix, set())) + return time.perf_counter() - start + + +def build_full_suffix_index(qualified_names: list[str]) -> dict[str, set[str]]: + index: dict[str, set[str]] = defaultdict(set) + for qn in qualified_names: + simple_name = qn.rsplit(".", 1)[-1] + index[simple_name].add(qn) + return dict(index) + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 
1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<55} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 125) + for r in results: + print( + f"{r['name']:<55} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + print("=" * 125) + print("find_ending_with FIX BENCHMARK: Linear Scan vs Indexed Lookup") + print("This benchmarks the #1 CPU hotspot (48.3% of total runtime)") + print("=" * 125) + + sizes = [1000, 4500, 10000] + + for size in sizes: + print(f"\n{'='*125}") + print(f"Registry size: {size} entries") + print(f"{'='*125}") + + qualified_names, lookup_suffixes = generate_realistic_registry(size) + + simple_lookup: SimpleNameLookup = defaultdict(set) + trie = FunctionRegistryTrie(simple_name_lookup=simple_lookup) + for qn in qualified_names: + trie.insert(qn, NodeType.FUNCTION) + simple_name = qn.rsplit(".", 1)[-1] + simple_lookup[simple_name].add(qn) + + full_suffix_index = build_full_suffix_index(qualified_names) + + partially_indexed_suffixes = set(list(simple_lookup.keys())[:len(simple_lookup) // 5]) + miss_suffixes = [s for s in lookup_suffixes if s not in partially_indexed_suffixes] + + results = [] + + print(f"\nSingle-suffix operations (on '{lookup_suffixes[0]}'):") + r = run_benchmark( + f"LINEAR SCAN endswith ({size} entries)", + bench_linear_scan_endswith, dict(trie.items()), lookup_suffixes[0], + ) + results.append(r) + + r = run_benchmark( + f"INDEXED lookup (hit) ({size} entries)", + bench_indexed_lookup, simple_lookup, lookup_suffixes[0], + ) + results.append(r) + + print_results(results) + if results[1]["median_ms"] > 0: + speedup = results[0]["median_ms"] / results[1]["median_ms"] + print(f"\n -> Index hit speedup: {speedup:.0f}x") + + results = [] + num_queries = len(lookup_suffixes) + print(f"\nBatch operations 
({num_queries} queries, simulating call resolution):") + + r = run_benchmark( + f"LINEAR SCAN batch ({num_queries}q, {size} entries)", + bench_linear_scan_batch, dict(trie.items()), lookup_suffixes, + ) + results.append(r) + + r = run_benchmark( + f"PARTIAL INDEX batch ({num_queries}q, {size} entries)", + bench_trie_find_ending_with_all, trie, lookup_suffixes, + ) + results.append(r) + + r = run_benchmark( + f"FULL SUFFIX INDEX batch ({num_queries}q, {size} entries)", + bench_full_suffix_index_batch, full_suffix_index, lookup_suffixes, + ) + results.append(r) + + print_results(results) + + if results[2]["median_ms"] > 0: + print(f"\n -> Linear scan vs full index: {results[0]['median_ms'] / results[2]['median_ms']:.0f}x speedup") + print(f" -> Partial index vs full index: {results[1]['median_ms'] / results[2]['median_ms']:.1f}x speedup") + + print(f"\n\n{'='*125}") + print("CONCLUSION: The 48.3% CPU hotspot is caused by linear scans on index misses.") + print("Building a complete suffix index eliminates the bottleneck entirely.") + print("This is a pure Python fix requiring zero FFI, zero new dependencies.") + print(f"{'='*125}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_graph_loader.py b/benchmarks/bench_graph_loader.py new file mode 100644 index 000000000..f93ccd7a4 --- /dev/null +++ b/benchmarks/bench_graph_loader.py @@ -0,0 +1,169 @@ +import json +import statistics +import tempfile +import time +from pathlib import Path + +from codebase_rag.graph_loader import GraphLoader + +WARMUP_RUNS = 2 +BENCH_RUNS = 20 + + +def generate_graph_json(num_nodes: int, num_rels: int) -> str: + nodes = [] + for i in range(num_nodes): + nodes.append({ + "node_id": i, + "labels": ["Function" if i % 3 == 0 else "Class" if i % 3 == 1 else "Module"], + "properties": { + "qualified_name": f"project.module{i // 100}.Class{i // 10}.method{i}", + "name": f"method{i}", + "start_line": i * 10, + "end_line": i * 10 + 9, + }, + }) + + rels = [] + for i in 
range(num_rels): + rels.append({ + "from_id": i % num_nodes, + "to_id": (i * 7 + 3) % num_nodes, + "type": "CALLS" if i % 2 == 0 else "DEFINES", + "properties": {}, + }) + + graph = { + "nodes": nodes, + "relationships": rels, + "metadata": { + "total_nodes": num_nodes, + "total_relationships": num_rels, + }, + } + return json.dumps(graph) + + +def bench_json_parse(json_str: str) -> float: + start = time.perf_counter() + _ = json.loads(json_str) + return time.perf_counter() - start + + +def bench_graph_load(file_path: str) -> float: + start = time.perf_counter() + loader = GraphLoader(file_path) + loader.load() + return time.perf_counter() - start + + +def bench_find_nodes_by_label(loader: GraphLoader) -> float: + labels = ["Function", "Class", "Module"] + start = time.perf_counter() + for label in labels: + _ = loader.find_nodes_by_label(label) + return time.perf_counter() - start + + +def bench_find_node_by_property(loader: GraphLoader) -> float: + start = time.perf_counter() + for i in range(100): + qn = f"project.module{i}.Class{i * 10 // 10}.method{i * 10}" + _ = loader.find_node_by_property("qualified_name", qn) + return time.perf_counter() - start + + +def bench_get_relationships(loader: GraphLoader, num_nodes: int) -> float: + start = time.perf_counter() + for i in range(min(500, num_nodes)): + _ = loader.get_relationships_for_node(i) + return time.perf_counter() - start + + +def bench_summary(loader: GraphLoader) -> float: + start = time.perf_counter() + _ = loader.summary() + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": 
sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<40} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 110) + for r in results: + print( + f"{r['name']:<40} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (1000, 2000), + (5000, 10000), + (20000, 50000), + ] + + for num_nodes, num_rels in configs: + print(f"\n{'='*110}") + print(f"GraphLoader Benchmark (nodes={num_nodes}, rels={num_rels})") + print(f"{'='*110}") + + json_str = generate_graph_json(num_nodes, num_rels) + print(f"JSON size: {len(json_str) / 1024:.1f} KB") + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + tmp.write(json_str) + tmp_path = tmp.name + + results = [] + + r = run_benchmark(f"json.loads ({num_nodes}n)", bench_json_parse, json_str) + results.append(r) + + r = run_benchmark(f"GraphLoader.load ({num_nodes}n)", bench_graph_load, tmp_path) + results.append(r) + + loader = GraphLoader(tmp_path) + loader.load() + + r = run_benchmark(f"find_nodes_by_label ({num_nodes}n)", bench_find_nodes_by_label, loader) + results.append(r) + + r = run_benchmark(f"find_node_by_property ({num_nodes}n)", bench_find_node_by_property, loader) + results.append(r) + + r = run_benchmark(f"get_relationships ({num_nodes}n)", bench_get_relationships, loader, num_nodes) + results.append(r) + + r = run_benchmark(f"summary ({num_nodes}n)", bench_summary, loader) + results.append(r) + + print_results(results) + + Path(tmp_path).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_json_serialization.py b/benchmarks/bench_json_serialization.py new file mode 100644 index 000000000..98fc477f7 --- /dev/null +++ b/benchmarks/bench_json_serialization.py @@ -0,0 +1,159 @@ 
+import json +import statistics +import tempfile +import time +from pathlib import Path + +WARMUP_RUNS = 3 +BENCH_RUNS = 20 + + +def generate_graph_data(num_nodes: int, num_rels: int) -> dict: + nodes = [] + for i in range(num_nodes): + nodes.append({ + "id": i, + "labels": ["Function" if i % 3 == 0 else "Class" if i % 3 == 1 else "Module"], + "properties": { + "qualified_name": f"project.module{i // 100}.Class{i // 10}.method{i}", + "name": f"method{i}", + "start_line": i * 10, + "end_line": i * 10 + 9, + "docstring": f"Method {i} documentation string with some content" if i % 5 == 0 else None, + "decorators": ["staticmethod"] if i % 7 == 0 else [], + "is_exported": i % 4 == 0, + }, + }) + + rels = [] + for i in range(num_rels): + rels.append({ + "from_id": i % num_nodes, + "to_id": (i * 7 + 3) % num_nodes, + "type": "CALLS" if i % 3 == 0 else "DEFINES" if i % 3 == 1 else "IMPORTS", + "properties": {"weight": i % 10} if i % 5 == 0 else {}, + }) + + return { + "nodes": nodes, + "relationships": rels, + "metadata": { + "total_nodes": num_nodes, + "total_relationships": num_rels, + "exported_at": "2026-03-14T10:00:00+00:00", + }, + } + + +def bench_json_dumps(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data) + return time.perf_counter() - start + + +def bench_json_dumps_indent(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data, indent=2, ensure_ascii=False) + return time.perf_counter() - start + + +def bench_json_loads(json_str: str) -> float: + start = time.perf_counter() + _ = json.loads(json_str) + return time.perf_counter() - start + + +def bench_json_dump_file(data: dict, path: str) -> float: + start = time.perf_counter() + with open(path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + return time.perf_counter() - start + + +def bench_json_load_file(path: str) -> float: + start = time.perf_counter() + with open(path, encoding="utf-8") as f: + _ = json.load(f) + return 
time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<45} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 115) + for r in results: + print( + f"{r['name']:<45} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (1000, 2000), + (5000, 10000), + (20000, 50000), + ] + + for num_nodes, num_rels in configs: + print(f"\n{'='*115}") + print(f"JSON Serialization Benchmark (nodes={num_nodes}, rels={num_rels})") + print(f"{'='*115}") + + data = generate_graph_data(num_nodes, num_rels) + json_str = json.dumps(data) + json_str_indented = json.dumps(data, indent=2, ensure_ascii=False) + print(f"Compact JSON: {len(json_str) / 1024:.1f} KB, Indented: {len(json_str_indented) / 1024:.1f} KB") + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + json.dump(data, tmp, indent=2, ensure_ascii=False) + tmp_path = tmp.name + + results = [] + + r = run_benchmark(f"json.dumps compact ({num_nodes}n)", bench_json_dumps, data) + results.append(r) + + r = run_benchmark(f"json.dumps indented ({num_nodes}n)", bench_json_dumps_indent, data) + results.append(r) + + r = run_benchmark(f"json.loads compact ({num_nodes}n)", bench_json_loads, json_str) + results.append(r) + + r = run_benchmark(f"json.loads indented 
def generate_file_paths(repo_root: str, count: int) -> list[str]:
    """Return ``count`` synthetic source-file paths located under ``repo_root``.

    The top-level directory changes with every index, the sub-directory
    changes once all top-level directories have been used, and the extension
    changes once every directory/sub-directory pair has been used.
    """
    top_level = ["src", "lib", "utils", "core", "parsers", "services", "tools", "tests"]
    nested = ["base", "handlers", "helpers", "models", "schemas", "config"]
    suffixes = [".py", ".js", ".ts", ".rs", ".go", ".java", ".cpp"]

    def build(index: int) -> str:
        # Peel the index apart like a mixed-radix number:
        # directory first, then sub-directory, then extension.
        outer, top_idx = divmod(index, len(top_level))
        ext_cycle, nested_idx = divmod(outer, len(nested))
        extension = suffixes[ext_cycle % len(suffixes)]
        return f"{repo_root}/{top_level[top_idx]}/{nested[nested_idx]}/module_{index}{extension}"

    return [build(index) for index in range(count)]
def run_benchmark(name: str, func, *args, warmup_runs=None, bench_runs=None) -> dict[str, float]:
    """Run ``func(*args)`` repeatedly and summarise its timings in milliseconds.

    ``func`` is expected to return the elapsed time of one run in seconds;
    the collected samples are scaled by 1000 for reporting.

    Args:
        name: Label stored under the ``"name"`` key of the result.
        func: Callable invoked as ``func(*args)``; returns seconds as a float.
        *args: Positional arguments forwarded to ``func`` on every call.
        warmup_runs: Untimed calls made first. Defaults to ``WARMUP_RUNS``.
        bench_runs: Number of recorded calls. Defaults to ``BENCH_RUNS``.

    Returns:
        A dict with ``name`` plus ``median_ms``, ``mean_ms``, ``stddev_ms``,
        ``min_ms``, ``max_ms`` and ``p95_ms``.

    Raises:
        ValueError: If fewer than one recorded run is requested.
    """
    warmups = WARMUP_RUNS if warmup_runs is None else warmup_runs
    runs = BENCH_RUNS if bench_runs is None else bench_runs
    if runs < 1:
        raise ValueError("bench_runs must be at least 1")

    for _ in range(warmups):
        func(*args)

    times = [func(*args) for _ in range(runs)]
    ordered = sorted(times)

    # Nearest-rank percentile: rank = ceil(0.95 * n), converted to a 0-based
    # index. int(n * 0.95) only matches this for some sample counts and
    # degenerates to the maximum for others (e.g. n=20).
    p95_index = (95 * runs + 99) // 100 - 1

    return {
        "name": name,
        "median_ms": statistics.median(times) * 1000,
        "mean_ms": statistics.mean(times) * 1000,
        # stdev needs at least two samples.
        "stddev_ms": statistics.stdev(times) * 1000 if runs > 1 else 0.0,
        "min_ms": ordered[0] * 1000,
        "max_ms": ordered[-1] * 1000,
        "p95_ms": ordered[p95_index] * 1000,
    }
# Splits qualified names on ".", ":" or "::".
# "::" must come first in the alternation: the regex engine tries
# alternatives left to right, so with "[.:]" first the "::" branch was
# unreachable and "a::b" split into ["a", "", "b"].
SEPARATOR_PATTERN = re.compile(r"::|[.:]")
def bench_str_endswith(names: list[str]) -> float:
    """Time one ``str.endswith`` call for every (name, suffix) combination.

    Returns the elapsed wall-clock time in seconds. The suffix list is built
    before the clock starts, so only the checks themselves are timed.
    """
    candidate_endings = [".process", ".handle", ".create", ".build", ".resolve"]
    began = time.perf_counter()
    for qualified_name in names:
        # One call per suffix on purpose: the benchmark measures the
        # per-call cost rather than the tuple-argument form.
        for ending in candidate_endings:
            _ = qualified_name.endswith(ending)
    elapsed = time.perf_counter() - began
    return elapsed
def run_benchmark(name: str, func, *args, warmup_runs=None, bench_runs=None) -> dict[str, float]:
    """Run ``func(*args)`` repeatedly and summarise its timings in milliseconds.

    ``func`` is expected to return the elapsed time of one run in seconds;
    the collected samples are scaled by 1000 for reporting.

    Args:
        name: Label stored under the ``"name"`` key of the result.
        func: Callable invoked as ``func(*args)``; returns seconds as a float.
        *args: Positional arguments forwarded to ``func`` on every call.
        warmup_runs: Untimed calls made first. Defaults to ``WARMUP_RUNS``.
        bench_runs: Number of recorded calls. Defaults to ``BENCH_RUNS``.

    Returns:
        A dict with ``name`` plus ``median_ms``, ``mean_ms``, ``stddev_ms``,
        ``min_ms``, ``max_ms`` and ``p95_ms``.

    Raises:
        ValueError: If fewer than one recorded run is requested.
    """
    warmups = WARMUP_RUNS if warmup_runs is None else warmup_runs
    runs = BENCH_RUNS if bench_runs is None else bench_runs
    if runs < 1:
        raise ValueError("bench_runs must be at least 1")

    for _ in range(warmups):
        func(*args)

    times = [func(*args) for _ in range(runs)]
    ordered = sorted(times)

    # Nearest-rank percentile: rank = ceil(0.95 * n), converted to a 0-based
    # index. The previous int(n * 0.95) index was one too high for n=100
    # (it picked the 96th sample instead of the 95th).
    p95_index = (95 * runs + 99) // 100 - 1

    return {
        "name": name,
        "median_ms": statistics.median(times) * 1000,
        "mean_ms": statistics.mean(times) * 1000,
        # stdev needs at least two samples.
        "stddev_ms": statistics.stdev(times) * 1000 if runs > 1 else 0.0,
        "min_ms": ordered[0] * 1000,
        "max_ms": ordered[-1] * 1000,
        "p95_ms": ordered[p95_index] * 1000,
    }
def generate_qualified_names(count: int) -> list[str]:
    """Return ``count`` synthetic dotted names of the form ``mod.Class.subN.method``.

    The module segment changes with every index, the class segment once all
    modules have been used, and the method segment once every module/class
    pair has been used. ``subN`` embeds the index, so all names are unique.
    """
    modules = ["project", "utils", "core", "api", "services", "models"]
    classes = ["Handler", "Manager", "Factory", "Builder", "Processor", "Resolver"]
    methods = ["process", "handle", "create", "build", "resolve", "validate", "execute"]

    def qualified(index: int) -> str:
        # Mixed-radix decomposition of the index: module, then class, then method.
        after_module, module_idx = divmod(index, len(modules))
        method_cycle, class_idx = divmod(after_module, len(classes))
        method = methods[method_cycle % len(methods)]
        return f"{modules[module_idx]}.{classes[class_idx]}.sub{index}.{method}"

    return [qualified(index) for index in range(count)]
def run_benchmark(name: str, func, *args, warmup_runs=None, bench_runs=None) -> dict[str, float]:
    """Run ``func(*args)`` repeatedly and summarise its timings in milliseconds.

    ``func`` is expected to return the elapsed time of one run in seconds;
    the collected samples are scaled by 1000 for reporting.

    Args:
        name: Label stored under the ``"name"`` key of the result.
        func: Callable invoked as ``func(*args)``; returns seconds as a float.
        *args: Positional arguments forwarded to ``func`` on every call.
        warmup_runs: Untimed calls made first. Defaults to ``WARMUP_RUNS``.
        bench_runs: Number of recorded calls. Defaults to ``BENCH_RUNS``.

    Returns:
        A dict with ``name`` plus ``median_ms``, ``mean_ms``, ``stddev_ms``,
        ``min_ms``, ``max_ms`` and ``p95_ms``.

    Raises:
        ValueError: If fewer than one recorded run is requested.
    """
    warmups = WARMUP_RUNS if warmup_runs is None else warmup_runs
    runs = BENCH_RUNS if bench_runs is None else bench_runs
    if runs < 1:
        raise ValueError("bench_runs must be at least 1")

    for _ in range(warmups):
        func(*args)

    times = [func(*args) for _ in range(runs)]
    ordered = sorted(times)

    # Nearest-rank percentile: rank = ceil(0.95 * n), converted to a 0-based
    # index. int(n * 0.95) only matches this for some sample counts and
    # degenerates to the maximum for others (e.g. n=20).
    p95_index = (95 * runs + 99) // 100 - 1

    return {
        "name": name,
        "median_ms": statistics.median(times) * 1000,
        "mean_ms": statistics.mean(times) * 1000,
        # stdev needs at least two samples.
        "stddev_ms": statistics.stdev(times) * 1000 if runs > 1 else 0.0,
        "min_ms": ordered[0] * 1000,
        "max_ms": ordered[-1] * 1000,
        "p95_ms": ordered[p95_index] * 1000,
    }
+ results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/results/bench_ast_cache_20260315_000043.txt b/benchmarks/results/bench_ast_cache_20260315_000043.txt new file mode 100644 index 000000000..5084d79ef --- /dev/null +++ b/benchmarks/results/bench_ast_cache_20260315_000043.txt @@ -0,0 +1,42 @@ +Benchmark: bench_ast_cache.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 2.2s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +=================================================================================================================== +BoundedASTCache Benchmark (entries=500, item_size=1024B) +=================================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +insert (500) 1.119ms 1.128ms 0.020ms 1.113ms 1.229ms 1.158ms +lookup (500) 0.019ms 0.019ms 0.000ms 0.018ms 0.019ms 0.019ms +access+LRU (500) 0.053ms 0.053ms 0.000ms 0.053ms 0.056ms 0.053ms +insert+evict (max=250) 1.141ms 1.155ms 0.092ms 1.133ms 1.792ms 1.158ms +getsizeof scan (500) 0.062ms 0.062ms 0.001ms 0.061ms 0.067ms 0.062ms + +=================================================================================================================== +BoundedASTCache Benchmark (entries=2000, item_size=4096B) +=================================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +insert (2000) 4.717ms 4.798ms 0.248ms 4.591ms 5.567ms 5.558ms +lookup (2000) 0.077ms 0.077ms 0.000ms 0.076ms 0.078ms 0.077ms +access+LRU (2000) 0.214ms 0.214ms 0.001ms 
0.213ms 0.217ms 0.216ms +insert+evict (max=1000) 4.768ms 4.814ms 0.221ms 4.614ms 5.870ms 5.103ms +getsizeof scan (2000) 0.257ms 0.259ms 0.005ms 0.254ms 0.279ms 0.269ms + +=================================================================================================================== +BoundedASTCache Benchmark (entries=5000, item_size=8192B) +=================================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +insert (5000) 12.829ms 13.137ms 0.611ms 12.561ms 14.340ms 14.280ms +lookup (5000) 0.206ms 0.206ms 0.002ms 0.203ms 0.210ms 0.209ms +access+LRU (5000) 0.551ms 0.552ms 0.005ms 0.544ms 0.565ms 0.563ms +insert+evict (max=2500) 12.558ms 12.992ms 0.936ms 12.246ms 16.534ms 14.787ms +getsizeof scan (5000) 0.681ms 0.686ms 0.027ms 0.651ms 0.812ms 0.740ms diff --git a/benchmarks/results/bench_embedding_cache_20260315_000043.txt b/benchmarks/results/bench_embedding_cache_20260315_000043.txt new file mode 100644 index 000000000..807a58402 --- /dev/null +++ b/benchmarks/results/bench_embedding_cache_20260315_000043.txt @@ -0,0 +1,42 @@ +Benchmark: bench_embedding_cache.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 3.4s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +============================================================================================================== +EmbeddingCache Benchmark (n=500) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +sha256 hashing (500) 0.155ms 0.151ms 0.006ms 0.143ms 0.161ms 0.159ms +cache.put (500) 
0.182ms 0.182ms 0.002ms 0.179ms 0.187ms 0.185ms +cache.get hit (500) 0.177ms 0.177ms 0.001ms 0.176ms 0.180ms 0.179ms +cache.get miss (500) 0.190ms 0.192ms 0.003ms 0.189ms 0.207ms 0.195ms +cache.get_many (500) 0.190ms 0.190ms 0.001ms 0.189ms 0.193ms 0.191ms + +============================================================================================================== +EmbeddingCache Benchmark (n=2000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +sha256 hashing (2000) 0.562ms 0.564ms 0.006ms 0.557ms 0.581ms 0.576ms +cache.put (2000) 0.751ms 0.760ms 0.027ms 0.738ms 0.918ms 0.794ms +cache.get hit (2000) 0.729ms 0.732ms 0.009ms 0.719ms 0.765ms 0.748ms +cache.get miss (2000) 0.797ms 0.801ms 0.026ms 0.771ms 0.866ms 0.839ms +cache.get_many (2000) 0.798ms 0.808ms 0.028ms 0.777ms 0.888ms 0.856ms + +============================================================================================================== +EmbeddingCache Benchmark (n=10000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +sha256 hashing (10000) 2.884ms 2.875ms 0.034ms 2.815ms 2.950ms 2.921ms +cache.put (10000) 3.790ms 3.786ms 0.024ms 3.729ms 3.827ms 3.821ms +cache.get hit (10000) 3.690ms 3.697ms 0.029ms 3.653ms 3.775ms 3.750ms +cache.get miss (10000) 3.939ms 3.943ms 0.041ms 3.878ms 4.079ms 4.018ms +cache.get_many (10000) 3.987ms 3.989ms 0.023ms 3.948ms 4.051ms 4.041ms diff --git a/benchmarks/results/bench_file_hashing_20260315_000043.txt b/benchmarks/results/bench_file_hashing_20260315_000043.txt new file mode 100644 index 000000000..6346ad2f7 --- /dev/null +++ 
b/benchmarks/results/bench_file_hashing_20260315_000043.txt @@ -0,0 +1,45 @@ +Benchmark: bench_file_hashing.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 4.4s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +=================================================================================================================== +File Hashing Benchmark (files=50, avg_size=5KB) +=================================================================================================================== +Total data: 0.2 MB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +sha256 8KB buf (50f) 1.006ms 1.016ms 0.043ms 0.977ms 1.186ms 1.146ms +sha256 64KB buf (50f) 1.075ms 1.070ms 0.016ms 1.036ms 1.106ms 1.090ms +sha256 mmap (50f) 1.356ms 1.355ms 0.033ms 1.299ms 1.453ms 1.395ms +md5 (50f) 1.310ms 1.374ms 0.171ms 1.191ms 1.878ms 1.727ms +blake2b (50f) 1.201ms 1.253ms 0.147ms 1.106ms 1.718ms 1.632ms + +=================================================================================================================== +File Hashing Benchmark (files=200, avg_size=10KB) +=================================================================================================================== +Total data: 2.0 MB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +sha256 8KB buf (200f) 4.587ms 4.777ms 0.512ms 4.377ms 6.201ms 6.185ms +sha256 64KB buf (200f) 4.729ms 4.819ms 0.285ms 4.557ms 5.794ms 5.706ms +sha256 mmap (200f) 5.984ms 8.714ms 11.275ms 5.650ms 63.888ms 29.536ms +md5 (200f) 6.532ms 6.547ms 0.143ms 6.367ms 6.993ms 6.804ms +blake2b (200f) 5.217ms 5.289ms 0.272ms 5.068ms 6.416ms 6.003ms + 
+=================================================================================================================== +File Hashing Benchmark (files=500, avg_size=20KB) +=================================================================================================================== +Total data: 9.8 MB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +sha256 8KB buf (500f) 13.926ms 14.170ms 0.910ms 13.581ms 18.406ms 15.773ms +sha256 64KB buf (500f) 14.268ms 14.312ms 0.253ms 13.957ms 15.319ms 14.640ms +sha256 mmap (500f) 16.699ms 20.110ms 15.978ms 16.299ms 104.163ms 25.618ms +md5 (500f) 23.512ms 23.670ms 0.567ms 23.157ms 25.836ms 25.075ms +blake2b (500f) 17.669ms 17.783ms 0.496ms 17.229ms 19.433ms 18.815ms diff --git a/benchmarks/results/bench_graph_loader_20260315_000043.txt b/benchmarks/results/bench_graph_loader_20260315_000043.txt new file mode 100644 index 000000000..d9cd28a0b --- /dev/null +++ b/benchmarks/results/bench_graph_loader_20260315_000043.txt @@ -0,0 +1,48 @@ +Benchmark: bench_graph_loader.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 2.9s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +============================================================================================================== +GraphLoader Benchmark (nodes=1000, rels=2000) +============================================================================================================== +JSON size: 298.2 KB + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +json.loads (1000n) 1.001ms 1.011ms 0.029ms 0.974ms 1.071ms 1.071ms +GraphLoader.load (1000n) 2.040ms 2.143ms 0.583ms 1.865ms 4.581ms 4.581ms +find_nodes_by_label (1000n) 0.001ms 0.001ms 0.000ms 0.000ms 0.001ms 
0.001ms +find_node_by_property (1000n) 0.030ms 0.030ms 0.000ms 0.029ms 0.030ms 0.030ms +get_relationships (1000n) 0.148ms 0.148ms 0.001ms 0.146ms 0.151ms 0.151ms +summary (1000n) 0.069ms 0.070ms 0.001ms 0.068ms 0.073ms 0.073ms + +============================================================================================================== +GraphLoader Benchmark (nodes=5000, rels=10000) +============================================================================================================== +JSON size: 1537.8 KB + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +json.loads (5000n) 5.032ms 5.002ms 0.112ms 4.843ms 5.180ms 5.180ms +GraphLoader.load (5000n) 10.106ms 11.137ms 2.030ms 9.396ms 14.997ms 14.997ms +find_nodes_by_label (5000n) 0.000ms 0.000ms 0.000ms 0.000ms 0.001ms 0.001ms +find_node_by_property (5000n) 0.030ms 0.030ms 0.000ms 0.030ms 0.030ms 0.030ms +get_relationships (5000n) 0.150ms 0.152ms 0.005ms 0.148ms 0.170ms 0.170ms +summary (5000n) 0.350ms 0.356ms 0.018ms 0.341ms 0.420ms 0.420ms + +============================================================================================================== +GraphLoader Benchmark (nodes=20000, rels=50000) +============================================================================================================== +JSON size: 6979.7 KB + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +json.loads (20000n) 24.136ms 24.783ms 2.550ms 23.565ms 35.321ms 35.321ms +GraphLoader.load (20000n) 61.008ms 62.676ms 5.050ms 57.534ms 75.337ms 75.337ms +find_nodes_by_label (20000n) 0.000ms 0.000ms 0.000ms 0.000ms 0.001ms 0.001ms +find_node_by_property (20000n) 0.030ms 0.030ms 0.000ms 0.030ms 0.030ms 0.030ms +get_relationships (20000n) 0.152ms 0.153ms 0.001ms 0.151ms 0.155ms 0.155ms +summary (20000n) 1.738ms 1.745ms 
0.023ms 1.714ms 1.819ms 1.819ms diff --git a/benchmarks/results/bench_json_serialization_20260315_000043.txt b/benchmarks/results/bench_json_serialization_20260315_000043.txt new file mode 100644 index 000000000..aab002921 --- /dev/null +++ b/benchmarks/results/bench_json_serialization_20260315_000043.txt @@ -0,0 +1,48 @@ +Benchmark: bench_json_serialization.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 18.8s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +=================================================================================================================== +JSON Serialization Benchmark (nodes=1000, rels=2000) +=================================================================================================================== +Compact JSON: 366.8 KB, Indented: 547.7 KB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +json.dumps compact (1000n) 1.089ms 1.094ms 0.010ms 1.084ms 1.117ms 1.117ms +json.dumps indented (1000n) 9.612ms 9.703ms 0.220ms 9.560ms 10.479ms 10.479ms +json.loads compact (1000n) 1.202ms 1.202ms 0.015ms 1.185ms 1.260ms 1.260ms +json.loads indented (1000n) 1.286ms 1.281ms 0.023ms 1.253ms 1.325ms 1.325ms +json.dump to file (1000n) 12.239ms 12.241ms 0.071ms 12.145ms 12.398ms 12.398ms +json.load from file (1000n) 1.345ms 1.350ms 0.036ms 1.309ms 1.429ms 1.429ms + +=================================================================================================================== +JSON Serialization Benchmark (nodes=5000, rels=10000) +=================================================================================================================== +Compact JSON: 1881.4 KB, Indented: 2786.1 KB + +Benchmark Median Mean StdDev Min Max P95 
+------------------------------------------------------------------------------------------------------------------- +json.dumps compact (5000n) 5.701ms 5.718ms 0.158ms 5.464ms 6.000ms 6.000ms +json.dumps indented (5000n) 47.875ms 47.950ms 0.285ms 47.618ms 48.611ms 48.611ms +json.loads compact (5000n) 6.291ms 6.327ms 0.244ms 5.999ms 6.754ms 6.754ms +json.loads indented (5000n) 6.686ms 6.666ms 0.263ms 6.346ms 7.152ms 7.152ms +json.dump to file (5000n) 60.552ms 60.895ms 1.262ms 60.082ms 64.565ms 64.565ms +json.load from file (5000n) 6.573ms 6.590ms 0.049ms 6.528ms 6.717ms 6.717ms + +=================================================================================================================== +JSON Serialization Benchmark (nodes=20000, rels=50000) +=================================================================================================================== +Compact JSON: 8381.6 KB, Indented: 12363.2 KB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +json.dumps compact (20000n) 25.446ms 25.483ms 0.156ms 25.314ms 25.797ms 25.797ms +json.dumps indented (20000n) 215.190ms 215.593ms 1.383ms 214.183ms 219.350ms 219.350ms +json.loads compact (20000n) 28.713ms 28.731ms 0.480ms 28.049ms 30.253ms 30.253ms +json.loads indented (20000n) 30.416ms 30.558ms 0.813ms 29.707ms 32.258ms 32.258ms +json.dump to file (20000n) 271.376ms 271.918ms 3.051ms 266.710ms 278.494ms 278.494ms +json.load from file (20000n) 32.144ms 33.111ms 3.488ms 31.594ms 47.762ms 47.762ms diff --git a/benchmarks/results/bench_string_ops_20260315_000043.txt b/benchmarks/results/bench_string_ops_20260315_000043.txt new file mode 100644 index 000000000..66c1bcd8b --- /dev/null +++ b/benchmarks/results/bench_string_ops_20260315_000043.txt @@ -0,0 +1,51 @@ +Benchmark: bench_string_ops.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 3.2s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 
17.0.6 ] +================================================================================ + +============================================================================================================== +String Operations Benchmark (n=1000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +str.split (1000) 0.079ms 0.079ms 0.001ms 0.077ms 0.083ms 0.082ms +str.endswith (1000) 0.179ms 0.181ms 0.006ms 0.174ms 0.219ms 0.188ms +str.startswith (1000) 0.146ms 0.147ms 0.003ms 0.144ms 0.165ms 0.150ms +str.join (1000) 0.036ms 0.036ms 0.001ms 0.035ms 0.047ms 0.039ms +str.replace (1000) 0.014ms 0.014ms 0.000ms 0.014ms 0.016ms 0.014ms +regex split (1000) 0.418ms 0.420ms 0.006ms 0.414ms 0.437ms 0.431ms +f-string format (1000) 0.029ms 0.029ms 0.000ms 0.029ms 0.032ms 0.029ms +import_distance (1000) 0.164ms 0.165ms 0.004ms 0.162ms 0.185ms 0.171ms + +============================================================================================================== +String Operations Benchmark (n=5000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +str.split (5000) 0.380ms 0.380ms 0.003ms 0.371ms 0.395ms 0.387ms +str.endswith (5000) 0.897ms 0.899ms 0.004ms 0.892ms 0.919ms 0.909ms +str.startswith (5000) 0.722ms 0.723ms 0.003ms 0.715ms 0.733ms 0.728ms +str.join (5000) 0.185ms 0.187ms 0.005ms 0.184ms 0.234ms 0.191ms +str.replace (5000) 0.071ms 0.071ms 0.001ms 0.070ms 0.074ms 0.071ms +regex split (5000) 2.033ms 2.037ms 0.023ms 1.984ms 2.103ms 2.076ms +f-string format (5000) 0.146ms 0.147ms 0.002ms 0.145ms 0.154ms 0.150ms +import_distance (5000) 0.781ms 
0.773ms 0.014ms 0.752ms 0.797ms 0.790ms + +============================================================================================================== +String Operations Benchmark (n=20000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +str.split (20000) 1.588ms 1.590ms 0.014ms 1.559ms 1.626ms 1.612ms +str.endswith (20000) 3.582ms 3.619ms 0.147ms 3.497ms 4.883ms 3.803ms +str.startswith (20000) 2.920ms 2.926ms 0.031ms 2.876ms 3.064ms 3.005ms +str.join (20000) 0.733ms 0.735ms 0.015ms 0.719ms 0.850ms 0.752ms +str.replace (20000) 0.287ms 0.288ms 0.009ms 0.282ms 0.374ms 0.293ms +regex split (20000) 8.051ms 8.047ms 0.068ms 7.924ms 8.195ms 8.174ms +f-string format (20000) 0.593ms 0.594ms 0.006ms 0.582ms 0.624ms 0.603ms +import_distance (20000) 3.183ms 3.184ms 0.039ms 3.129ms 3.315ms 3.262ms diff --git a/benchmarks/results/bench_trie_20260315_000043.txt b/benchmarks/results/bench_trie_20260315_000043.txt new file mode 100644 index 000000000..10ad3978e --- /dev/null +++ b/benchmarks/results/bench_trie_20260315_000043.txt @@ -0,0 +1,54 @@ +Benchmark: bench_trie.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 9.3s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=1000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (1000) 0.340ms 0.341ms 0.012ms 0.327ms 0.385ms 0.378ms +lookup (1000) 0.036ms 0.036ms 
0.000ms 0.035ms 0.037ms 0.036ms +find_ending_with (1000) 0.004ms 0.005ms 0.004ms 0.004ms 0.031ms 0.004ms +find_with_prefix (1000) 0.390ms 0.425ms 0.059ms 0.369ms 0.589ms 0.528ms +delete 25% (1000) 0.407ms 0.418ms 0.021ms 0.394ms 0.457ms 0.449ms + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=5000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (5000) 1.795ms 1.797ms 0.037ms 1.721ms 1.911ms 1.876ms +lookup (5000) 0.195ms 0.196ms 0.002ms 0.193ms 0.201ms 0.200ms +find_ending_with (5000) 0.019ms 0.019ms 0.000ms 0.018ms 0.021ms 0.019ms +find_with_prefix (5000) 2.104ms 2.299ms 1.047ms 2.024ms 9.499ms 2.416ms +delete 25% (5000) 2.116ms 2.122ms 0.048ms 2.043ms 2.260ms 2.214ms + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=10000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (10000) 3.709ms 3.735ms 0.106ms 3.627ms 4.244ms 3.912ms +lookup (10000) 0.402ms 0.403ms 0.003ms 0.398ms 0.412ms 0.407ms +find_ending_with (10000) 0.046ms 0.046ms 0.002ms 0.045ms 0.056ms 0.050ms +find_with_prefix (10000) 4.244ms 4.630ms 1.843ms 3.904ms 13.674ms 5.386ms +delete 25% (10000) 4.204ms 4.207ms 0.066ms 3.959ms 4.349ms 4.312ms + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=50000) 
+========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (50000) 18.036ms 18.128ms 0.306ms 17.831ms 18.972ms 18.820ms +lookup (50000) 2.058ms 2.061ms 0.013ms 2.036ms 2.091ms 2.085ms +find_ending_with (50000) 0.420ms 0.426ms 0.014ms 0.412ms 0.477ms 0.458ms +find_with_prefix (50000) 38.507ms 38.096ms 10.219ms 22.462ms 56.890ms 52.739ms +delete 25% (50000) 21.744ms 21.830ms 0.410ms 21.277ms 23.496ms 22.524ms diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py new file mode 100644 index 000000000..a79c339ab --- /dev/null +++ b/benchmarks/run_all.py @@ -0,0 +1,74 @@ +import subprocess +import sys +import time +from pathlib import Path + +BENCHMARKS = [ + "bench_string_ops.py", + "bench_trie.py", + "bench_find_ending_with_fix.py", + "bench_dropin_replacements.py", + "bench_graph_loader.py", + "bench_file_hashing.py", + "bench_embedding_cache.py", + "bench_json_serialization.py", + "bench_ast_cache.py", + "bench_pathlib_vs_string.py", +] + + +def main() -> None: + bench_dir = Path(__file__).parent + results_dir = bench_dir / "results" + results_dir.mkdir(exist_ok=True) + + timestamp = time.strftime("%Y%m%d_%H%M%S") + overall_start = time.perf_counter() + + print(f"Running {len(BENCHMARKS)} benchmark suites") + print(f"Results will be saved to: {results_dir}") + print(f"Timestamp: {timestamp}") + print("=" * 80) + + for bench_file in BENCHMARKS: + bench_path = bench_dir / bench_file + if not bench_path.exists(): + print(f"SKIP: {bench_file} (not found)") + continue + + result_file = results_dir / f"{bench_path.stem}_{timestamp}.txt" + print(f"\nRunning: {bench_file}") + + start = time.perf_counter() + result = subprocess.run( + [sys.executable, str(bench_path)], + capture_output=True, + text=True, + timeout=600, + ) + elapsed = time.perf_counter() - start + + 
output = result.stdout + if result.returncode != 0: + output += f"\nSTDERR:\n{result.stderr}" + print(f" FAILED (exit code {result.returncode}, {elapsed:.1f}s)") + else: + print(f" OK ({elapsed:.1f}s)") + + with result_file.open("w") as f: + f.write(f"Benchmark: {bench_file}\n") + f.write(f"Timestamp: {timestamp}\n") + f.write(f"Exit code: {result.returncode}\n") + f.write(f"Duration: {elapsed:.1f}s\n") + f.write(f"Python: {sys.version}\n") + f.write("=" * 80 + "\n") + f.write(output) + + total = time.perf_counter() - overall_start + print(f"\n{'='*80}") + print(f"All benchmarks completed in {total:.1f}s") + print(f"Results saved in: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/build_binary.py b/build_binary.py index b82c48c6e..fd1884a0c 100644 --- a/build_binary.py +++ b/build_binary.py @@ -70,6 +70,9 @@ def build_binary() -> bool: for pkg in cs.PYINSTALLER_PACKAGES: cmd.extend(_build_package_args(pkg)) + for mod in cs.PYINSTALLER_EXCLUDED_MODULES: + cmd.extend([cs.PYINSTALLER_ARG_EXCLUDE_MODULE, mod]) + cmd.append(cs.PYINSTALLER_ENTRY_POINT) logger.info(logs.BUILD_BINARY.format(name=binary_name)) diff --git a/cgr/__init__.py b/cgr/__init__.py new file mode 100644 index 000000000..3d76ac771 --- /dev/null +++ b/cgr/__init__.py @@ -0,0 +1,14 @@ +from codebase_rag.config import settings +from codebase_rag.embedder import embed_code +from codebase_rag.graph_loader import GraphLoader, load_graph +from codebase_rag.services.graph_service import MemgraphIngestor +from codebase_rag.services.llm import CypherGenerator + +__all__ = [ + "CypherGenerator", + "GraphLoader", + "MemgraphIngestor", + "embed_code", + "load_graph", + "settings", +] diff --git a/codebase_rag/cgr_state.py b/codebase_rag/cgr_state.py new file mode 100644 index 000000000..703672a64 --- /dev/null +++ b/codebase_rag/cgr_state.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path +from typing import 
TypedDict + +from loguru import logger + +from .config import settings + +STATE_FILENAME = "state.json" + + +class _StateShape(TypedDict, total=False): + last_sync: dict[str, str] + + +def state_path(home: Path | None = None) -> Path: + base = (home or settings.CGR_HOME).expanduser() + return base / STATE_FILENAME + + +def _load(path: Path) -> _StateShape: + if not path.exists(): + return _StateShape() + try: + with path.open(encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + return _StateShape(last_sync=data.get("last_sync", {})) + except (OSError, json.JSONDecodeError) as e: + logger.warning(f"Failed to load cgr state from {path}: {e}") + return _StateShape() + + +def _save(path: Path, data: _StateShape) -> None: + try: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + json.dump(data, f, indent=2) + except OSError as e: + logger.warning(f"Failed to save cgr state to {path}: {e}") + + +def record_sync(project_name: str, home: Path | None = None) -> None: + path = state_path(home) + state = _load(path) + last_sync = state.get("last_sync", {}) + last_sync[project_name] = datetime.now(UTC).isoformat() + state["last_sync"] = last_sync + _save(path, state) + + +def read_sync_timestamps(home: Path | None = None) -> dict[str, str]: + state = _load(state_path(home)) + return dict(state.get("last_sync", {})) diff --git a/codebase_rag/cli.py b/codebase_rag/cli.py index 87f9a5379..07b72ad88 100644 --- a/codebase_rag/cli.py +++ b/codebase_rag/cli.py @@ -1,39 +1,69 @@ import asyncio +import json +import time +from collections.abc import Callable +from functools import partial +from importlib.metadata import version as get_version from pathlib import Path import typer from loguru import logger +from rich.console import Console from rich.panel import Panel from rich.table import Table +from . import cgr_state from . import cli_help as ch from . import constants as cs from . 
import logs as ls from .config import load_cgrignore_patterns, settings from .graph_updater import GraphUpdater from .main import ( + _create_configuration_table, app_context, connect_memgraph, export_graph_to_file, main_async, main_optimize_async, + main_single_query, prompt_for_unignored_directories, style, update_model_settings, ) from .parser_loader import load_parsers +from .services.graph_service import MemgraphIngestor from .services.protobuf_service import ProtobufFileIngestor +from .stack import StackManager +from .stack.cli import cli as daemon_cli +from .stack.constants import StackState +from .stack.manager import StackError from .tools.health_checker import HealthChecker from .tools.language import cli as language_cli +from .types_defs import DeadCodeRow, PropertyValue, ResultRow +from .utils.path_utils import derive_project_name, resolve_repo_path +from .vector_store import delete_project_embeddings +from .workspaces import WorkspaceConfig, WorkspaceError, load_workspace +from .workspaces.cli import cli as workspace_cli app = typer.Typer( - name="code-graph-rag", + name=cs.PACKAGE_NAME, help=ch.APP_DESCRIPTION, no_args_is_help=True, add_completion=False, ) +def _version_callback(value: bool) -> None: + if value: + app_context.console.print( + cs.CLI_MSG_VERSION.format( + package=cs.PACKAGE_NAME, version=get_version(cs.PACKAGE_NAME) + ), + highlight=False, + ) + raise typer.Exit() + + def validate_models_early() -> None: try: orchestrator_config = settings.active_orchestrator_config @@ -58,6 +88,14 @@ def _update_and_validate_models(orchestrator: str | None, cypher: str | None) -> @app.callback() def _global_options( + version: bool | None = typer.Option( + None, + "--version", + "-v", + help=ch.HELP_VERSION, + callback=_version_callback, + is_eager=True, + ), quiet: bool = typer.Option( False, "--quiet", @@ -77,6 +115,184 @@ def _info(msg: str) -> None: app_context.console.print(msg) +def _load_workspace_or_exit(workspace: str | None) -> 
WorkspaceConfig | None: + if workspace is None: + return None + try: + return load_workspace(workspace) + except WorkspaceError as e: + app_context.console.print(style(str(e), cs.Color.RED)) + raise typer.Exit(1) from e + + +def _sync_workspace( + config: WorkspaceConfig, + batch_size: int, + exclude: list[str] | None, +) -> None: + total = len(config.repos) + if total == 0: + _info( + style(cs.CLI_MSG_WORKSPACE_EMPTY.format(name=config.name), cs.Color.YELLOW) + ) + return + _info( + style( + cs.CLI_MSG_WORKSPACE_SYNCING.format(name=config.name, count=total), + cs.Color.CYAN, + ) + ) + for idx, repo in enumerate(config.repos, start=1): + repo_path = repo.repo_path() + _info( + style( + cs.CLI_MSG_WORKSPACE_SYNC_REPO.format( + idx=idx, + total=total, + path=repo_path, + project_name=repo.project_name, + ), + cs.Color.CYAN, + ) + ) + _run_graph_sync( + repo=repo_path, + project_name=repo.project_name, + batch_size=batch_size, + exclude=exclude, + interactive_setup=False, + ) + + +def _resolve_active_projects(projects: str | None, default_project: str) -> list[str]: + if projects: + parsed = [p.strip() for p in projects.split(",") if p.strip()] + if parsed: + return parsed + return [default_project] + + +def _maybe_start_stack() -> None: + mgr = StackManager() + if mgr.status().state == StackState.RUNNING: + return + try: + mgr.ensure_running() + except StackError as e: + app_context.console.print(style(str(e), cs.Color.RED)) + raise typer.Exit(1) from e + + +def _run_graph_sync( + repo: Path, + project_name: str, + batch_size: int, + exclude: list[str] | None, + interactive_setup: bool, + clean: bool = False, + output: str | None = None, +) -> None: + cgrignore = load_cgrignore_patterns(repo) + cli_excludes = frozenset(exclude) if exclude else frozenset() + exclude_paths = cli_excludes | cgrignore.exclude or None + unignore_paths: frozenset[str] | None + if interactive_setup: + unignore_paths = prompt_for_unignored_directories(repo, exclude) + else: + unignore_paths 
= cgrignore.unignore or None + + elapsed = time.monotonic() + with connect_memgraph(batch_size) as ingestor: + if clean: + _info(style(cs.CLI_MSG_CLEANING_DB, cs.Color.YELLOW)) + ingestor.clean_database() + _delete_hash_cache(repo) + + ingestor.ensure_constraints() + + parsers, queries = load_parsers() + + updater = GraphUpdater( + ingestor=ingestor, + repo_path=repo, + parsers=parsers, + queries=queries, + unignore_paths=unignore_paths, + exclude_paths=exclude_paths, + project_name=project_name, + ) + updater.run() + cgr_state.record_sync(project_name) + + if output: + _info(style(cs.CLI_MSG_EXPORTING_TO.format(path=output), cs.Color.CYAN)) + if not export_graph_to_file(ingestor, output): + raise typer.Exit(1) + elapsed = time.monotonic() - elapsed + if updater.skipped_because_in_sync: + app_context.console.print( + style( + cs.CLI_MSG_SYNC_SKIPPED.format(project=project_name, elapsed=elapsed), + cs.Color.CYAN, + cs.StyleModifier.DIM, + ) + ) + else: + app_context.console.print( + style( + cs.CLI_MSG_SYNC_DONE.format(project=project_name, elapsed=elapsed), + cs.Color.CYAN, + cs.StyleModifier.NONE, + ) + ) + + +def _delete_hash_cache(repo_path: Path) -> None: + cache_path = repo_path / cs.HASH_CACHE_FILENAME + if cache_path.exists(): + _info( + style( + cs.CLI_MSG_CLEANING_HASH_CACHE.format(path=cache_path), + cs.Color.YELLOW, + ) + ) + cache_path.unlink(missing_ok=True) + dir_mtimes_path = repo_path / cs.DIR_MTIMES_FILENAME + dir_mtimes_path.unlink(missing_ok=True) + + +def _resolve_and_validate_repo(repo_path: str | None) -> Path: + resolved = resolve_repo_path(repo_path, settings.TARGET_REPO_PATH) + if not resolved.exists(): + app_context.console.print( + style(cs.CLI_ERR_PATH_NOT_EXISTS.format(path=resolved), cs.Color.RED) + ) + raise typer.Exit(1) + if not resolved.is_dir(): + app_context.console.print( + style(cs.CLI_ERR_PATH_NOT_DIR.format(path=resolved), cs.Color.RED) + ) + raise typer.Exit(1) + if not (resolved / cs.GIT_DIR_NAME).exists(): + 
app_context.console.print( + style(cs.CLI_WARN_NOT_GIT_REPO.format(path=resolved), cs.Color.YELLOW) + ) + return resolved + + +def _cleanup_project_embeddings(ingestor: MemgraphIngestor, project_name: str) -> None: + rows = ingestor.fetch_all( + cs.CYPHER_QUERY_PROJECT_NODE_IDS, + {cs.KEY_PROJECT_NAME: project_name}, + ) + node_ids: list[int] = [] + for row in rows: + node_id = row.get(cs.KEY_NODE_ID) + if isinstance(node_id, int): + node_ids.append(node_id) + delete_project_embeddings(project_name, node_ids) + + @app.command(help=ch.CMD_START) def start( repo_path: str | None = typer.Option( @@ -113,12 +329,22 @@ def start( "--no-confirm", help=ch.HELP_NO_CONFIRM, ), + no_instructions: bool = typer.Option( + False, + "--no-instructions", + help=ch.HELP_NO_INSTRUCTIONS, + ), batch_size: int | None = typer.Option( None, "--batch-size", min=1, help=ch.HELP_BATCH_SIZE, ), + project_name: str | None = typer.Option( + None, + "--project-name", + help=ch.HELP_PROJECT_NAME, + ), exclude: list[str] | None = typer.Option( None, "--exclude", @@ -129,10 +355,50 @@ def start( "--interactive-setup", help=ch.HELP_INTERACTIVE_SETUP, ), + ask_agent: str | None = typer.Option( + None, + "-a", + "--ask-agent", + help=ch.HELP_ASK_AGENT, + ), + output_format: cs.QueryFormat = typer.Option( + cs.QueryFormat.TABLE, + "--output-format", + help=ch.HELP_QUERY_OUTPUT_FORMAT, + ), + no_start_stack: bool = typer.Option( + False, + "--no-start-stack", + help=ch.HELP_NO_START_STACK, + ), + no_sync: bool = typer.Option( + False, + "--no-sync", + help=ch.HELP_NO_SYNC, + ), + projects: str | None = typer.Option( + None, + "--projects", + help=ch.HELP_PROJECTS, + ), + workspace: str | None = typer.Option( + None, + "--workspace", + help=ch.HELP_WORKSPACE, + ), ) -> None: app_context.session.confirm_edits = not no_confirm + app_context.session.load_cgr_instructions = not no_instructions + + if output_format == cs.QueryFormat.JSON and not ask_agent: + app_context.console.print( + 
style(cs.CLI_ERR_JSON_REQUIRES_ASK_AGENT, cs.Color.RED) + ) + raise typer.Exit(1) - target_repo_path = repo_path or settings.TARGET_REPO_PATH + resolved_repo = _resolve_and_validate_repo(repo_path) + target_repo_path = str(resolved_repo) + resolved_project_name = project_name or derive_project_name(resolved_repo) if output and not update_graph: app_context.console.print( @@ -140,54 +406,95 @@ def start( ) raise typer.Exit(1) - _update_and_validate_models(orchestrator, cypher) + if not no_start_stack: + _maybe_start_stack() effective_batch_size = settings.resolve_batch_size(batch_size) + if clean and not update_graph: + repo_to_clean = Path(target_repo_path) + with connect_memgraph(effective_batch_size) as ingestor: + _info(style(cs.CLI_MSG_CLEANING_DB, cs.Color.YELLOW)) + ingestor.clean_database() + + _delete_hash_cache(repo_to_clean) + _info(style(cs.CLI_MSG_CLEAN_DONE, cs.Color.GREEN)) + return + + _update_and_validate_models(orchestrator, cypher) + + if not ask_agent and not update_graph: + app_context.console.print(_create_configuration_table(target_repo_path)) + if update_graph: - repo_to_update = Path(target_repo_path) _info( - style(cs.CLI_MSG_UPDATING_GRAPH.format(path=repo_to_update), cs.Color.GREEN) + style(cs.CLI_MSG_UPDATING_GRAPH.format(path=resolved_repo), cs.Color.GREEN) ) - - cgrignore = load_cgrignore_patterns(repo_to_update) - cli_excludes = frozenset(exclude) if exclude else frozenset() - exclude_paths = cli_excludes | cgrignore.exclude or None - unignore_paths: frozenset[str] | None = None - if interactive_setup: - unignore_paths = prompt_for_unignored_directories(repo_to_update, exclude) - else: + if not interactive_setup: _info(style(cs.CLI_MSG_AUTO_EXCLUDE, cs.Color.YELLOW)) - unignore_paths = cgrignore.unignore or None + _run_graph_sync( + repo=resolved_repo, + project_name=resolved_project_name, + batch_size=effective_batch_size, + exclude=exclude, + interactive_setup=interactive_setup, + clean=clean, + output=output, + ) + 
_info(style(cs.CLI_MSG_GRAPH_UPDATED, cs.Color.GREEN)) + return - with connect_memgraph(effective_batch_size) as ingestor: - if clean: - _info(style(cs.CLI_MSG_CLEANING_DB, cs.Color.YELLOW)) - ingestor.clean_database() - ingestor.ensure_constraints() - - parsers, queries = load_parsers() - - updater = GraphUpdater( - ingestor, - repo_to_update, - parsers, - queries, - unignore_paths, - exclude_paths, - ) - updater.run() + workspace_config = _load_workspace_or_exit(workspace) - if output: - _info(style(cs.CLI_MSG_EXPORTING_TO.format(path=output), cs.Color.CYAN)) - if not export_graph_to_file(ingestor, output): - raise typer.Exit(1) + sync_task: Callable[[], None] | None = None + sync_message = cs.MSG_SYNCING_KNOWLEDGE_GRAPH + if not no_sync: + if workspace_config is not None: + sync_task = partial( + _sync_workspace, workspace_config, effective_batch_size, exclude + ) + sync_message = cs.MSG_SYNCING_WORKSPACE.format( + name=workspace_config.name, count=len(workspace_config.repos) + ) + else: + sync_task = partial( + _run_graph_sync, + repo=resolved_repo, + project_name=resolved_project_name, + batch_size=effective_batch_size, + exclude=exclude, + interactive_setup=interactive_setup, + ) - _info(style(cs.CLI_MSG_GRAPH_UPDATED, cs.Color.GREEN)) - return + if workspace_config is not None: + active_projects = workspace_config.project_names() + if projects: + active_projects = _resolve_active_projects(projects, active_projects[0]) + else: + active_projects = _resolve_active_projects(projects, resolved_project_name) try: - asyncio.run(main_async(target_repo_path, effective_batch_size)) + if ask_agent: + if sync_task is not None: + sync_task() + main_single_query( + target_repo_path, + effective_batch_size, + ask_agent, + active_projects=active_projects, + output_format=output_format, + ) + else: + asyncio.run( + main_async( + target_repo_path, + effective_batch_size, + active_projects=active_projects, + show_config_table=False, + pre_chat_sync=sync_task, + 
pre_chat_sync_message=sync_message, + ) + ) except KeyboardInterrupt: app_context.console.print(style(cs.CLI_MSG_APP_TERMINATED, cs.Color.RED)) except ValueError as e: @@ -223,8 +530,7 @@ def index( help=ch.HELP_INTERACTIVE_SETUP, ), ) -> None: - target_repo_path = repo_path or settings.TARGET_REPO_PATH - repo_to_index = Path(target_repo_path) + repo_to_index = _resolve_and_validate_repo(repo_path) _info(style(cs.CLI_MSG_INDEXING_AT.format(path=repo_to_index), cs.Color.GREEN)) _info(style(cs.CLI_MSG_OUTPUT_TO.format(path=output_proto_dir), cs.Color.CYAN)) @@ -245,7 +551,12 @@ def index( ) parsers, queries = load_parsers() updater = GraphUpdater( - ingestor, repo_to_index, parsers, queries, unignore_paths, exclude_paths + ingestor=ingestor, + repo_path=repo_to_index, + parsers=parsers, + queries=queries, + unignore_paths=unignore_paths, + exclude_paths=exclude_paths, ) updater.run() @@ -324,6 +635,11 @@ def optimize( "--no-confirm", help=ch.HELP_NO_CONFIRM, ), + no_instructions: bool = typer.Option( + False, + "--no-instructions", + help=ch.HELP_NO_INSTRUCTIONS, + ), batch_size: int | None = typer.Option( None, "--batch-size", @@ -332,8 +648,9 @@ def optimize( ), ) -> None: app_context.session.confirm_edits = not no_confirm + app_context.session.load_cgr_instructions = not no_instructions - target_repo_path = repo_path or settings.TARGET_REPO_PATH + target_repo_path = str(_resolve_and_validate_repo(repo_path)) _update_and_validate_models(orchestrator, cypher) @@ -357,11 +674,24 @@ def optimize( @app.command(name=ch.CLICommandName.MCP_SERVER, help=ch.CMD_MCP_SERVER) -def mcp_server() -> None: +def mcp_server( + transport: cs.MCPTransport = typer.Option( + cs.MCPTransport.STDIO, help=ch.HELP_MCP_TRANSPORT + ), + host: str = typer.Option(None, help=ch.HELP_MCP_HTTP_HOST), + port: int = typer.Option(None, help=ch.HELP_MCP_HTTP_PORT), +) -> None: try: - from codebase_rag.mcp import main as mcp_main + if transport == cs.MCPTransport.HTTP: + from codebase_rag.mcp import 
serve_http + + resolved_host = host or settings.MCP_HTTP_HOST + resolved_port = port or settings.MCP_HTTP_PORT + asyncio.run(serve_http(host=resolved_host, port=resolved_port)) + else: + from codebase_rag.mcp import serve_stdio - asyncio.run(mcp_main()) + asyncio.run(serve_stdio()) except KeyboardInterrupt: app_context.console.print(style(cs.CLI_MSG_APP_TERMINATED, cs.Color.RED)) except ValueError as e: @@ -369,7 +699,6 @@ def mcp_server() -> None: style(cs.CLI_ERR_CONFIG.format(error=e), cs.Color.RED) ) _info(style(cs.CLI_MSG_HINT_TARGET_REPO, cs.Color.YELLOW)) - except Exception as e: app_context.console.print( style(cs.CLI_ERR_MCP_SERVER.format(error=e), cs.Color.RED) @@ -417,6 +746,53 @@ def language_command(ctx: typer.Context) -> None: language_cli(ctx.args, standalone_mode=False) +@app.command( + name=ch.CLICommandName.DAEMON, + help=ch.CMD_DAEMON, + context_settings={"allow_extra_args": True, "allow_interspersed_args": False}, +) +def daemon_command(ctx: typer.Context) -> None: + daemon_cli(ctx.args, standalone_mode=False) + + +@app.command( + name=ch.CLICommandName.WORKSPACE, + help=ch.CMD_WORKSPACE, + context_settings={"allow_extra_args": True, "allow_interspersed_args": False}, +) +def workspace_command(ctx: typer.Context) -> None: + workspace_cli(ctx.args, standalone_mode=False) + + +@app.command(name=ch.CLICommandName.STOP, help=ch.CMD_STOP) +def stop_command() -> None: + mgr = StackManager() + try: + mgr.down() + except StackError as e: + app_context.console.print(style(str(e), cs.Color.RED)) + raise typer.Exit(1) from e + _info(style("stack stopped", cs.Color.GREEN)) + + +@app.command(name=ch.CLICommandName.STATUS, help=ch.CMD_STATUS) +def status_command() -> None: + status = StackManager().status() + app_context.console.print( + f"stack: {status.state.value} " + f"(memgraph={status.memgraph_endpoint} reachable={status.memgraph_reachable}, " + f"qdrant={status.qdrant_endpoint} reachable={status.qdrant_reachable})" + ) + 
app_context.console.print(f"compose: {status.compose_file}") + timestamps = cgr_state.read_sync_timestamps() + if not timestamps: + app_context.console.print("syncs: (no projects synced via cgr yet)") + return + app_context.console.print("syncs:") + for project, ts in sorted(timestamps.items()): + app_context.console.print(f" - {project}: last sync {ts}") + + @app.command(name=ch.CLICommandName.DOCTOR, help=ch.CMD_DOCTOR) def doctor() -> None: checker = HealthChecker() @@ -465,5 +841,324 @@ def doctor() -> None: raise typer.Exit(1) +def _build_stats_table( + title: str, + col_label: str, + rows: list[ResultRow], + get_label: Callable[[ResultRow], str], + total_label: str, +) -> Table: + table = Table( + title=style(title, cs.Color.GREEN), + show_header=True, + header_style=f"{cs.StyleModifier.BOLD} {cs.Color.MAGENTA}", + ) + table.add_column(col_label, style=cs.Color.CYAN) + table.add_column(cs.CLI_STATS_COL_COUNT, style=cs.Color.YELLOW, justify="right") + total = 0 + for row in rows: + raw_count = row.get("count", 0) + count = int(raw_count) if isinstance(raw_count, int | float) else 0 + total += count + table.add_row(get_label(row), f"{count:,}") + table.add_section() + table.add_row( + style(total_label, cs.Color.GREEN), + style(f"{total:,}", cs.Color.GREEN), + ) + return table + + +@app.command(name=ch.CLICommandName.STATS, help=ch.CMD_STATS) +def stats() -> None: + from .cypher_queries import ( + CYPHER_STATS_NODE_COUNTS, + CYPHER_STATS_RELATIONSHIP_COUNTS, + ) + + app_context.console.print(style(cs.CLI_MSG_CONNECTING_STATS, cs.Color.CYAN)) + + try: + with connect_memgraph(batch_size=1) as ingestor: + node_results = ingestor.fetch_all(CYPHER_STATS_NODE_COUNTS) + rel_results = ingestor.fetch_all(CYPHER_STATS_RELATIONSHIP_COUNTS) + + app_context.console.print( + _build_stats_table( + cs.CLI_STATS_NODE_TITLE, + cs.CLI_STATS_COL_NODE_TYPE, + node_results, + lambda r: ":".join(r.get("labels", [])) or cs.CLI_STATS_UNKNOWN, + cs.CLI_STATS_TOTAL_NODES, + ) + ) + 
app_context.console.print() + app_context.console.print( + _build_stats_table( + cs.CLI_STATS_REL_TITLE, + cs.CLI_STATS_COL_REL_TYPE, + rel_results, + lambda r: str(r.get("type", cs.CLI_STATS_UNKNOWN)), + cs.CLI_STATS_TOTAL_RELS, + ) + ) + + except Exception as e: + app_context.console.print( + style(cs.CLI_ERR_STATS_FAILED.format(error=e), cs.Color.RED) + ) + logger.exception(ls.STATS_ERROR.format(error=e)) + raise typer.Exit(1) from e + + +def _resolve_dead_code_project( + project_name: str | None, projects: list[str] +) -> str | None: + if project_name: + return project_name.strip() + if len(projects) == 1: + return projects[0] + return None + + +def _dead_code_params( + project_name: str, + entry_points: list[str], + decorator_roots: list[str], +) -> dict[str, PropertyValue]: + root_decorators = sorted( + {d.lower() for d in cs.DEFAULT_ROOT_DECORATORS} + | {d.lower() for d in decorator_roots} + ) + # (H) test_patterns is always passed: with tests included it makes test + # (H) functions roots; with tests excluded it filters test modules out of the + # (H) module-load root clause so test-only code is not kept alive. 
+ return { + "project_prefix": f"{project_name}{cs.SEPARATOR_DOT}", + "root_decorators": root_decorators, + "entry_points": list(entry_points), + "test_patterns": list(cs.TEST_PATH_PATTERNS), + } + + +def _to_dead_code_row(row: ResultRow) -> DeadCodeRow: + start = row.get(cs.KEY_START_LINE, 0) + end = row.get(cs.KEY_END_LINE, 0) + return DeadCodeRow( + label=str(row.get(cs.KEY_LABEL, "")), + name=str(row.get(cs.KEY_NAME, "")), + qualified_name=str(row.get(cs.KEY_QUALIFIED_NAME, "")), + start_line=int(start) if isinstance(start, int | float) else 0, + end_line=int(end) if isinstance(end, int | float) else 0, + ) + + +def _build_dead_code_table(candidates: list[DeadCodeRow], project_name: str) -> Table: + table = Table( + title=style( + cs.CLI_DEADCODE_TABLE_TITLE.format(project_name=project_name), + cs.Color.GREEN, + ), + show_header=True, + header_style=f"{cs.StyleModifier.BOLD} {cs.Color.MAGENTA}", + ) + table.add_column(cs.CLI_DEADCODE_COL_KIND, style=cs.Color.MAGENTA) + table.add_column(cs.CLI_DEADCODE_COL_QUALIFIED_NAME, style=cs.Color.CYAN) + table.add_column(cs.CLI_DEADCODE_COL_LINES, style=cs.Color.YELLOW, justify="right") + for row in candidates: + table.add_row( + row["label"], + row["qualified_name"], + cs.CLI_DEADCODE_LINE_RANGE.format( + start=row["start_line"], end=row["end_line"] + ), + ) + return table + + +def _emit_dead_code( + candidates: list[DeadCodeRow], + output_format: cs.DeadCodeFormat, + output: Path | None, + project_name: str, +) -> None: + if output_format == cs.DeadCodeFormat.JSON: + payload = json.dumps(candidates, indent=2) + if output is not None: + output.write_text(payload, encoding=cs.ENCODING_UTF8) + app_context.console.print( + style( + cs.CLI_DEADCODE_WRITTEN.format(count=len(candidates), path=output), + cs.Color.GREEN, + ) + ) + return + typer.echo(payload) + return + + table = _build_dead_code_table(candidates, project_name) + if output is not None: + with output.open("w", encoding=cs.ENCODING_UTF8) as fh: + 
Console(file=fh).print(table) + app_context.console.print( + style( + cs.CLI_DEADCODE_WRITTEN.format(count=len(candidates), path=output), + cs.Color.GREEN, + ) + ) + return + + if not candidates: + app_context.console.print(style(cs.CLI_DEADCODE_NONE, cs.Color.GREEN)) + return + app_context.console.print(table) + app_context.console.print( + style(cs.CLI_DEADCODE_SUMMARY.format(count=len(candidates)), cs.Color.GREEN) + ) + + +@app.command(name=ch.CLICommandName.DEAD_CODE, help=ch.CMD_DEAD_CODE) +def dead_code( + project_name: str | None = typer.Option( + None, "--project-name", "-n", help=ch.HELP_DEADCODE_PROJECT_NAME + ), + entry_point: list[str] = typer.Option( + [], "--entry-point", "-e", help=ch.HELP_DEADCODE_ENTRY_POINT + ), + decorator_root: list[str] = typer.Option( + [], "--decorator-root", help=ch.HELP_DEADCODE_DECORATOR_ROOT + ), + include_tests: bool = typer.Option( + True, + "--include-tests/--no-include-tests", + help=ch.HELP_DEADCODE_INCLUDE_TESTS, + ), + include_classes: bool = typer.Option( + False, + "--classes/--no-classes", + help=ch.HELP_DEADCODE_CLASSES, + ), + output_format: cs.DeadCodeFormat = typer.Option( + cs.DeadCodeFormat.TABLE, "--format", help=ch.HELP_DEADCODE_FORMAT + ), + output: Path | None = typer.Option( + None, "--output", "-o", help=ch.HELP_DEADCODE_OUTPUT + ), + fail_on_found: bool = typer.Option( + False, "--fail-on-found", help=ch.HELP_DEADCODE_FAIL_ON_FOUND + ), +) -> None: + from .cypher_queries import build_dead_code_query + + show_progress = output_format == cs.DeadCodeFormat.TABLE and output is None + if show_progress: + app_context.console.print(style(cs.CLI_DEADCODE_CONNECTING, cs.Color.CYAN)) + + projects: list[str] = [] + resolved: str | None = None + rows: list[ResultRow] = [] + try: + with connect_memgraph(batch_size=1) as ingestor: + projects = ingestor.list_projects() + resolved = _resolve_dead_code_project(project_name, projects) + if resolved is not None: + 
logger.info(ls.DEADCODE_SCANNING.format(project_name=resolved)) + rows = ingestor.fetch_all( + build_dead_code_query(include_tests, include_classes), + _dead_code_params(resolved, entry_point, decorator_root), + ) + except Exception as e: + app_context.console.print( + style(cs.CLI_ERR_DEADCODE_FAILED.format(error=e), cs.Color.RED) + ) + logger.exception(ls.DEADCODE_ERROR.format(error=e)) + raise typer.Exit(1) from e + + if resolved is None: + message = ( + cs.CLI_ERR_DEADCODE_NO_PROJECTS + if not projects + else cs.CLI_ERR_DEADCODE_AMBIGUOUS_PROJECT.format(projects=projects) + ) + app_context.console.print(style(message, cs.Color.RED)) + raise typer.Exit(1) + + candidates = [_to_dead_code_row(row) for row in rows] + _emit_dead_code(candidates, output_format, output, resolved) + + if fail_on_found and candidates: + raise typer.Exit(1) + + +@app.command(name=ch.CLICommandName.DELETE_PROJECT, help=ch.CMD_DELETE_PROJECT) +def delete_project( + name: str = typer.Option( + ..., + "--name", + "-n", + help=ch.HELP_DELETE_PROJECT_NAME, + ), + repo_path: str | None = typer.Option( + None, + "--repo-path", + help=ch.HELP_DELETE_PROJECT_REPO_PATH, + ), +) -> None: + project_name = name.strip() + if not project_name: + app_context.console.print(style(cs.CLI_ERR_PROJECT_NAME_REQUIRED, cs.Color.RED)) + raise typer.Exit(1) + + effective_batch_size = settings.resolve_batch_size(None) + + try: + with connect_memgraph(effective_batch_size) as ingestor: + projects = ingestor.list_projects() + if project_name not in projects: + app_context.console.print( + style( + cs.CLI_ERR_PROJECT_NOT_FOUND.format( + project_name=project_name, projects=projects + ), + cs.Color.RED, + ) + ) + raise typer.Exit(1) + + _info( + style( + cs.CLI_MSG_DELETING_PROJECT.format(project_name=project_name), + cs.Color.YELLOW, + ) + ) + _cleanup_project_embeddings(ingestor, project_name) + ingestor.delete_project(project_name) + except typer.Exit: + raise + except Exception as e: + app_context.console.print( + 
style( + cs.CLI_ERR_DELETE_PROJECT_FAILED.format( + project_name=project_name, error=e + ), + cs.Color.RED, + ) + ) + logger.exception( + cs.CLI_ERR_DELETE_PROJECT_FAILED.format(project_name=project_name, error=e) + ) + raise typer.Exit(1) from e + + if repo_path: + _delete_hash_cache(Path(repo_path)) + + _info( + style( + cs.CLI_MSG_PROJECT_DELETED.format(project_name=project_name), + cs.Color.GREEN, + ) + ) + + if __name__ == "__main__": app() diff --git a/codebase_rag/cli_help.py b/codebase_rag/cli_help.py index 96e816d9a..5d6114e27 100644 --- a/codebase_rag/cli_help.py +++ b/codebase_rag/cli_help.py @@ -10,6 +10,13 @@ class CLICommandName(StrEnum): GRAPH_LOADER = "graph-loader" LANGUAGE = "language" DOCTOR = "doctor" + STATS = "stats" + DEAD_CODE = "dead-code" + DELETE_PROJECT = "delete-project" + DAEMON = "daemon" + WORKSPACE = "workspace" + STOP = "stop" + STATUS = "status" APP_DESCRIPTION = ( @@ -26,6 +33,12 @@ class CLICommandName(StrEnum): CMD_GRAPH_LOADER = "Load and display summary of exported graph JSON" CMD_LANGUAGE = "Manage language grammars (add, remove, list)" CMD_DOCTOR = "Verify that all dependencies and configurations are properly set up" +CMD_STATS = "Display node and relationship statistics for the indexed graph" +CMD_DEAD_CODE = ( + "Report functions/methods that are unreachable from any entry point " + "(candidates for review, not a guaranteed delete list)" +) +CMD_DELETE_PROJECT = "Delete a single project from the shared graph database (keeps other projects intact)" CMD_LANGUAGE_GROUP = "CLI for managing language grammars" CMD_LANGUAGE_ADD = "Add a new language grammar to the project." @@ -33,23 +46,86 @@ class CLICommandName(StrEnum): CMD_LANGUAGE_REMOVE = "Remove a language from the project." CMD_LANGUAGE_CLEANUP = "Clean up orphaned git modules that weren't properly removed." 
+CMD_DAEMON = "Manage the shared cgr docker stack (memgraph + qdrant)" +CMD_DAEMON_GROUP = "Manage the shared cgr docker stack (memgraph + qdrant)" +CMD_DAEMON_UP = "Start the docker stack and wait until healthy." +CMD_DAEMON_DOWN = "Stop the docker stack (preserves data volumes)." +CMD_DAEMON_STATUS = "Show whether memgraph and qdrant are reachable." +CMD_DAEMON_LOGS = "Tail docker compose logs for the stack." +CMD_DAEMON_RESTART = "Restart the docker stack." + +CMD_WORKSPACE = "Manage cgr workspaces (named bundles of repos)" +CMD_WORKSPACE_GROUP = "Manage cgr workspaces (named bundles of repos)" +CMD_WORKSPACE_LIST = "List all workspaces." +CMD_WORKSPACE_CREATE = "Create a new empty workspace." +CMD_WORKSPACE_DELETE = "Delete a workspace TOML (does not touch indexed graph data)." +CMD_WORKSPACE_SHOW = "Show a workspace's repos and project names." +CMD_WORKSPACE_ADD_REPO = "Add a repo to a workspace." +CMD_WORKSPACE_REMOVE_REPO = "Remove a repo from a workspace by path." + +HELP_WORKSPACE_DESCRIPTION = "Optional human-readable description." +HELP_WORKSPACE_FORCE = "Overwrite an existing workspace with the same name." +HELP_WORKSPACE_REPO_PROJECT_NAME = ( + "Project name to associate with this repo (defaults to derive_project_name(repo))." +) + +MSG_NO_WORKSPACES = "(no workspaces; create one with 'cgr workspace create ')" + +CMD_STOP = "Alias for `cgr daemon down`: stop the shared docker stack." +CMD_STATUS = "Show daemon stack state plus last-sync timestamp per project." + +HELP_DAEMON_LOGS_FOLLOW = "Stream logs continuously (Ctrl+C to stop)." +HELP_DAEMON_LOGS_SERVICE = ( + "Limit logs to a specific service (memgraph, qdrant, lab). Default: all." +) +HELP_NO_START_STACK = ( + "Skip auto-starting the docker stack. Useful when memgraph/qdrant run elsewhere." +) +HELP_NO_SYNC = ( + "Skip the automatic incremental graph sync that runs before the agent starts." +) +HELP_PROJECTS = ( + "Comma-separated list of project names to scope agent queries to. 
" + "Overrides --project-name. If omitted, defaults to the current repo's project." +) +HELP_WORKSPACE = ( + "Open the agent over all projects defined in a cgr workspace TOML " + "(stored under ~/.cgr/workspaces/.toml)." +) + HELP_BATCH_SIZE = "Number of buffered nodes/relationships before flushing to Memgraph" HELP_MEMGRAPH_HOST = "Memgraph host" HELP_MEMGRAPH_PORT = "Memgraph port" HELP_ORCHESTRATOR = ( "Specify orchestrator as provider:model " - "(e.g., ollama:llama3.2, openai:gpt-4, google:gemini-2.5-pro)" + "(e.g., ollama:llama3.2, openai:gpt-4, google:gemini-3.1-pro-preview)" ) HELP_CYPHER_MODEL = ( "Specify cypher model as provider:model " - "(e.g., ollama:codellama, google:gemini-2.5-flash)" + "(e.g., ollama:codellama, google:gemini-3-flash-preview)" ) HELP_NO_CONFIRM = "Disable confirmation prompts for edit operations (YOLO mode)" +HELP_NO_INSTRUCTIONS = ( + "Skip loading project instructions from ~/.cgr.md and /.cgr.md " + "(useful when the consolidated memories are bloating the system prompt)" +) -HELP_REPO_PATH_RETRIEVAL = "Path to the target repository for code retrieval" -HELP_REPO_PATH_INDEX = "Path to the target repository to index." -HELP_REPO_PATH_OPTIMIZE = "Path to the repository to optimize" +HELP_REPO_PATH_RETRIEVAL = ( + "Path to the target repository for code retrieval (defaults to current directory)" +) +HELP_REPO_PATH_INDEX = ( + "Path to the target repository to index (defaults to current directory)." +) +HELP_REPO_PATH_OPTIMIZE = ( + "Path to the repository to optimize (defaults to current directory)" +) HELP_REPO_PATH_WATCH = "Path to the repository to watch." +HELP_VERSION = "Show the version and exit." + +HELP_DEBOUNCE = "Debounce delay in seconds. Set to 0 to disable debouncing." +HELP_MAX_WAIT = ( + "Maximum wait time in seconds before forcing an update during continuous edits." 
+) HELP_UPDATE_GRAPH = "Update the knowledge graph by parsing the repository" HELP_CLEAN_DB = "Clean the database before updating (use when adding first repo)" @@ -73,6 +149,10 @@ class CLICommandName(StrEnum): ) HELP_KEEP_SUBMODULE = "Keep the git submodule (default: remove it)" +HELP_PROJECT_NAME = ( + "Override the project name used as qualified-name prefix for all nodes. " + "Defaults to the repo directory name." +) HELP_EXCLUDE_PATTERNS = ( "Additional directories to exclude from indexing. Can be specified multiple times." ) @@ -81,6 +161,60 @@ class CLICommandName(StrEnum): "Without this flag, all directories matching ignore patterns are automatically excluded." ) +HELP_ASK_AGENT = ( + "Run a single query in non-interactive mode and exit. " + "Output is sent to stdout, useful for scripting." +) + +HELP_QUERY_OUTPUT_FORMAT = ( + "Output format for --ask-agent: 'table' (default) prints the plain answer; " + '\'json\' wraps it as {"query": ..., "response": ...} for scripting.' +) + +HELP_MCP_TRANSPORT = "Transport mode: 'stdio' (default) or 'http'" +HELP_MCP_HTTP_HOST = ( + "Host to bind the HTTP server — only used when --transport http (default: 0.0.0.0)" +) +HELP_MCP_HTTP_PORT = ( + "Port to bind the HTTP server — only used when --transport http (default: 8080)" +) + +HELP_DEADCODE_PROJECT_NAME = ( + "Project to scan (matches the Project node name). " + "If omitted, the sole indexed project is used." +) +HELP_DEADCODE_ENTRY_POINT = ( + "Treat functions/methods whose qualified name ends with this value as " + "reachable roots. Repeatable." +) +HELP_DEADCODE_DECORATOR_ROOT = ( + "Treat functions/methods carrying this decorator as reachable roots. " + "Extends the built-in set (route, task, fixture, command, ...). Repeatable." +) +HELP_DEADCODE_INCLUDE_TESTS = ( + "Treat test code as reachable roots so production code it exercises is " + "not reported. On by default." +) +HELP_DEADCODE_CLASSES = ( + "Also report unreachable classes. 
A class counts as used when it is " + "instantiated or subclassed by a reachable class, so a base whose only " + "subclass is itself unreachable is reported as part of the dead cluster. " + "Off by default: classes referenced only via type annotations, isinstance, " + "or dynamic lookups are not tracked and may be false positives." +) +HELP_DEADCODE_FORMAT = "Output format: 'table' (default) or 'json'." +HELP_DEADCODE_OUTPUT = "Write the report to this file instead of stdout." +HELP_DEADCODE_FAIL_ON_FOUND = ( + "Exit with code 1 when any candidate is found (useful in CI)." +) + +HELP_DELETE_PROJECT_NAME = ( + "Name of the project to delete (matches the Project node name in the graph)." +) +HELP_DELETE_PROJECT_REPO_PATH = ( + "Optional path to the project's repo. If supplied, its hash cache is removed too." +) + CLI_COMMANDS: dict[CLICommandName, str] = { CLICommandName.START: CMD_START, CLICommandName.INDEX: CMD_INDEX, @@ -90,4 +224,11 @@ class CLICommandName(StrEnum): CLICommandName.GRAPH_LOADER: CMD_GRAPH_LOADER, CLICommandName.LANGUAGE: CMD_LANGUAGE, CLICommandName.DOCTOR: CMD_DOCTOR, + CLICommandName.STATS: CMD_STATS, + CLICommandName.DEAD_CODE: CMD_DEAD_CODE, + CLICommandName.DELETE_PROJECT: CMD_DELETE_PROJECT, + CLICommandName.DAEMON: CMD_DAEMON, + CLICommandName.WORKSPACE: CMD_WORKSPACE, + CLICommandName.STOP: CMD_STOP, + CLICommandName.STATUS: CMD_STATUS, } diff --git a/codebase_rag/config.py b/codebase_rag/config.py index 31848e4d1..4c4a95857 100644 --- a/codebase_rag/config.py +++ b/codebase_rag/config.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from dataclasses import asdict, dataclass from pathlib import Path from typing import TypedDict, Unpack @@ -44,11 +45,6 @@ class ApiKeyInfoEntry(TypedDict): "url": "https://portal.azure.com/", "name": "Azure OpenAI", }, - cs.Provider.COHERE: { - "env_var": "COHERE_API_KEY", - "url": "https://dashboard.cohere.com/api-keys", - "name": "Cohere", - }, } @@ -94,6 +90,9 @@ def 
format_missing_api_key_errors( return error_msg +LOCAL_PROVIDERS = frozenset({cs.Provider.OLLAMA}) + + @dataclass class ModelConfig: provider: str @@ -113,8 +112,20 @@ def to_update_kwargs(self) -> ModelConfigKwargs: return ModelConfigKwargs(**result) def validate_api_key(self, role: str = cs.DEFAULT_MODEL_ROLE) -> None: - local_providers = {cs.Provider.OLLAMA, cs.Provider.LOCAL, cs.Provider.VLLM} - if self.provider.lower() in local_providers: + provider_lower = self.provider.lower() + provider_env_keys = { + cs.Provider.ANTHROPIC: cs.ENV_ANTHROPIC_API_KEY, + cs.Provider.AZURE: cs.ENV_AZURE_API_KEY, + } + env_key = provider_env_keys.get(provider_lower) + if ( + provider_lower in LOCAL_PROVIDERS + or ( + provider_lower == cs.Provider.GOOGLE + and self.provider_type == cs.GoogleProviderType.VERTEX + ) + or (env_key and os.environ.get(env_key)) + ): return if ( not self.api_key @@ -139,6 +150,8 @@ class AppConfig(BaseSettings): MEMGRAPH_HOST: str = "localhost" MEMGRAPH_PORT: int = 7687 MEMGRAPH_HTTP_PORT: int = 7444 + MEMGRAPH_USERNAME: str | None = None + MEMGRAPH_PASSWORD: str | None = None LAB_PORT: int = 3000 MEMGRAPH_BATCH_SIZE: int = 1000 AGENT_RETRIES: int = 3 @@ -150,7 +163,7 @@ class AppConfig(BaseSettings): ORCHESTRATOR_ENDPOINT: str | None = None ORCHESTRATOR_PROJECT_ID: str | None = None ORCHESTRATOR_REGION: str = cs.DEFAULT_REGION - ORCHESTRATOR_PROVIDER_TYPE: str | None = None + ORCHESTRATOR_PROVIDER_TYPE: cs.GoogleProviderType | None = None ORCHESTRATOR_THINKING_BUDGET: int | None = None ORCHESTRATOR_SERVICE_ACCOUNT_FILE: str | None = None @@ -160,7 +173,7 @@ class AppConfig(BaseSettings): CYPHER_ENDPOINT: str | None = None CYPHER_PROJECT_ID: str | None = None CYPHER_REGION: str = cs.DEFAULT_REGION - CYPHER_PROVIDER_TYPE: str | None = None + CYPHER_PROVIDER_TYPE: cs.GoogleProviderType | None = None CYPHER_THINKING_BUDGET: int | None = None CYPHER_SERVICE_ACCOUNT_FILE: str | None = None @@ -171,6 +184,11 @@ def ollama_endpoint(self) -> str: return 
f"{self.OLLAMA_BASE_URL.rstrip('/')}/v1" TARGET_REPO_PATH: str = "." + CPP_FRONTEND: cs.CppFrontend = cs.CppFrontend.TREESITTER + CAPTURE_FUNCTION_LOCAL_DEFINITIONS: bool = Field( + True, validation_alias="CGR_CAPTURE_LOCAL_DEFINITIONS" + ) + CGR_HOME: Path = Field(default_factory=lambda: Path.home() / ".cgr") SHELL_COMMAND_TIMEOUT: int = 30 SHELL_COMMAND_ALLOWLIST: frozenset[str] = frozenset( { @@ -235,24 +253,41 @@ def ollama_endpoint(self) -> str: ) QDRANT_DB_PATH: str = "./.qdrant_code_embeddings" + QDRANT_URL: str | None = None QDRANT_COLLECTION_NAME: str = "code_embeddings" QDRANT_VECTOR_DIM: int = 768 QDRANT_TOP_K: int = 5 + QDRANT_UPSERT_RETRIES: int = Field(default=3, gt=0) + QDRANT_RETRY_BASE_DELAY: float = Field(default=0.5, gt=0) + QDRANT_BATCH_SIZE: int = Field(default=50, gt=0) EMBEDDING_MAX_LENGTH: int = 512 EMBEDDING_PROGRESS_INTERVAL: int = 10 + FLUSH_THREAD_POOL_SIZE: int = Field(default=4, gt=0) + FILE_FLUSH_INTERVAL: int = Field(default=500, gt=0) + CACHE_MAX_ENTRIES: int = 1000 CACHE_MAX_MEMORY_MB: int = 500 CACHE_EVICTION_DIVISOR: int = 10 CACHE_MEMORY_THRESHOLD_RATIO: float = 0.8 + QUERY_RESULT_MAX_TOKENS: int = Field(default=16000, gt=0) + QUERY_RESULT_ROW_CAP: int = Field(default=500, gt=0) + QUERY_MEMORY_LIMIT_MB: int = Field(default=4096, gt=0) + QUERY_TIMEOUT_S: float = Field(default=60.0, gt=0) + OLLAMA_HEALTH_TIMEOUT: float = 5.0 + LITELLM_HEALTH_TIMEOUT: float = 5.0 _active_orchestrator: ModelConfig | None = None _active_cypher: ModelConfig | None = None QUIET: bool = Field(False, validation_alias="CGR_QUIET") + MCP_HTTP_HOST: str = "0.0.0.0" + MCP_HTTP_PORT: int = 8080 + MCP_HTTP_ENDPOINT_PATH: str = "/mcp" + def _get_default_config(self, role: str) -> ModelConfig: role_upper = role.upper() @@ -362,3 +397,34 @@ def load_cgrignore_patterns(repo_path: Path) -> CgrignorePatterns: except OSError as e: logger.warning(logs.CGRIGNORE_READ_FAILED.format(path=ignore_file, error=e)) return EMPTY_CGRIGNORE + + +CGR_INSTRUCTIONS_FILENAME = 
".cgr.md" +GLOBAL_CGR_INSTRUCTIONS_PATH = Path.home() / CGR_INSTRUCTIONS_FILENAME + + +def _read_cgr_instructions_file(path: Path) -> str | None: + if not path.is_file(): + return None + try: + with path.open(encoding="utf-8") as f: + body = f.read().strip() + except OSError as e: + logger.warning(logs.CGR_INSTRUCTIONS_READ_FAILED.format(path=path, error=e)) + return None + if not body: + return None + logger.info(logs.CGR_INSTRUCTIONS_LOADED.format(path=path, chars=len(body))) + return body + + +def load_cgr_instructions(repo_path: Path | None) -> str | None: + global_body = _read_cgr_instructions_file(GLOBAL_CGR_INSTRUCTIONS_PATH) + repo_body = ( + _read_cgr_instructions_file(repo_path / CGR_INSTRUCTIONS_FILENAME) + if repo_path is not None + else None + ) + if global_body and repo_body: + return f"{global_body}\n\n---\n\n{repo_body}" + return global_body or repo_body diff --git a/codebase_rag/constants.py b/codebase_rag/constants.py index 4ef971d8a..e94b28637 100644 --- a/codebase_rag/constants.py +++ b/codebase_rag/constants.py @@ -20,9 +20,7 @@ class Provider(StrEnum): OPENAI = "openai" GOOGLE = "google" AZURE = "azure" - COHERE = "cohere" - LOCAL = "local" - VLLM = "vllm" + LITELLM_PROXY = "litellm_proxy" class Color(StrEnum): @@ -36,8 +34,15 @@ class Color(StrEnum): class KeyBinding(StrEnum): CTRL_J = "c-j" + CTRL_E = "c-e" ENTER = "enter" CTRL_C = "c-c" + SHIFT_TAB = "s-tab" + + +class PermissionMode(StrEnum): + NORMAL = "normal" + YOLO = "yolo" class StyleModifier(StrEnum): @@ -89,7 +94,7 @@ class FileAction(StrEnum): EXT_IXX = ".ixx" EXT_CPPM = ".cppm" EXT_CCM = ".ccm" -EXT_CS = ".cs" +EXT_C = ".c" EXT_PHP = ".php" EXT_LUA = ".lua" @@ -101,6 +106,7 @@ class FileAction(StrEnum): GO_EXTENSIONS = (EXT_GO,) SCALA_EXTENSIONS = (EXT_SCALA, EXT_SC) JAVA_EXTENSIONS = (EXT_JAVA,) +C_EXTENSIONS = (EXT_C,) CPP_EXTENSIONS = ( EXT_CPP, EXT_H, @@ -113,7 +119,6 @@ class FileAction(StrEnum): EXT_CPPM, EXT_CCM, ) -CS_EXTENSIONS = (EXT_CS,) PHP_EXTENSIONS = (EXT_PHP,) 
LUA_EXTENSIONS = (EXT_LUA,) @@ -131,6 +136,10 @@ class FileAction(StrEnum): ENV_OPENAI_API_KEY = "OPENAI_API_KEY" ENV_GOOGLE_API_KEY = "GOOGLE_API_KEY" +ENV_ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY" +ENV_AZURE_API_KEY = "AZURE_API_KEY" +ENV_AZURE_ENDPOINT = "AZURE_OPENAI_ENDPOINT" +ENV_AZURE_API_VERSION = "AZURE_API_VERSION" HELP_ARG = "help" @@ -140,6 +149,11 @@ class GoogleProviderType(StrEnum): VERTEX = "vertex" +class CppFrontend(StrEnum): + TREESITTER = "treesitter" + LIBCLANG = "libclang" + + # (H) Provider endpoints OPENAI_DEFAULT_ENDPOINT = "https://api.openai.com/v1" OLLAMA_HEALTH_PATH = "/api/tags" @@ -150,11 +164,14 @@ class GoogleProviderType(StrEnum): HTTP_OK = 200 UNIXCODER_MODEL = "microsoft/unixcoder-base" +EMBEDDING_DEFAULT_BATCH_SIZE = 64 +EMBEDDING_CACHE_FILENAME = ".embedding_cache.json" KEY_NODES = "nodes" KEY_RELATIONSHIPS = "relationships" KEY_NODE_ID = "node_id" KEY_LABELS = "labels" +KEY_LABEL = "label" KEY_PROPERTIES = "properties" KEY_FROM_ID = "from_id" KEY_TO_ID = "to_id" @@ -168,9 +185,12 @@ class GoogleProviderType(StrEnum): KEY_PARSER = "parser" KEY_NAME = "name" KEY_QUALIFIED_NAME = "qualified_name" +KEY_QUERY = "query" +KEY_RESPONSE = "response" KEY_START_LINE = "start_line" KEY_END_LINE = "end_line" KEY_PATH = "path" +KEY_ABSOLUTE_PATH = "absolute_path" KEY_EXTENSION = "extension" KEY_MODULE_TYPE = "module_type" KEY_IMPLEMENTS_MODULE = "implements_module" @@ -209,12 +229,23 @@ class GoogleProviderType(StrEnum): ONEOF_EXTERNAL_PACKAGE = "external_package" ONEOF_MODULE_IMPLEMENTATION = "module_implementation" ONEOF_MODULE_INTERFACE = "module_interface" +ONEOF_INTERFACE = "interface_node" +ONEOF_ENUM = "enum_node" +ONEOF_TYPE = "type_node" +ONEOF_UNION = "union_node" # (H) CLI error and info messages CLI_ERR_OUTPUT_REQUIRES_UPDATE = ( "Error: --output/-o option requires --update-graph to be specified." ) CLI_ERR_ONLY_JSON = "Error: Currently only JSON format is supported." 
+CLI_ERR_JSON_REQUIRES_ASK_AGENT = ( + "Error: --output-format json requires --ask-agent/-a; " + "it only applies to single-query output." +) +CLI_ERR_PATH_NOT_EXISTS = "Error: --repo-path does not exist: {path}" +CLI_ERR_PATH_NOT_DIR = "Error: --repo-path is not a directory: {path}" +CLI_WARN_NOT_GIT_REPO = "Warning: --repo-path is not a Git repository: {path}" CLI_ERR_STARTUP = "Startup Error: {error}" CLI_ERR_CONFIG = "Configuration Error: {error}" CLI_ERR_INDEXING = "An error occurred during indexing: {error}" @@ -223,7 +254,34 @@ class GoogleProviderType(StrEnum): CLI_ERR_MCP_SERVER = "MCP Server Error: {error}" CLI_MSG_UPDATING_GRAPH = "Updating knowledge graph for: {path}" +CLI_MSG_SYNCING_GRAPH = "Syncing knowledge graph for: {path} (use --no-sync to skip)" +CLI_MSG_WORKSPACE_SYNCING = "Syncing workspace '{name}' ({count} repos)..." +CLI_MSG_WORKSPACE_SYNC_REPO = ( + "[{idx}/{total}] Syncing {path} as project '{project_name}'" +) +CLI_MSG_WORKSPACE_EMPTY = ( + "Workspace '{name}' has no repos (use cgr workspace add-repo)." +) +MSG_SYNCING_KNOWLEDGE_GRAPH = ( + "[bold cyan]Syncing knowledge graph[/bold cyan] (incremental, --no-sync to skip)" +) +MSG_SYNCING_WORKSPACE = ( + "[bold cyan]Syncing workspace '{name}'[/bold cyan] ({count} repos)" +) +CLI_MSG_SYNC_SKIPPED = "Knowledge graph already in sync for '{project}' ({elapsed:.2f}s, no changes detected)." +CLI_MSG_SYNC_DONE = "Knowledge graph sync done for '{project}' in {elapsed:.2f}s." CLI_MSG_CLEANING_DB = "Cleaning database..." +CLI_MSG_CLEANING_HASH_CACHE = "Removing hash cache: {path}" +CLI_MSG_CLEAN_DONE = "Clean completed successfully!" +CLI_MSG_DELETING_PROJECT = "Deleting project '{project_name}' from the graph..." +CLI_MSG_PROJECT_DELETED = "Project '{project_name}' deleted successfully." +CLI_ERR_PROJECT_NOT_FOUND = ( + "Project '{project_name}' not found. Available projects: {projects}" +) +CLI_ERR_PROJECT_NAME_REQUIRED = ( + "Error: --name is required and must be a non-empty project name." 
+) +CLI_ERR_DELETE_PROJECT_FAILED = "Failed to delete project '{project_name}': {error}" CLI_MSG_EXPORTING_TO = "Exporting graph to: {path}" CLI_MSG_GRAPH_UPDATED = "Graph update completed!" CLI_MSG_APP_TERMINATED = "\nApplication terminated by user." @@ -234,10 +292,39 @@ class GoogleProviderType(StrEnum): CLI_MSG_EXPORTING_DATA = "Exporting graph data..." CLI_MSG_OPTIMIZATION_TERMINATED = "\nOptimization session terminated by user." CLI_MSG_MCP_TERMINATED = "\nMCP server terminated by user." +PACKAGE_NAME = "code-graph-rag" +CLI_MSG_VERSION = "{package} version {version}" CLI_MSG_HINT_TARGET_REPO = ( "\nHint: Make sure TARGET_REPO_PATH environment variable is set." ) CLI_MSG_GRAPH_SUMMARY = "Graph Summary:" +CLI_MSG_CONNECTING_STATS = "Fetching graph statistics..." +CLI_STATS_NODE_TITLE = "Node Statistics" +CLI_STATS_REL_TITLE = "Relationship Statistics" +CLI_STATS_COL_NODE_TYPE = "Node Type" +CLI_STATS_COL_REL_TYPE = "Relationship Type" +CLI_STATS_COL_COUNT = "Count" +CLI_STATS_TOTAL_NODES = "Total Nodes" +CLI_STATS_TOTAL_RELS = "Total Relationships" +CLI_STATS_UNKNOWN = "Unknown" +CLI_ERR_STATS_FAILED = "Failed to get graph statistics: {error}" + +CLI_DEADCODE_CONNECTING = "Scanning for unreachable functions and methods..." +CLI_DEADCODE_TABLE_TITLE = "Dead Code Candidates ({project_name})" +CLI_DEADCODE_COL_KIND = "Kind" +CLI_DEADCODE_COL_QUALIFIED_NAME = "Qualified Name" +CLI_DEADCODE_COL_LINES = "Lines" +CLI_DEADCODE_LINE_RANGE = "{start}-{end}" +CLI_DEADCODE_SUMMARY = "{count} candidate(s) for review." +CLI_DEADCODE_NONE = "No unreachable functions or methods found." +CLI_DEADCODE_WRITTEN = "Wrote {count} candidate(s) to {path}" +CLI_ERR_DEADCODE_FAILED = "Failed to scan for dead code: {error}" +CLI_ERR_DEADCODE_NO_PROJECTS = ( + "No projects found in the graph. Index a repository first with 'cgr start'." +) +CLI_ERR_DEADCODE_AMBIGUOUS_PROJECT = ( + "Multiple projects found: {projects}. Specify which one with --project-name/-n." 
+) CLI_MSG_AUTO_EXCLUDE = ( "Auto-excluding common directories (venv, node_modules, .git, etc.). " "Use --interactive-setup to customize." @@ -247,9 +334,7 @@ class GoogleProviderType(StrEnum): UI_NEW_FILE_HEADER = "[bold cyan]New file: {path}[/bold cyan]" UI_SHELL_COMMAND_HEADER = "[bold cyan]Shell command:[/bold cyan]" UI_TOOL_APPROVAL = "[bold yellow]⚠️ Tool '{tool_name}' requires approval:[/bold yellow]" -UI_FEEDBACK_PROMPT = ( - "[bold yellow]Feedback (why rejected, or press Enter to skip)[/bold yellow]" -) +UI_FEEDBACK_PROMPT = "Feedback (why rejected, or press Enter to skip)" UI_OPTIMIZATION_START = ( "[bold green]Starting {language} optimization session...[/bold green]" ) @@ -268,7 +353,7 @@ class GoogleProviderType(StrEnum): UI_MODEL_SWITCHED = "[bold green]Model switched to: {model}[/bold green]" UI_MODEL_CURRENT = "[bold cyan]Current model: {model}[/bold cyan]" UI_MODEL_SWITCH_ERROR = "[bold red]Failed to switch model: {error}[/bold red]" -UI_MODEL_USAGE = "[bold yellow]Usage: /model (e.g., /model google:gemini-2.0-flash)[/bold yellow]" +UI_MODEL_USAGE = "[bold yellow]Usage: /model (e.g., /model google:gemini-3.1-pro-preview)[/bold yellow]" UI_HELP_COMMANDS = """[bold cyan]Available commands:[/bold cyan] /model - Switch to a different model /model - Show current model @@ -296,6 +381,9 @@ class GoogleProviderType(StrEnum): # (H) Qualified name separators SEPARATOR_DOT = "." SEPARATOR_SLASH = "/" +# (H) Disambiguates definitions that share one qualified name (if/else import +# (H) fallbacks, typing.overload, try/except fallbacks): "@". +DUP_QN_MARKER = "@" # (H) Path navigation PATH_CURRENT_DIR = "." 
@@ -318,6 +406,42 @@ class UniqueKeyType(StrEnum): QUALIFIED_NAME = KEY_QUALIFIED_NAME +class DeadCodeFormat(StrEnum): + TABLE = "table" + JSON = "json" + + +class QueryFormat(StrEnum): + TABLE = "table" + JSON = "json" + + +# (H) Decorators whose presence marks a function/method as an implicit entry point +# (H) (web routes, task/flow handlers, fixtures, CLI commands, event listeners). +DEFAULT_ROOT_DECORATORS: frozenset[str] = frozenset( + { + "route", + "get", + "post", + "put", + "delete", + "patch", + "websocket", + "task", + "flow", + "fixture", + "command", + "cli", + "app", + "on_event", + "listener", + } +) + +# (H) Substrings in a node's file path that mark it as test code. +TEST_PATH_PATTERNS: tuple[str, ...] = ("test_", "_test", "conftest", "/tests/") + + class NodeLabel(StrEnum): PROJECT = "Project" PACKAGE = "Package" @@ -377,6 +501,7 @@ class RelationshipType(StrEnum): IMPLEMENTS = "IMPLEMENTS" OVERRIDES = "OVERRIDES" CALLS = "CALLS" + INSTANTIATES = "INSTANTIATES" DEPENDS_ON_EXTERNAL = "DEPENDS_ON_EXTERNAL" @@ -417,14 +542,21 @@ class RelationshipType(StrEnum): # (H) Cypher queries CYPHER_DEFAULT_LIMIT = 50 -CYPHER_QUERY_EMBEDDINGS = """ +_CYPHER_EMBEDDING_BASE = """ MATCH (m:Module)-[:DEFINES]->(n) WHERE (n:Function OR n:Method) - AND m.qualified_name STARTS WITH $project_name + '.' 
-RETURN id(n) AS node_id, n.qualified_name AS qualified_name, + AND m.qualified_name STARTS WITH ($project_name + '.') +""" + +CYPHER_QUERY_EMBEDDINGS = ( + _CYPHER_EMBEDDING_BASE + + """RETURN id(n) AS node_id, n.qualified_name AS qualified_name, n.start_line AS start_line, n.end_line AS end_line, m.path AS path """ +) + +CYPHER_QUERY_PROJECT_NODE_IDS = _CYPHER_EMBEDDING_BASE + "RETURN id(n) AS node_id\n" class SupportedLanguage(StrEnum): @@ -435,8 +567,8 @@ class SupportedLanguage(StrEnum): GO = "go" SCALA = "scala" JAVA = "java" + C = "c" CPP = "cpp" - CSHARP = "c-sharp" PHP = "php" LUA = "lua" @@ -468,6 +600,11 @@ class LanguageMetadata(NamedTuple): "Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules", "TypeScript", ), + SupportedLanguage.C: LanguageMetadata( + LanguageStatus.FULL, + "Functions, structs, unions, enums, preprocessor includes", + "C", + ), SupportedLanguage.CPP: LanguageMetadata( LanguageStatus.FULL, "Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces", @@ -498,14 +635,9 @@ class LanguageMetadata(NamedTuple): "Case classes, objects", "Scala", ), - SupportedLanguage.CSHARP: LanguageMetadata( - LanguageStatus.DEV, - "Classes, interfaces, generics (planned)", - "C#", - ), SupportedLanguage.PHP: LanguageMetadata( - LanguageStatus.DEV, - "Classes, functions, namespaces", + LanguageStatus.FULL, + "Classes, interfaces, traits, enums, namespaces, PHP 8 attributes", "PHP", ), } @@ -551,7 +683,6 @@ class LanguageMetadata(NamedTuple): IMPORT_NODES_FROM = ("import_from_statement",) IMPORT_NODES_MODULE = ("lexical_declaration", "export_statement") IMPORT_NODES_INCLUDE = ("preproc_include",) -IMPORT_NODES_USING = ("using_directive",) # (H) JS/TS specific node types JS_TS_FUNCTION_NODES = ( @@ -584,10 +715,16 @@ class LanguageMetadata(NamedTuple): FIELD_MODULE_NAME = "module_name" FIELD_ARGUMENTS = "arguments" FIELD_BODY = "body" +FIELD_RETURN_TYPE = "return_type" FIELD_CONSTRUCTOR = "constructor" 
FIELD_DECLARATOR = "declarator" FIELD_PARAMETERS = "parameters" +FIELD_RECEIVER = "receiver" FIELD_TYPE = "type" +# (H) Rust impl `trait`/`type` fields and a trait's supertrait `bounds`. +FIELD_TRAIT = "trait" +FIELD_BOUNDS = "bounds" +TS_RS_TRAIT_BOUNDS = "trait_bounds" FIELD_VALUE = "value" FIELD_LEFT = "left" FIELD_RIGHT = "right" @@ -601,7 +738,35 @@ class LanguageMetadata(NamedTuple): METHOD_ITEMS = "items" # (H) Image file extensions for chat image handling -IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif") +MULTIMODAL_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".pdf") +MIME_TYPE_PDF = "application/pdf" +MIME_TYPE_FALLBACK = "application/octet-stream" +YES_ANSWER = "y" +YES_ANSWERS = frozenset({"y", "yes", ""}) +NO_ANSWERS = frozenset({"n", "no"}) +SHIFT_TAB_ESCAPE = b"\x1b[Z" +DIFF_GIT_HEADER = "diff --git " +MARKDOWN_FENCE = "```" +MARKDOWN_FENCE_DIFF = "```diff" +DIFF_CONTINUATION_PREFIXES = ( + "diff --git ", + "index ", + "--- ", + "+++ ", + "@@ ", + "+", + "-", + " ", + "\\ ", + "new file mode", + "deleted file mode", + "old mode", + "new mode", + "rename from ", + "rename to ", + "similarity index ", + "Binary files ", +) # (H) CLI exit commands EXIT_COMMANDS = frozenset({"exit", "quit"}) @@ -670,6 +835,7 @@ class DiffMarker: MSG_CONNECTED_MEMGRAPH = "Successfully connected to Memgraph." MSG_THINKING_CANCELLED = "Thinking cancelled." MSG_TIMEOUT_FORMAT = "Operation timed out after {timeout} seconds." +MSG_TOOL_CALL_CANCELLED = "Tool call cancelled by user." MSG_CHAT_INSTRUCTIONS = ( "Ask questions about your codebase graph. Type 'exit' or 'quit' to end." 
) @@ -679,7 +845,65 @@ class DiffMarker: OPTIMIZATION_TABLE_TITLE = "Optimization Session Configuration" PROMPT_ASK_QUESTION = "Ask a question" PROMPT_YOUR_RESPONSE = "Your response" -MULTILINE_INPUT_HINT = "(Press Ctrl+J to submit, Enter for new line)" +MULTILINE_INPUT_HINT = ( + "(Press Ctrl+J or Ctrl+E to submit, Enter for new line, Shift+Tab to toggle mode)" +) +PERMISSION_MODE_NORMAL_LABEL = "● Normal mode (asks before destructive)" +PERMISSION_MODE_YOLO_LABEL = "● YOLO mode (auto-approve, allowlist off)" +PERMISSION_MODE_TOGGLED = "Permission mode: {label}" +STATUS_BAR_BRANCH_CLEAN_HTML = ( + '' +) +STATUS_BAR_BRANCH_DIRTY_HTML = ( + '' +) +STATUS_BAR_BRANCH_CLEAN_PLAIN = " ⎇ {branch} " +STATUS_BAR_BRANCH_DIRTY_PLAIN = " ⎇ {branch} ± " +STATUS_BAR_BRANCH_RICH_TEXT = " ⎇ {branch}{marker} " +STATUS_BAR_CLEAN_STYLE = "black on green" +STATUS_BAR_DIRTY_STYLE = "black on yellow" +STATUS_BAR_DIRTY_MARKER = " ±" +STATUS_BAR_SPINNER = "dots" +STATUS_BAR_SEPARATOR_CHAR = "─" +STATUS_BAR_SEPARATOR_COLOR = "#666666" +STATUS_BAR_TOKEN_HTML = ' ' +STATUS_BAR_CONFIG_COLOR = "#888888" +STATUS_BAR_CONFIG_LABEL_COLOR = "#5fafd7" +STATUS_BAR_CONFIG_SEPARATOR = " │ " +STATUS_BAR_CONFIG_LABEL_O = "O" +STATUS_BAR_CONFIG_LABEL_C = "C" +STATUS_BAR_CONFIG_LABEL_EDIT = "edit" +STATUS_BAR_CONFIG_LABEL_INSTRUCTIONS = "instructions" +STATUS_BAR_CONFIG_LABEL_REPO = "repo" +STATUS_BAR_EDIT_ON = "on" +STATUS_BAR_EDIT_OFF = "off" +TOKEN_THRESHOLD_WARNING = 50 +TOKEN_THRESHOLD_CRITICAL = 80 +TOKEN_COLOR_OK = "green" +TOKEN_COLOR_WARNING = "yellow" +TOKEN_COLOR_CRITICAL = "red" + +ANTHROPIC_COUNT_TOKENS_URL = "https://api.anthropic.com/v1/messages/count_tokens" +ANTHROPIC_API_VERSION = "2023-06-01" +ANTHROPIC_HEADER_API_KEY = "x-api-key" +ANTHROPIC_HEADER_VERSION = "anthropic-version" +HEADER_CONTENT_TYPE = "content-type" +CONTENT_TYPE_JSON = "application/json" +ANTHROPIC_COUNT_TIMEOUT_S = 10.0 + +DEFAULT_CONTEXT_WINDOW = 200_000 +MODEL_CONTEXT_WINDOWS: dict[str, int] = { + "claude-opus-4-7": 
1_000_000, + "claude-opus-4-6": 200_000, + "claude-opus-4-5": 200_000, + "claude-opus-4-1": 200_000, + "claude-opus-4-0": 200_000, + "claude-sonnet-4-6": 200_000, + "claude-sonnet-4-5": 200_000, + "claude-sonnet-4-0": 200_000, + "claude-haiku-4-5": 200_000, + "claude-haiku-4-0": 200_000, +} # (H) Interactive setup prompt - grouped view INTERACTIVE_TITLE_GROUPED = "Detected Directories (will be excluded unless kept)" @@ -723,6 +947,7 @@ class DiffMarker: INPLACE_FLAG = "--inplace" LANG_ATTR_PREFIX = "language_" LANG_ATTR_TYPESCRIPT = "language_typescript" +LANG_ATTR_PHP = "language_php" class TreeSitterModule(StrEnum): @@ -733,8 +958,10 @@ class TreeSitterModule(StrEnum): GO = "tree_sitter_go" SCALA = "tree_sitter_scala" JAVA = "tree_sitter_java" + C = "tree_sitter_c" CPP = "tree_sitter_cpp" LUA = "tree_sitter_lua" + PHP = "tree_sitter_php" # (H) Query dict keys @@ -839,10 +1066,34 @@ class TreeSitterModule(StrEnum): class EventType(StrEnum): MODIFIED = "modified" CREATED = "created" + DELETED = "deleted" -CYPHER_DELETE_MODULE = "MATCH (m:Module {path: $path})-[*0..]->(c) DETACH DELETE m, c" +CYPHER_DELETE_MODULE = ( + "MATCH (m:Module {path: $path}) " + "OPTIONAL MATCH (m)-[:DEFINES|DEFINES_METHOD*0..]->(c) " + "DETACH DELETE m, c" +) +CYPHER_DELETE_FILE = "MATCH (f:File {path: $path}) DETACH DELETE f" +CYPHER_DELETE_FOLDER = "MATCH (f:Folder {path: $path}) DETACH DELETE f" CYPHER_DELETE_CALLS = "MATCH ()-[r:CALLS]->() DELETE r" +# (H) Removes external import-target Module nodes that no module imports anymore +# (H) (e.g. an imported name that was renamed/removed on an incremental rebuild). 
+CYPHER_DELETE_ORPHAN_EXTERNAL_MODULES = ( + "MATCH (m:Module) WHERE m.is_external = true AND NOT (m)<--() DETACH DELETE m" +) + +# (H) Queries for orphan pruning — returns all paths stored in the graph +CYPHER_ALL_FILE_PATHS = ( + "MATCH (f:File) RETURN f.path AS path, f.absolute_path AS absolute_path" +) +CYPHER_ALL_MODULE_PATHS_INTERNAL = ( + "MATCH (m:Module) WHERE m.is_external IS NULL OR m.is_external = false " + "RETURN m.path AS path, m.qualified_name AS qualified_name" +) +CYPHER_ALL_FOLDER_PATHS = ( + "MATCH (f:Folder) RETURN f.path AS path, f.absolute_path AS absolute_path" +) REALTIME_LOGGER_FORMAT = ( "{time:YYYY-MM-DD HH:mm:ss.SSS} | " @@ -853,6 +1104,11 @@ class EventType(StrEnum): WATCHER_SLEEP_INTERVAL = 1 LOG_LEVEL_INFO = "INFO" +LOG_LEVEL_ERROR = "ERROR" + +# (H) Debounce settings for realtime watcher +DEFAULT_DEBOUNCE_SECONDS = 5 +DEFAULT_MAX_WAIT_SECONDS = 30 class Architecture(StrEnum): @@ -880,8 +1136,11 @@ class Architecture(StrEnum): PYINSTALLER_ARG_COLLECT_ALL = "--collect-all" PYINSTALLER_ARG_COLLECT_DATA = "--collect-data" PYINSTALLER_ARG_HIDDEN_IMPORT = "--hidden-import" +PYINSTALLER_ARG_EXCLUDE_MODULE = "--exclude-module" PYINSTALLER_ENTRY_POINT = "main.py" +PYINSTALLER_EXCLUDED_MODULES = ["logfire"] + # (H) TOML parsing constants TOML_KEY_PROJECT = "project" TOML_KEY_OPTIONAL_DEPS = "optional-dependencies" @@ -905,6 +1164,7 @@ class Architecture(StrEnum): PyInstallerPackage(name="loguru", collect_all=True), PyInstallerPackage(name="toml", collect_all=True), PyInstallerPackage(name="protobuf", collect_all=True), + PyInstallerPackage(name="genai_prices", collect_all=True), ] ALLOWED_COMMENT_MARKERS = frozenset( @@ -961,6 +1221,46 @@ class UniXcoderMode(StrEnum): CYPHER_SEMICOLON = ";" CYPHER_BACKTICK = "`" CYPHER_MATCH_KEYWORD = "MATCH" +CYPHER_DANGEROUS_KEYWORDS: frozenset[str] = frozenset( + { + "DELETE", + "DETACH", + "DROP", + "CREATE INDEX", + "CREATE CONSTRAINT", + "REMOVE", + "SET", + "MERGE", + "CREATE", + "LOAD CSV", + 
"FOREACH", + } +) + +CYPHER_ALLOWED_PROCEDURE_PREFIXES: frozenset[str] = frozenset( + { + "algo.", + "betweenness_centrality.", + "biconnected_components.", + "bridges.", + "community_detection.", + "cycles.", + "degree_centrality.", + "graph_analyzer.", + "graph_util.", + "igraphalg.", + "katz_centrality.", + "leiden_community_detection.", + "neighbors.", + "node_similarity.", + "nxalg.", + "pagerank.", + "path.", + "schema.", + "weakly_connected_components.", + "wcc.", + } +) # (H) Tool success messages MSG_SURGICAL_SUCCESS = "Successfully applied surgical code replacement in: {path}" @@ -1105,12 +1405,23 @@ class UniXcoderMode(StrEnum): # (H) Query tool messages QUERY_NOT_AVAILABLE = "N/A" DICT_KEY_RESULTS = "results" +TIKTOKEN_ENCODING = "cl100k_base" QUERY_SUMMARY_SUCCESS = "Successfully retrieved {count} item(s) from the graph." +QUERY_SUMMARY_TRUNCATED = ( + "Results truncated: showing {kept} of {total} items (~{tokens} tokens, limit {max_tokens}). " + "Refine your query for more specific results." +) QUERY_SUMMARY_TRANSLATION_FAILED = ( "I couldn't translate your request into a database query. Error: {error}" ) QUERY_SUMMARY_DB_ERROR = "There was an error querying the database: {error}" +QUERY_SUMMARY_TIMEOUT = ( + "Query exceeded the {timeout:.1f}s timeout and was cancelled. " + "Avoid unbounded traversals; add depth bounds or use a graph-algorithm procedure." +) QUERY_RESULTS_PANEL_TITLE = "[bold blue]Cypher Query Results[/bold blue]" +CYPHER_MEMORY_LIMIT_SUFFIX = " QUERY MEMORY LIMIT {mb} MB" +CYPHER_MEMORY_LIMIT_TOKEN = "QUERY MEMORY LIMIT" # (H) File editor constants TMP_EXTENSION = ".tmp" @@ -1569,6 +1880,13 @@ class CppNodeType(StrEnum): # (H) Gemfile parsing patterns GEMFILE_GEM_PREFIX = "gem " +# (H) Incremental update hash cache +HASH_CACHE_FILENAME = ".cgr-hash-cache.json" +DIR_MTIMES_FILENAME = ".cgr-dir-mtimes.json" +GIT_DIR_NAME = ".git" +ROOT_DIR_KEY = "." 
+JSON_EMPTY_OBJECT = "{}" + # (H) Import processor cache config IMPORT_CACHE_TTL = 3600 IMPORT_CACHE_DIR = ".cache/codebase_rag" @@ -1666,6 +1984,11 @@ class CppNodeType(StrEnum): # (H) Tree-sitter Go node types TS_GO_TYPE_DECLARATION = "type_declaration" +TS_GO_TYPE_SPEC = "type_spec" +TS_GO_TYPE_ALIAS = "type_alias" +TS_GO_STRUCT_TYPE = "struct_type" +TS_GO_INTERFACE_TYPE = "interface_type" +TS_GO_PARAMETER_DECLARATION = "parameter_declaration" TS_GO_SOURCE_FILE = "source_file" TS_GO_FUNCTION_DECLARATION = "function_declaration" TS_GO_METHOD_DECLARATION = "method_declaration" @@ -1680,23 +2003,28 @@ class CppNodeType(StrEnum): TS_SCALA_FUNCTION_DEFINITION = "function_definition" TS_SCALA_FUNCTION_DECLARATION = "function_declaration" TS_SCALA_CALL_EXPRESSION = "call_expression" -TS_SCALA_GENERIC_FUNCTION = "generic_function" +# (H) Shared tree-sitter node type: a call with explicit type args, e.g. Rust +# (H) turbofish `f::<T>()` and Scala `f[T]()`. Its `function` field holds the +# (H) actual callee (identifier or scoped_identifier). 
+TS_GENERIC_FUNCTION = "generic_function" +TS_SCALA_GENERIC_FUNCTION = TS_GENERIC_FUNCTION TS_SCALA_FIELD_EXPRESSION = "field_expression" TS_SCALA_INFIX_EXPRESSION = "infix_expression" TS_SCALA_IMPORT_DECLARATION = "import_declaration" -# (H) Tree-sitter C# node types -TS_CS_STRUCT_DECLARATION = "struct_declaration" -TS_CS_COMPILATION_UNIT = "compilation_unit" -TS_CS_DESTRUCTOR_DECLARATION = "destructor_declaration" -TS_CS_LOCAL_FUNCTION_STATEMENT = "local_function_statement" -TS_CS_FUNCTION_POINTER_TYPE = "function_pointer_type" -TS_CS_ANONYMOUS_METHOD_EXPRESSION = "anonymous_method_expression" -TS_CS_LAMBDA_EXPRESSION = "lambda_expression" -TS_CS_INVOCATION_EXPRESSION = "invocation_expression" - # (H) Tree-sitter PHP node types +TS_PHP_FUNCTION_DEFINITION = "function_definition" +TS_PHP_METHOD_DECLARATION = "method_declaration" TS_PHP_TRAIT_DECLARATION = "trait_declaration" +# (H) PHP inheritance clauses: `extends ...` (base_clause, for class AND +# (H) interface) and `implements ...` (class_interface_clause); each lists `name` +# (H) nodes naming the base types. +TS_PHP_BASE_CLAUSE = "base_clause" +TS_PHP_CLASS_INTERFACE_CLAUSE = "class_interface_clause" +TS_PHP_NAME = "name" +# (H) PHP fully-qualified base (`\Exception`, `\App\Base`); its trailing `name` +# (H) child is the simple name cgr resolves against. 
+TS_PHP_QUALIFIED_NAME = "qualified_name" TS_PHP_FUNCTION_STATIC_DECLARATION = "function_static_declaration" TS_PHP_ANONYMOUS_FUNCTION = "anonymous_function" TS_PHP_ARROW_FUNCTION = "arrow_function" @@ -1704,6 +2032,19 @@ class CppNodeType(StrEnum): TS_PHP_SCOPED_CALL_EXPRESSION = "scoped_call_expression" TS_PHP_FUNCTION_CALL_EXPRESSION = "function_call_expression" TS_PHP_NULLSAFE_MEMBER_CALL_EXPRESSION = "nullsafe_member_call_expression" +TS_PHP_OBJECT_CREATION_EXPRESSION = "object_creation_expression" +TS_PHP_NAMESPACE_DEFINITION = "namespace_definition" +TS_PHP_NAMESPACE_USE_DECLARATION = "namespace_use_declaration" +TS_PHP_NAMESPACE_USE_CLAUSE = "namespace_use_clause" +TS_PHP_INCLUDE_EXPRESSION = "include_expression" +TS_PHP_INCLUDE_ONCE_EXPRESSION = "include_once_expression" +TS_PHP_REQUIRE_EXPRESSION = "require_expression" +TS_PHP_REQUIRE_ONCE_EXPRESSION = "require_once_expression" +TS_PHP_ATTRIBUTE_LIST = "attribute_list" +TS_PHP_ATTRIBUTE = "attribute" +TS_PHP_ATTRIBUTE_GROUP = "attribute_group" +TS_PHP_VISIBILITY_MODIFIER = "visibility_modifier" +TS_PHP_USE_DECLARATION = "use_declaration" # (H) Tree-sitter Lua node types for language_spec TS_LUA_CHUNK = "chunk" @@ -1739,11 +2081,15 @@ class CppNodeType(StrEnum): TS_VIRTUAL = "virtual" TS_TYPE_LIST = "type_list" TS_CLASS_HERITAGE = "class_heritage" +# (H) TS class `implements I, J` clause (a child of class_heritage). +TS_IMPLEMENTS_CLAUSE = "implements_clause" TS_EXTENDS_CLAUSE = "extends_clause" TS_MEMBER_EXPRESSION = "member_expression" TS_EXTENDS = "extends" TS_ARGUMENTS = "arguments" TS_EXTENDS_TYPE_CLAUSE = "extends_type_clause" +# (H) Java interface `extends A, B` clause (tree-sitter-java); holds a type_list. 
+TS_JAVA_EXTENDS_INTERFACES = "extends_interfaces" TS_METHOD_DEFINITION = "method_definition" TS_DECORATOR = "decorator" TS_ERROR = "ERROR" @@ -1827,6 +2173,20 @@ class CppNodeType(StrEnum): } ) +# (H) Java stdlib package prefixes for static stdlib detection +JAVA_STDLIB_PREFIXES = ( + "java.", + "javax.", + "jdk.", + "com.sun.", + "sun.", + "org.w3c.", + "org.xml.", + "org.ietf.", + "org.omg.", + "netscape.", +) + # (H) Java common class names for heuristic detection JAVA_STDLIB_CLASSES = frozenset( { @@ -1906,6 +2266,7 @@ class CppNodeType(StrEnum): # (H) Tree-sitter field names for child_by_field_name TS_FIELD_NAME = "name" TS_FIELD_TYPE = "type" +TS_SCOPED_TYPE_IDENTIFIER = "scoped_type_identifier" TS_FIELD_SUPERCLASS = "superclass" TS_FIELD_INTERFACES = "interfaces" TS_FIELD_TYPE_PARAMETERS = "type_parameters" @@ -2095,6 +2456,8 @@ class CppNodeType(StrEnum): TS_PY_FOR_STATEMENT = "for_statement" TS_PY_FOR_IN_CLAUSE = "for_in_clause" TS_PY_ASSIGNMENT = "assignment" +PY_ASSIGNMENT_QUERY = "(assignment) @assignment" +PY_RETURN_QUERY = "(return_statement) @return_stmt" TS_PY_CLASS_DEFINITION = "class_definition" TS_PY_BLOCK = "block" TS_PY_FUNCTION_DEFINITION = "function_definition" @@ -2109,11 +2472,57 @@ class CppNodeType(StrEnum): TS_PY_STRING = "string" TS_PY_DECORATED_DEFINITION = "decorated_definition" TS_PY_DECORATOR = "decorator" +TS_PY_KEYWORD_ARGUMENT = "keyword_argument" +TS_PY_DEFAULT_PARAMETER = "default_parameter" +TS_PY_LIST_SPLAT_PATTERN = "list_splat_pattern" +TS_PY_DICTIONARY_SPLAT_PATTERN = "dictionary_splat_pattern" +TS_PY_SUBSCRIPT = "subscript" +TS_PY_COMPARISON_OPERATOR = "comparison_operator" +TS_FIELD_OPERATORS = "operators" +TS_PY_IF_STATEMENT = "if_statement" +TS_PY_WHILE_STATEMENT = "while_statement" +TS_PY_ELIF_CLAUSE = "elif_clause" +TS_PY_CONDITIONAL_EXPRESSION = "conditional_expression" +TS_PY_BOOLEAN_OPERATOR = "boolean_operator" +TS_PY_NOT_OPERATOR = "not_operator" +TS_FIELD_CONDITION = "condition" +TS_FIELD_ARGUMENT = "argument" 
+ +# (H) Python operator syntax dispatches to dunder methods at runtime; these names +# (H) let the call extractor synthesize the implied .__dunder__ call. +PY_OP_IN = "in" +PY_BUILTIN_LEN = "len" +PY_BUILTIN_GETATTR = "getattr" +TS_PY_STRING_CONTENT = "string_content" +PY_DUNDER_GETITEM = "__getitem__" +PY_DUNDER_SETITEM = "__setitem__" +PY_DUNDER_CONTAINS = "__contains__" +PY_DUNDER_LEN = "__len__" +PY_DUNDER_BOOL = "__bool__" +# (H) Operands with these characters are not simple attribute/name chains (calls, +# (H) nested subscripts, whitespace), so the operator-dispatch synthesizer skips them. +PY_OPERAND_REJECT_CHARS = "()[]{}\n\t " +# (H) Optional annotation handling: X | None names a single concrete class. +PY_UNION_SEPARATOR = "|" +PY_NONE = "None" # (H) Python keyword identifiers PY_KEYWORD_SELF = "self" PY_KEYWORD_CLS = "cls" +# (H) typing.Protocol base name and the conventional XxxProtocol class suffix +# (H) used to map a Protocol to its concrete implementer. +PY_PROTOCOL = "Protocol" PY_METHOD_INIT = "__init__" +DECORATOR_AT = "@" +PROPERTY_DECORATORS: frozenset[str] = frozenset({"property", "cached_property"}) +ABSTRACT_DECORATORS: frozenset[str] = frozenset({"abstractmethod", "abstractproperty"}) + +# (H) Eager builtins that invoke a callable argument synchronously within the +# (H) caller's own stack frame; a function passed to one is invoked there, so the +# (H) trace attributes the call to the enclosing function (no Python frame exists +# (H) for the builtin). Lazy higher-order builtins (map/filter) are excluded: +# (H) they defer invocation until the result is consumed, which may be elsewhere. +HIGHER_ORDER_BUILTINS: frozenset[str] = frozenset({"sorted", "min", "max", "reduce"}) # (H) Python attribute prefixes PY_SELF_PREFIX = "self." 
@@ -2134,8 +2543,9 @@ class CppNodeType(StrEnum): TYPE_INFERENCE_LIST = "list" TYPE_INFERENCE_BASE_MODEL = "BaseModel" -# (H) Type inference guard attribute +# (H) Recursion guard attributes ATTR_TYPE_INFERENCE_IN_PROGRESS = "_type_inference_in_progress" +GUARD_INHERITED_METHOD = "_inherited_method_guard" # (H) JS/TS ingest node types TS_PAIR = "pair" @@ -2307,6 +2717,7 @@ class CppNodeType(StrEnum): # (H) Tree-sitter Rust node types TS_RS_SCOPED_TYPE_IDENTIFIER = "scoped_type_identifier" +TS_RS_PRIMITIVE_TYPE = "primitive_type" TS_RS_USE_AS_CLAUSE = "use_as_clause" TS_RS_USE_WILDCARD = "use_wildcard" TS_RS_USE_LIST = "use_list" @@ -2355,12 +2766,21 @@ class MCPToolName(StrEnum): DELETE_PROJECT = "delete_project" WIPE_DATABASE = "wipe_database" INDEX_REPOSITORY = "index_repository" + UPDATE_REPOSITORY = "update_repository" QUERY_CODE_GRAPH = "query_code_graph" GET_CODE_SNIPPET = "get_code_snippet" SURGICAL_REPLACE_CODE = "surgical_replace_code" READ_FILE = "read_file" WRITE_FILE = "write_file" LIST_DIRECTORY = "list_directory" + SEMANTIC_SEARCH = "semantic_search" + ASK_AGENT = "ask_agent" + + +# (H) MCP transport selection +class MCPTransport(StrEnum): + STDIO = "stdio" + HTTP = "http" # (H) MCP environment variables @@ -2400,6 +2820,8 @@ class MCPParamName(StrEnum): LIMIT = "limit" CONTENT = "content" DIRECTORY_PATH = "directory_path" + TOP_K = "top_k" + QUESTION = "question" # (H) MCP server constants @@ -2418,6 +2840,12 @@ class MCPParamName(StrEnum): MCP_WRITE_SUCCESS = "Successfully wrote file: {path}" MCP_UNKNOWN_TOOL_ERROR = "Unknown tool: {name}" MCP_TOOL_EXEC_ERROR = "Error executing tool '{name}': {error}" +MCP_UPDATE_SUCCESS = "Successfully updated repository at {path} (no database wipe)." +MCP_UPDATE_ERROR = "Error updating repository: {error}" +MCP_SEMANTIC_NOT_AVAILABLE_RESPONSE = ( + "Semantic search is not available. 
Install with: uv sync --extra semantic" +) +MCP_ASK_AGENT_ERROR = "Error running ask_agent: {error}" MCP_PROJECT_DELETED = "Successfully deleted project '{project_name}'." MCP_WIPE_CANCELLED = "Database wipe cancelled. Set confirm=true to proceed." MCP_WIPE_SUCCESS = "Database completely wiped. All projects have been removed." @@ -2460,11 +2888,14 @@ class MCPParamName(StrEnum): TS_FUNCTION_EXPRESSION, ) -# (H) FQN node type tuples for TS +# (H) FQN node type tuples for TS. The grammar emits `internal_module` for a +# (H) `namespace`/`module` block; without it a class declared inside a namespace +# (H) loses the namespace from its qn and collides with a top-level same name. FQN_TS_SCOPE_TYPES = ( TS_CLASS_DECLARATION, TS_INTERFACE_DECLARATION, TS_NAMESPACE_DEFINITION, + TS_INTERNAL_MODULE, TS_PROGRAM, TS_FUNCTION_DECLARATION, TS_FUNCTION_EXPRESSION, @@ -2550,35 +2981,19 @@ class MCPParamName(StrEnum): TS_SCALA_FUNCTION_DECLARATION, ) -# (H) FQN node type tuples for C# -FQN_CS_SCOPE_TYPES = ( - TS_CLASS_DECLARATION, - TS_CS_STRUCT_DECLARATION, - TS_INTERFACE_DECLARATION, - TS_CS_COMPILATION_UNIT, -) -FQN_CS_FUNCTION_TYPES = ( - TS_CS_DESTRUCTOR_DECLARATION, - TS_CS_LOCAL_FUNCTION_STATEMENT, - TS_CS_FUNCTION_POINTER_TYPE, - TS_CONSTRUCTOR_DECLARATION, - TS_CS_ANONYMOUS_METHOD_EXPRESSION, - TS_CS_LAMBDA_EXPRESSION, - TS_METHOD_DECLARATION, -) - # (H) FQN node type tuples for PHP FQN_PHP_SCOPE_TYPES = ( TS_CLASS_DECLARATION, TS_INTERFACE_DECLARATION, TS_PHP_TRAIT_DECLARATION, + TS_PHP_NAMESPACE_DEFINITION, TS_PROGRAM, ) FQN_PHP_FUNCTION_TYPES = ( - TS_PY_FUNCTION_DEFINITION, + TS_PHP_FUNCTION_DEFINITION, + TS_PHP_METHOD_DECLARATION, TS_PHP_ANONYMOUS_FUNCTION, TS_PHP_ARROW_FUNCTION, - TS_PHP_FUNCTION_STATIC_DECLARATION, ) # (H) LANGUAGE_SPECS node type tuples for Python @@ -2599,6 +3014,8 @@ class MCPParamName(StrEnum): TS_FUNCTION_DECLARATION, TS_CLASS_DECLARATION, TS_METHOD_DEFINITION, + # (H) TS `namespace`/`module` block; its `name` field scopes nested classes. 
+ TS_INTERNAL_MODULE, ) # (H) Derived node types for _rust_get_name @@ -2617,6 +3034,13 @@ class MCPParamName(StrEnum): TS_ENUM_SPECIFIER, ) +# (H) Derived node types for _c_get_name +C_NAME_NODE_TYPES = ( + TS_STRUCT_SPECIFIER, + TS_UNION_SPECIFIER, + TS_ENUM_SPECIFIER, +) + # (H) LANGUAGE_SPECS node type tuples for Rust SPEC_RS_FUNCTION_TYPES = ( TS_RS_FUNCTION_ITEM, @@ -2639,7 +3063,7 @@ class MCPParamName(StrEnum): # (H) LANGUAGE_SPECS node type tuples for Go SPEC_GO_FUNCTION_TYPES = (TS_GO_FUNCTION_DECLARATION, TS_GO_METHOD_DECLARATION) -SPEC_GO_CLASS_TYPES = (TS_GO_TYPE_DECLARATION,) +SPEC_GO_CLASS_TYPES = (TS_GO_TYPE_SPEC, TS_GO_TYPE_ALIAS) SPEC_GO_MODULE_TYPES = (TS_GO_SOURCE_FILE,) SPEC_GO_CALL_TYPES = (TS_GO_CALL_EXPRESSION,) SPEC_GO_IMPORT_TYPES = (TS_GO_IMPORT_DECLARATION,) @@ -2713,44 +3137,53 @@ class MCPParamName(StrEnum): PKG_CONANFILE, ) -# (H) LANGUAGE_SPECS node type tuples for C# -SPEC_CS_FUNCTION_TYPES = ( - TS_CS_DESTRUCTOR_DECLARATION, - TS_CS_LOCAL_FUNCTION_STATEMENT, - TS_CS_FUNCTION_POINTER_TYPE, - TS_CONSTRUCTOR_DECLARATION, - TS_CS_ANONYMOUS_METHOD_EXPRESSION, - TS_CS_LAMBDA_EXPRESSION, - TS_METHOD_DECLARATION, +# (H) FQN node type tuples for C +FQN_C_SCOPE_TYPES = ( + TS_CPP_TRANSLATION_UNIT, + TS_STRUCT_SPECIFIER, + TS_UNION_SPECIFIER, + TS_ENUM_SPECIFIER, ) -SPEC_CS_CLASS_TYPES = ( - TS_CLASS_DECLARATION, - TS_CS_STRUCT_DECLARATION, - TS_ENUM_DECLARATION, - TS_INTERFACE_DECLARATION, +FQN_C_FUNCTION_TYPES = (TS_CPP_FUNCTION_DEFINITION,) + +# (H) LANGUAGE_SPECS node type tuples for C +SPEC_C_FUNCTION_TYPES = (TS_CPP_FUNCTION_DEFINITION,) +SPEC_C_CLASS_TYPES = ( + TS_STRUCT_SPECIFIER, + TS_UNION_SPECIFIER, + TS_ENUM_SPECIFIER, ) -SPEC_CS_MODULE_TYPES = (TS_CS_COMPILATION_UNIT,) -SPEC_CS_CALL_TYPES = (TS_CS_INVOCATION_EXPRESSION,) +SPEC_C_MODULE_TYPES = (TS_CPP_TRANSLATION_UNIT,) +SPEC_C_CALL_TYPES = (TS_CPP_CALL_EXPRESSION,) +SPEC_C_PACKAGE_INDICATORS = (PKG_CMAKE_LISTS, PKG_MAKEFILE) # (H) LANGUAGE_SPECS node type tuples for PHP 
SPEC_PHP_FUNCTION_TYPES = ( - TS_PHP_FUNCTION_STATIC_DECLARATION, + TS_PHP_FUNCTION_DEFINITION, + TS_PHP_METHOD_DECLARATION, TS_PHP_ANONYMOUS_FUNCTION, - TS_PY_FUNCTION_DEFINITION, TS_PHP_ARROW_FUNCTION, ) SPEC_PHP_CLASS_TYPES = ( + TS_CLASS_DECLARATION, + TS_INTERFACE_DECLARATION, TS_PHP_TRAIT_DECLARATION, TS_ENUM_DECLARATION, - TS_INTERFACE_DECLARATION, - TS_CLASS_DECLARATION, ) SPEC_PHP_MODULE_TYPES = (TS_PROGRAM,) SPEC_PHP_CALL_TYPES = ( + TS_PHP_FUNCTION_CALL_EXPRESSION, TS_PHP_MEMBER_CALL_EXPRESSION, TS_PHP_SCOPED_CALL_EXPRESSION, - TS_PHP_FUNCTION_CALL_EXPRESSION, TS_PHP_NULLSAFE_MEMBER_CALL_EXPRESSION, + TS_PHP_OBJECT_CREATION_EXPRESSION, +) +SPEC_PHP_IMPORT_TYPES = (TS_PHP_NAMESPACE_USE_DECLARATION,) +SPEC_PHP_IMPORT_FROM_TYPES = ( + TS_PHP_INCLUDE_EXPRESSION, + TS_PHP_INCLUDE_ONCE_EXPRESSION, + TS_PHP_REQUIRE_EXPRESSION, + TS_PHP_REQUIRE_ONCE_EXPRESSION, ) # (H) LANGUAGE_SPECS node type tuples for Lua diff --git a/codebase_rag/cypher_queries.py b/codebase_rag/cypher_queries.py index 8d70bae4e..cf06641d7 100644 --- a/codebase_rag/cypher_queries.py +++ b/codebase_rag/cypher_queries.py @@ -52,8 +52,8 @@ CYPHER_EXAMPLE_LIMIT_ONE = """MATCH (f:File) RETURN f.path as path, f.name as name, labels(f) as type LIMIT 1""" CYPHER_EXAMPLE_CLASS_METHODS = f"""MATCH (c:Class)-[:DEFINES_METHOD]->(m:Method) -WHERE c.qualified_name ENDS WITH '.UserService' -RETURN m.name AS name, m.qualified_name AS qualified_name, labels(m) AS type +WHERE c.name = 'UserService' +RETURN c.name AS className, m.name AS methodName, m.qualified_name AS qualified_name, labels(m) AS type LIMIT {CYPHER_DEFAULT_LIMIT}""" CYPHER_EXPORT_NODES = """ @@ -84,6 +84,90 @@ """ +CYPHER_STATS_NODE_COUNTS = """ +MATCH (n) +RETURN labels(n) AS labels, count(*) AS count +ORDER BY count DESC +""" + +CYPHER_STATS_RELATIONSHIP_COUNTS = """ +MATCH ()-[r]->() +RETURN type(r) AS type, count(*) AS count +ORDER BY count DESC +""" + + +_DEAD_CODE_TEST_ROOT_CLAUSE = ( + "\n OR ANY(p IN $test_patterns WHERE n.path 
CONTAINS p)" +) + +# (H) A node reached by a Module node runs at import (top-level statement, +# (H) `if __name__ == "__main__"`, a bare decorator, or a module-scope +# (H) construction), so it is a root. `size([...])` avoids the non-standard +# (H) `exists(pattern)`. When tests are excluded, an edge from a test module must +# (H) NOT keep project code alive, so the test-module variant filters by path. +# (H) `{module_rels}` is the relationship set walked from the module (CALLS, plus +# (H) INSTANTIATES when classes are included so module-scope construction roots a +# (H) class). +_DEAD_CODE_MODULE_ROOT_ANY = "size([(n)<-[:{module_rels}]-(:Module) | 1]) > 0" +_DEAD_CODE_MODULE_ROOT_NON_TEST = ( + "size([(n)<-[:{module_rels}]-(m:Module)" + " WHERE NOT ANY(p IN $test_patterns WHERE m.path CONTAINS p) | 1]) > 0" +) + +# (H) Reachability walks CALLS only by default. With classes included it also +# (H) walks INSTANTIATES (construction keeps a class live) and INHERITS forward +# (H) from subclass to base, so a base is kept live only by a REACHABLE subclass. +# (H) A base whose sole subclass is itself unreachable is therefore reported as +# (H) part of the dead cluster (the subclass is reported too). Classes referenced +# (H) solely via type annotations / isinstance / dynamic lookups are not modelled +# (H) as edges, so class candidates are review hints, not a delete list. 
+_DEAD_CODE_QUERY_TEMPLATE = """MATCH (n:{labels}) +WHERE n.qualified_name STARTS WITH $project_prefix + AND ( + ANY(d IN n.decorators + WHERE toLower(last(split(split(replace(d, '@', ''), '(')[0], '.'))) + IN $root_decorators) + OR n.is_exported = true + OR ANY(e IN $entry_points WHERE n.qualified_name ENDS WITH e) + OR {module_clause}{test_clause} + ) +WITH collect(n) AS roots +UNWIND roots AS r +MATCH (r)-[:{traversal}*0..]->(live) +WITH collect(DISTINCT live) AS live_set +MATCH (n:{labels}) +WHERE n.qualified_name STARTS WITH $project_prefix + AND NOT n IN live_set +RETURN labels(n)[0] AS label, n.name AS name, + n.qualified_name AS qualified_name, + n.start_line AS start_line, n.end_line AS end_line +ORDER BY qualified_name""" + + +def build_dead_code_query(include_tests: bool, include_classes: bool = False) -> str: + if include_classes: + labels = "Function|Method|Class" + traversal = "CALLS|INSTANTIATES|INHERITS" + module_rels = "CALLS|INSTANTIATES" + else: + labels = "Function|Method" + traversal = "CALLS" + module_rels = "CALLS" + if include_tests: + module_clause = _DEAD_CODE_MODULE_ROOT_ANY.format(module_rels=module_rels) + test_clause = _DEAD_CODE_TEST_ROOT_CLAUSE + else: + module_clause = _DEAD_CODE_MODULE_ROOT_NON_TEST.format(module_rels=module_rels) + test_clause = "" + return _DEAD_CODE_QUERY_TEMPLATE.format( + labels=labels, + traversal=traversal, + module_clause=module_clause, + test_clause=test_clause, + ) + + def wrap_with_unwind(query: str) -> str: return f"UNWIND $batch AS row\n{query}" @@ -126,3 +210,24 @@ def build_merge_relationship_query( ) query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT return query + + +def build_create_node_query(label: str, id_key: str) -> str: + return f"CREATE (n:{label} {{{id_key}: row.id}})\nSET n += row.props" + + +def build_create_relationship_query( + from_label: str, + from_key: str, + rel_type: str, + to_label: str, + to_key: str, + has_props: bool = False, +) -> str: + query = ( + 
f"MATCH (a:{from_label} {{{from_key}: row.from_val}}), " + f"(b:{to_label} {{{to_key}: row.to_val}})\n" + f"CREATE (a)-[r:{rel_type}]->(b)\n" + ) + query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT + return query diff --git a/codebase_rag/docker-compose.yaml b/codebase_rag/docker-compose.yaml new file mode 100644 index 000000000..1b394c873 --- /dev/null +++ b/codebase_rag/docker-compose.yaml @@ -0,0 +1,27 @@ +services: + memgraph: + image: memgraph/memgraph-mage + ports: + - "${MEMGRAPH_PORT:-7687}:7687" + - "${MEMGRAPH_HTTP_PORT:-7444}:7444" + volumes: + - memgraph_data:/var/lib/memgraph + - memgraph_log:/var/log/memgraph + lab: + image: memgraph/lab + ports: + - "${LAB_PORT:-3000}:3000" + environment: + QUICK_CONNECT_MG_HOST: memgraph + qdrant: + image: qdrant/qdrant + ports: + - "${QDRANT_HTTP_PORT:-6333}:6333" + - "${QDRANT_GRPC_PORT:-6334}:6334" + volumes: + - qdrant_storage:/qdrant/storage + +volumes: + qdrant_storage: + memgraph_data: + memgraph_log: diff --git a/codebase_rag/embedder.py b/codebase_rag/embedder.py index 0928cae97..89b3b466f 100644 --- a/codebase_rag/embedder.py +++ b/codebase_rag/embedder.py @@ -1,19 +1,96 @@ -# ┌────────────────────────────────────────────────────────────────────────┐ -# │ UniXcoder Model Singleton via LRU Cache │ -# ├────────────────────────────────────────────────────────────────────────┤ -# │ get_model() provides: │ -# │ - Singleton behavior without global variables │ -# │ - Thread-safe lazy initialization │ -# │ - Easy testability with cache_clear() method │ -# │ - Memory efficient with maxsize=1 │ -# └────────────────────────────────────────────────────────────────────────┘ +from __future__ import annotations + +import hashlib +import json from functools import lru_cache +from pathlib import Path + +from loguru import logger +from . import constants as cs from . import exceptions as ex +from . 
import logs as ls from .config import settings -from .constants import UNIXCODER_MODEL from .utils.dependencies import has_torch, has_transformers + +class EmbeddingCache: + __slots__ = ("_cache", "_path") + + def __init__(self, path: Path | None = None) -> None: + self._cache: dict[str, list[float]] = {} + self._path = path + + @staticmethod + def _content_hash(content: str) -> str: + return hashlib.sha256(content.encode()).hexdigest() + + def get(self, content: str) -> list[float] | None: + return self._cache.get(self._content_hash(content)) + + def put(self, content: str, embedding: list[float]) -> None: + self._cache[self._content_hash(content)] = embedding + + def get_many(self, snippets: list[str]) -> dict[int, list[float]]: + results: dict[int, list[float]] = {} + for i, snippet in enumerate(snippets): + if (cached := self.get(snippet)) is not None: + results[i] = cached + return results + + def put_many(self, snippets: list[str], embeddings: list[list[float]]) -> None: + for snippet, embedding in zip(snippets, embeddings): + self.put(snippet, embedding) + + def save(self) -> None: + if self._path is None: + return + try: + self._path.parent.mkdir(parents=True, exist_ok=True) + with self._path.open("w", encoding="utf-8") as f: + json.dump(self._cache, f) + except Exception as e: + logger.warning(ls.EMBEDDING_CACHE_SAVE_FAILED, path=self._path, error=e) + + def load(self) -> None: + if self._path is None or not self._path.exists(): + return + try: + with self._path.open("r", encoding="utf-8") as f: + self._cache = json.load(f) + logger.debug( + ls.EMBEDDING_CACHE_LOADED, count=len(self._cache), path=self._path + ) + except Exception as e: + logger.warning(ls.EMBEDDING_CACHE_LOAD_FAILED, path=self._path, error=e) + self._cache = {} + + def clear(self) -> None: + self._cache.clear() + + def __len__(self) -> int: + return len(self._cache) + + +_embedding_cache: EmbeddingCache | None = None + + +def get_embedding_cache() -> EmbeddingCache: + global 
_embedding_cache + if _embedding_cache is None: + cache_path = Path(settings.QDRANT_DB_PATH) / cs.EMBEDDING_CACHE_FILENAME + _embedding_cache = EmbeddingCache(path=cache_path) + _embedding_cache.load() + return _embedding_cache + + +def clear_embedding_cache() -> None: + global _embedding_cache + if _embedding_cache is not None: + _embedding_cache.clear() + _embedding_cache = None + + if has_torch() and has_transformers(): import numpy as np import torch @@ -21,15 +98,29 @@ from .unixcoder import UniXcoder + def _select_device() -> str: + if torch.cuda.is_available(): + return "cuda" + if torch.backends.mps.is_available(): + return "mps" + return "cpu" + @lru_cache(maxsize=1) def get_model() -> UniXcoder: - model = UniXcoder(UNIXCODER_MODEL) + model = UniXcoder(cs.UNIXCODER_MODEL) model.eval() - if torch.cuda.is_available(): + device = _select_device() + if device == "cuda": model = model.cuda() + elif device == "mps": + model = model.to("mps") return model def embed_code(code: str, max_length: int | None = None) -> list[float]: + cache = get_embedding_cache() + if (cached := cache.get(code)) is not None: + return cached + if max_length is None: max_length = settings.EMBEDDING_MAX_LENGTH model = get_model() @@ -40,9 +131,63 @@ def embed_code(code: str, max_length: int | None = None) -> list[float]: _, sentence_embeddings = model(tokens_tensor) embedding: NDArray[np.float32] = sentence_embeddings.cpu().numpy() result: list[float] = embedding[0].tolist() + + cache.put(code, result) return result + def embed_code_batch( + snippets: list[str], + max_length: int | None = None, + batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE, + ) -> list[list[float]]: + if not snippets: + return [] + + if max_length is None: + max_length = settings.EMBEDDING_MAX_LENGTH + + cache = get_embedding_cache() + cached_results = cache.get_many(snippets) + + if len(cached_results) == len(snippets): + logger.debug(ls.EMBEDDING_CACHE_HIT, count=len(snippets)) + return [cached_results[i] for i 
in range(len(snippets))] + + uncached_indices = [i for i in range(len(snippets)) if i not in cached_results] + uncached_snippets = [snippets[i] for i in uncached_indices] + + model = get_model() + device = next(model.parameters()).device + + all_new_embeddings: list[list[float]] = [] + for start in range(0, len(uncached_snippets), batch_size): + batch = uncached_snippets[start : start + batch_size] + tokens_list = model.tokenize(batch, max_length=max_length, padding=True) + tokens_tensor = torch.tensor(tokens_list).to(device) + with torch.no_grad(): + _, sentence_embeddings = model(tokens_tensor) + batch_np: NDArray[np.float32] = sentence_embeddings.cpu().numpy() + for row in batch_np: + all_new_embeddings.append(row.tolist()) + + cache.put_many(uncached_snippets, all_new_embeddings) + + results: list[list[float]] = [[] for _ in snippets] + for i, emb in cached_results.items(): + results[i] = emb + for idx, orig_i in enumerate(uncached_indices): + results[orig_i] = all_new_embeddings[idx] + + return results + else: def embed_code(code: str, max_length: int | None = None) -> list[float]: raise RuntimeError(ex.SEMANTIC_EXTRA) + + def embed_code_batch( + snippets: list[str], + max_length: int | None = None, + batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE, + ) -> list[list[float]]: + raise RuntimeError(ex.SEMANTIC_EXTRA) diff --git a/codebase_rag/exceptions.py b/codebase_rag/exceptions.py index f30202395..21c479995 100644 --- a/codebase_rag/exceptions.py +++ b/codebase_rag/exceptions.py @@ -11,10 +11,26 @@ "OpenAI provider requires api_key. " "Set ORCHESTRATOR_API_KEY or CYPHER_API_KEY in .env file." ) +ANTHROPIC_NO_KEY = ( + "Anthropic provider requires api_key. " + "Set ORCHESTRATOR_API_KEY or CYPHER_API_KEY in .env file." +) +AZURE_NO_KEY = "Azure OpenAI provider requires api_key. Set AZURE_API_KEY in .env file." +AZURE_NO_ENDPOINT = ( + "Azure OpenAI provider requires endpoint. Set AZURE_OPENAI_ENDPOINT in .env file." 
+) OLLAMA_NOT_RUNNING = ( "Ollama server not responding at {endpoint}. " "Make sure Ollama is running: ollama serve" ) +LITELLM_NO_ENDPOINT = ( + "LiteLLM provider requires endpoint. " + "Set ORCHESTRATOR_ENDPOINT or CYPHER_ENDPOINT in .env file." +) +LITELLM_NOT_RUNNING = ( + "LiteLLM proxy server not responding at {endpoint}. " + "Make sure LiteLLM proxy is running and API key is valid." +) UNKNOWN_PROVIDER = "Unknown provider '{provider}'. Available providers: {available}" # (H) Dependency errors @@ -42,16 +58,29 @@ # (H) LLM errors LLM_INIT_CYPHER = "Failed to initialize CypherGenerator: {error}" LLM_INVALID_QUERY = "LLM did not generate a valid query. Output: {output}" +LLM_DANGEROUS_QUERY = "LLM generated a destructive Cypher query (found '{keyword}'). Query rejected: {query}" +LLM_UNBOUNDED_PATH = ( + "LLM generated an unbounded variable-length path pattern " + "(e.g. [:TYPE*] or [:TYPE*N..]) which causes memory exhaustion on cyclic graphs. " + "Add an upper bound such as [:TYPE*1..6]. Query rejected: {query}" +) +LLM_DISALLOWED_PROCEDURE = ( + "LLM generated a CALL to procedure '{name}' which is outside the read-only " + "MAGE allowlist. Query rejected: {query}" +) LLM_GENERATION_FAILED = "Cypher generation failed: {error}" LLM_INIT_ORCHESTRATOR = "Failed to initialize RAG Orchestrator: {error}" # (H) Graph service errors BATCH_SIZE = "batch_size must be a positive integer" CONN = "Not connected to Memgraph." +AUTH_INCOMPLETE = ( + "Both username and password are required for authentication. " + "Either provide both or neither." +) # (H) Access control errors (used with raise) ACCESS_DENIED = "Access denied: Cannot access files outside the project root." -DOC_UNSUPPORTED_PROVIDER = "DocumentAnalyzer does not support the 'local' LLM provider." 
# (H) Exception classes diff --git a/codebase_rag/graph_loader.py b/codebase_rag/graph_loader.py index b69635755..6a210c6d5 100644 --- a/codebase_rag/graph_loader.py +++ b/codebase_rag/graph_loader.py @@ -13,6 +13,18 @@ class GraphLoader: + __slots__ = ( + "file_path", + "_data", + "_nodes", + "_relationships", + "_nodes_by_id", + "_nodes_by_label", + "_outgoing_rels", + "_incoming_rels", + "_property_indexes", + ) + def __init__(self, file_path: str): self.file_path = Path(file_path) self._data: GraphData | None = None diff --git a/codebase_rag/graph_updater.py b/codebase_rag/graph_updater.py index 2620d2bcb..442471c24 100644 --- a/codebase_rag/graph_updater.py +++ b/codebase_rag/graph_updater.py @@ -1,16 +1,27 @@ +import hashlib +import json +import os import sys from collections import OrderedDict, defaultdict from collections.abc import Callable, ItemsView, KeysView from pathlib import Path from loguru import logger -from tree_sitter import Node, Parser +from rich.progress import Progress, SpinnerColumn, TextColumn +from tree_sitter import Node, Parser, QueryCursor from . import constants as cs from . 
import logs as ls from .config import settings from .language_spec import LANGUAGE_FQN_SPECS, get_language_spec +from .parser_loader import COMBINED_FUNC_CLASS_IMPORT_QUERIES +from .parsers.cpp_frontend import ( + cpp_frontend_available, + find_compile_commands, + run_cpp_frontend, +) from .parsers.factory import ProcessorFactory +from .parsers.utils import sorted_captures from .services import IngestorProtocol, QueryProtocol from .types_defs import ( EmbeddingQueryResult, @@ -24,19 +35,90 @@ ) from .utils.dependencies import has_semantic_dependencies from .utils.fqn_resolver import find_function_source_by_fqn -from .utils.path_utils import should_skip_path +from .utils.path_utils import ( + cached_relative_path, + should_skip_path, + should_skip_rel_file, +) from .utils.source_extraction import extract_source_with_fallback +type FileHashCache = dict[str, str] +type DirMtimesCache = dict[str, float] + class FunctionRegistryTrie: + __slots__ = ( + "root", + "_entries", + "_simple_name_lookup", + "_ending_with_cache", + "_duplicates", + "_properties", + "_property_names", + "_abstracts", + "_callable_params", + ) + def __init__(self, simple_name_lookup: SimpleNameLookup | None = None) -> None: self.root: TrieNode = {} self._entries: FunctionRegistry = {} self._simple_name_lookup = simple_name_lookup + self._ending_with_cache: dict[str, list[QualifiedName]] = {} + self._duplicates: dict[QualifiedName, list[QualifiedName]] = {} + self._properties: set[QualifiedName] = set() + self._property_names: set[str] = set() + self._abstracts: set[QualifiedName] = set() + self._callable_params: dict[QualifiedName, dict[str, int]] = {} + + def mark_callable_params( + self, qualified_name: QualifiedName, params: dict[str, int] + ) -> None: + if params: + self._callable_params[qualified_name] = params + + def callable_params(self, qualified_name: QualifiedName) -> dict[str, int] | None: + return self._callable_params.get(qualified_name) + + def mark_property(self, qualified_name: 
QualifiedName) -> None: + self._properties.add(qualified_name) + self._property_names.add(qualified_name.rsplit(cs.SEPARATOR_DOT, 1)[-1]) + + def is_property(self, qualified_name: QualifiedName) -> bool: + return qualified_name in self._properties + + def property_names(self) -> set[str]: + return self._property_names + + def mark_abstract(self, qualified_name: QualifiedName) -> None: + self._abstracts.add(qualified_name) + + def is_abstract(self, qualified_name: QualifiedName) -> bool: + return qualified_name in self._abstracts + + def register_unique_qn( + self, natural_qn: QualifiedName, start_line: int + ) -> QualifiedName: + if natural_qn not in self._entries: + return natural_qn + variant = f"{natural_qn}{cs.DUP_QN_MARKER}{start_line}" + bucket = self._duplicates.setdefault(natural_qn, [natural_qn]) + if variant not in bucket: + bucket.append(variant) + return variant + + def variants(self, qualified_name: QualifiedName) -> list[QualifiedName]: + return self._duplicates.get(qualified_name, [qualified_name]) def insert(self, qualified_name: QualifiedName, func_type: NodeType) -> None: + qualified_name = sys.intern(qualified_name) self._entries[qualified_name] = func_type + simple_name = qualified_name.rsplit(cs.SEPARATOR_DOT, 1)[-1] + if self._simple_name_lookup is not None: + self._simple_name_lookup[simple_name].add(qualified_name) + if self._ending_with_cache: + self._ending_with_cache.pop(simple_name, None) + parts = qualified_name.split(cs.SEPARATOR_DOT) current: TrieNode = self.root @@ -69,6 +151,30 @@ def __delitem__(self, qualified_name: QualifiedName) -> None: return del self._entries[qualified_name] + self._duplicates.pop(qualified_name, None) + for natural, bucket in list(self._duplicates.items()): + if qualified_name in bucket: + bucket.remove(qualified_name) + if len(bucket) <= 1: + self._duplicates.pop(natural, None) + simple_name = qualified_name.rsplit(cs.SEPARATOR_DOT, 1)[-1] + + if qualified_name in self._properties: + 
self._properties.discard(qualified_name) + if not any( + p.rsplit(cs.SEPARATOR_DOT, 1)[-1] == simple_name + for p in self._properties + ): + self._property_names.discard(simple_name) + self._abstracts.discard(qualified_name) + self._callable_params.pop(qualified_name, None) + + if self._ending_with_cache: + self._ending_with_cache.pop(simple_name, None) + + if self._simple_name_lookup is not None: + if simple_name in self._simple_name_lookup: + self._simple_name_lookup[simple_name].discard(qualified_name) parts = qualified_name.split(cs.SEPARATOR_DOT) self._cleanup_trie_path(parts, self.root) @@ -148,11 +254,20 @@ def find_with_prefix_and_suffix( return [qn for qn, _ in matches] def find_ending_with(self, suffix: str) -> list[QualifiedName]: - if self._simple_name_lookup is not None and suffix in self._simple_name_lookup: - # (H) O(1) lookup using the simple_name_lookup index - return list(self._simple_name_lookup[suffix]) - # (H) Fallback to linear scan if no index available - return [qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")] + cached = self._ending_with_cache.get(suffix) + if cached is not None: + return cached + if self._simple_name_lookup is not None: + if suffix in self._simple_name_lookup: + result = sorted(self._simple_name_lookup[suffix]) + else: + result = [] + else: + result = sorted( + qn for qn in self._entries.keys() if qn.endswith(f".{suffix}") + ) + self._ending_with_cache[suffix] = result + return result def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]: node = self._navigate_to_prefix(prefix) @@ -160,6 +275,8 @@ def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]: class BoundedASTCache: + __slots__ = ("cache", "max_entries", "max_memory_bytes") + def __init__( self, max_entries: int | None = None, @@ -220,6 +337,78 @@ def _should_evict_for_memory(self) -> bool: ) +def _hash_file(filepath: Path) -> str: + data = filepath.read_bytes() + return hashlib.md5(data, 
usedforsecurity=False).hexdigest() + + +def _hash_file_with_bytes(filepath: Path) -> tuple[str, bytes] | None: + try: + with open(filepath, "rb") as f: + data = f.read() + except OSError as e: + logger.warning(ls.FILE_UNREADABLE, path=filepath, error=e) + return None + return hashlib.md5(data, usedforsecurity=False).hexdigest(), data + + +def _load_hash_cache(cache_path: Path) -> FileHashCache: + if not cache_path.is_file(): + return {} + try: + with cache_path.open(encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + logger.info(ls.HASH_CACHE_LOADED, count=len(data), path=cache_path) + return data + except (json.JSONDecodeError, OSError) as e: + logger.warning(ls.HASH_CACHE_LOAD_FAILED, path=cache_path, error=e) + return {} + + +def _save_hash_cache(cache_path: Path, hashes: FileHashCache) -> None: + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + with cache_path.open("w", encoding="utf-8") as f: + json.dump(hashes, f, indent=2) + logger.info(ls.HASH_CACHE_SAVED, count=len(hashes), path=cache_path) + except OSError as e: + logger.warning(ls.HASH_CACHE_SAVE_FAILED, path=cache_path, error=e) + + +def _load_dir_mtimes(cache_path: Path) -> DirMtimesCache: + if not cache_path.is_file(): + return {} + try: + with cache_path.open(encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + return {k: float(v) for k, v in data.items() if isinstance(v, int | float)} + except (json.JSONDecodeError, OSError, ValueError): + pass + return {} + + +def _save_dir_mtimes(cache_path: Path, mtimes: DirMtimesCache) -> None: + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + with cache_path.open("w", encoding="utf-8") as f: + json.dump(mtimes, f) + except OSError: + pass + + +def _touch_empty_json(cache_path: Path) -> None: + if cache_path.exists(): + return + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + with cache_path.open("w", encoding="utf-8") as f: + f.write(cs.JSON_EMPTY_OBJECT) + except OSError: 
+ pass + + class GraphUpdater: def __init__( self, @@ -229,12 +418,20 @@ def __init__( queries: dict[cs.SupportedLanguage, LanguageQueries], unignore_paths: frozenset[str] | None = None, exclude_paths: frozenset[str] | None = None, + project_name: str | None = None, ): self.ingestor = ingestor + self._single_file: Path | None = None + if repo_path.is_file(): + resolved = repo_path.resolve() + self._single_file = resolved + repo_path = resolved.parent self.repo_path = repo_path self.parsers = parsers self.queries = queries - self.project_name = repo_path.resolve().name + self.project_name = ( + project_name and project_name.strip() + ) or repo_path.resolve().name self.simple_name_lookup: SimpleNameLookup = defaultdict(set) self.function_registry = FunctionRegistryTrie( simple_name_lookup=self.simple_name_lookup @@ -242,6 +439,9 @@ def __init__( self.ast_cache = BoundedASTCache() self.unignore_paths = unignore_paths self.exclude_paths = exclude_paths + self.skipped_because_in_sync = False + self._collected_dir_mtimes: DirMtimesCache = {} + self._cpp_frontend_covered: frozenset[str] = frozenset() self.factory = ProcessorFactory( ingestor=self.ingestor, @@ -255,25 +455,77 @@ def __init__( exclude_paths=self.exclude_paths, ) + def _run_cpp_frontend(self) -> None: + # (H) Optional libclang C++ pre-pass: when CPP_FRONTEND=libclang and a + # (H) compile_commands.json is discoverable, emit macro-accurate C/C++ + # (H) nodes/edges directly (tree-sitter cannot expand macros). Covered + # (H) files are then skipped by the tree-sitter definition pass. Missing + # (H) either condition falls back to tree-sitter with no change. 
+ self._cpp_frontend_covered = frozenset() + if settings.CPP_FRONTEND != cs.CppFrontend.LIBCLANG: + return + if not cpp_frontend_available(): + logger.warning(ls.CPP_FRONTEND_UNAVAILABLE) + return + compdb_dir = find_compile_commands(self.repo_path) + if compdb_dir is None: + logger.warning(ls.CPP_FRONTEND_NO_COMPDB) + return + logger.info(ls.CPP_FRONTEND_RUNNING.format(path=compdb_dir)) + self._cpp_frontend_covered = run_cpp_frontend( + self.ingestor, + self.repo_path, + self.project_name, + compdb_dir, + function_registry=self.function_registry, + simple_name_lookup=self.simple_name_lookup, + structural_elements=self.factory.structure_processor.structural_elements, + ) + logger.info( + ls.CPP_FRONTEND_COVERED.format(count=len(self._cpp_frontend_covered)) + ) + def _is_dependency_file(self, file_name: str, filepath: Path) -> bool: return ( file_name.lower() in cs.DEPENDENCY_FILES or filepath.suffix.lower() == cs.CSPROJ_SUFFIX ) - def run(self) -> None: + def run(self, force: bool = False) -> None: + py_engine = self.factory.type_inference._python_type_inference + if py_engine is not None: + py_engine._available_classes_cache.clear() + py_engine._return_stmt_cache.clear() + py_engine._method_return_type_cache.clear() + py_engine._self_assignment_cache.clear() self.ingestor.ensure_node_batch( cs.NODE_PROJECT, {cs.KEY_NAME: self.project_name} ) - logger.info(ls.ENSURING_PROJECT.format(name=self.project_name)) + logger.info(ls.ENSURING_PROJECT, name=self.project_name) + + if not force and self._is_already_in_sync(): + logger.info(ls.GRAPH_ALREADY_IN_SYNC) + self.skipped_because_in_sync = True + self.ingestor.flush_all() + return logger.info(ls.PASS_1_STRUCTURE) self.factory.structure_processor.identify_structure() + self._run_cpp_frontend() + logger.info(ls.PASS_2_FILES) - self._process_files() + self._process_files(force=force) - logger.info(ls.FOUND_FUNCTIONS.format(count=len(self.function_registry))) + corrected = 
self.factory.definition_processor.resolve_deferred_cpp_methods() + if corrected: + logger.info("Resolved {} deferred C++ out-of-class methods", corrected) + + go_methods = self.factory.definition_processor.resolve_deferred_go_methods() + if go_methods: + logger.info("Resolved {} Go receiver methods", go_methods) + + logger.info(ls.FOUND_FUNCTIONS, count=len(self.function_registry)) logger.info(ls.PASS_3_CALLS) self._process_function_calls() @@ -282,16 +534,18 @@ def run(self) -> None: logger.info(ls.ANALYSIS_COMPLETE) self.ingestor.flush_all() + self._prune_orphan_nodes() + self._generate_semantic_embeddings() def remove_file_from_state(self, file_path: Path) -> None: - logger.debug(ls.REMOVING_STATE.format(path=file_path)) + logger.debug(ls.REMOVING_STATE, path=file_path) if file_path in self.ast_cache: del self.ast_cache[file_path] logger.debug(ls.REMOVED_FROM_CACHE) - relative_path = file_path.relative_to(self.repo_path) + relative_path = cached_relative_path(file_path, self.repo_path) path_parts = ( relative_path.parent.parts if file_path.name == cs.INIT_PY @@ -307,51 +561,480 @@ def remove_file_from_state(self, file_path: Path) -> None: del self.function_registry[qn] if qns_to_remove: - logger.debug(ls.REMOVING_QNS.format(count=len(qns_to_remove))) + logger.debug(ls.REMOVING_QNS, count=len(qns_to_remove)) for simple_name, qn_set in self.simple_name_lookup.items(): original_count = len(qn_set) new_qn_set = qn_set - qns_to_remove if len(new_qn_set) < original_count: self.simple_name_lookup[simple_name] = new_qn_set - logger.debug(ls.CLEANED_SIMPLE_NAME.format(name=simple_name)) + logger.debug(ls.CLEANED_SIMPLE_NAME, name=simple_name) + + def _delete_module_entities(self, file_key: str) -> None: + """Remove a changed/deleted file's Module subtree from the graph. 
+ + The incremental path re-parses a changed file and re-adds its + entities, but the entities the previous parse contributed (the + Module and everything it DEFINES, plus their IMPORTS/CALLS edges via + DETACH) must be removed first; otherwise renamed-away Function/Class/ + Method nodes and their edges linger alongside the new ones. + """ + if isinstance(self.ingestor, QueryProtocol): + self.ingestor.execute_write( + cs.CYPHER_DELETE_MODULE, {cs.KEY_PATH: file_key} + ) - def _process_files(self) -> None: - for filepath in self.repo_path.rglob("*"): - if filepath.is_file() and not should_skip_path( - filepath, + def _diff_dir_against_cache( + self, + dir_path_str: str, + dir_key: str, + old_hashes: FileHashCache, + old_dir_mtimes: DirMtimesCache, + ) -> tuple[str | None, str | None]: + prefix = "" if dir_key == cs.ROOT_DIR_KEY else f"{dir_key}/" + expected_files: set[str] = set() + expected_dirs: set[str] = set() + for fk in old_hashes: + if fk.startswith(prefix): + rest = fk[len(prefix) :] + if "/" not in rest: + expected_files.add(rest) + for dk in old_dir_mtimes: + if dk == cs.ROOT_DIR_KEY or not dk.startswith(prefix): + continue + rest = dk[len(prefix) :] + if "/" not in rest: + expected_dirs.add(rest) + + actual_files: set[str] = set() + actual_dirs: set[str] = set() + try: + with os.scandir(dir_path_str) as it: + for entry in it: + name = entry.name + if name in (cs.HASH_CACHE_FILENAME, cs.DIR_MTIMES_FILENAME): + continue + try: + is_symlink = entry.is_symlink() + except OSError: + is_symlink = False + try: + is_dir_following = entry.is_dir() + except OSError: + is_dir_following = False + if is_symlink and is_dir_following: + continue + if is_dir_following: + actual_dirs.add(name) + else: + actual_files.add(name) + except OSError: + return None, dir_key + + dir_parts: tuple[str, ...] 
= ( + () if dir_key == cs.ROOT_DIR_KEY else tuple(dir_key.split("/")) + ) + dir_prefix_for_keep = "" if dir_key == cs.ROOT_DIR_KEY else f"{dir_key}/" + + for name in actual_dirs - expected_dirs: + if not self._should_keep_dir(name, dir_prefix_for_keep): + continue + return f"{prefix}{name}", None + for name in actual_files - expected_files: + dot = name.rfind(".") + suffix = name[dot:] if dot != -1 else "" + if should_skip_rel_file( + f"{prefix}{name}", + dir_parts, + suffix, + exclude_paths=self.exclude_paths, + unignore_paths=self.unignore_paths, + ): + continue + return f"{prefix}{name}", None + + for name in expected_files - actual_files: + return None, f"{prefix}{name}" + for name in expected_dirs - actual_dirs: + return None, f"{prefix}{name}" + + return None, None + + def _should_keep_dir(self, dirname: str, dir_prefix: str) -> bool: + if dirname not in cs.IGNORE_PATTERNS and ( + not self.exclude_paths or dirname not in self.exclude_paths + ): + return True + return bool( + self.unignore_paths + and any( + u.startswith(f"{dir_prefix}{dirname}/") or u == f"{dir_prefix}{dirname}" + for u in self.unignore_paths + ) + ) + + def _is_already_in_sync(self) -> bool: + if self._single_file is not None: + return False + cache_path = self.repo_path / cs.HASH_CACHE_FILENAME + if not cache_path.is_file(): + return False + cache_mtime = cache_path.stat().st_mtime + dir_mtimes_path = self.repo_path / cs.DIR_MTIMES_FILENAME + old_hashes = _load_hash_cache(cache_path) + old_dir_mtimes = _load_dir_mtimes(dir_mtimes_path) + if not old_hashes or not old_dir_mtimes: + return False + + repo_str = str(self.repo_path) + for dir_key, cached_mtime in old_dir_mtimes.items(): + dir_path_str = ( + repo_str if dir_key == cs.ROOT_DIR_KEY else f"{repo_str}/{dir_key}" + ) + try: + current_mtime = os.stat(dir_path_str).st_mtime + except OSError: + return False + if current_mtime != cached_mtime: + addition, removal = self._diff_dir_against_cache( + dir_path_str, dir_key, old_hashes, 
old_dir_mtimes + ) + if addition is not None or removal is not None: + return False + + for file_key, old_hash in old_hashes.items(): + file_path_str = f"{repo_str}/{file_key}" + try: + stat = os.stat(file_path_str) + except OSError: + return False + if stat.st_mtime <= cache_mtime: + continue + if _hash_file(Path(file_path_str)) != old_hash: + return False + return True + + def _collect_eligible_files(self) -> list[tuple[Path, str]]: + if self._single_file is not None: + if not should_skip_path( + self._single_file, self.repo_path, exclude_paths=self.exclude_paths, unignore_paths=self.unignore_paths, ): - lang_config = get_language_spec(filepath.suffix) - if ( - lang_config - and isinstance(lang_config.language, cs.SupportedLanguage) - and lang_config.language in self.parsers + file_key = cached_relative_path( + self._single_file, self.repo_path + ).as_posix() + return [(self._single_file, file_key)] + return [] + + eligible: list[tuple[Path, str]] = [] + hash_name = cs.HASH_CACHE_FILENAME + dir_mtimes_name = cs.DIR_MTIMES_FILENAME + repo_str = str(self.repo_path) + repo_prefix_len = len(repo_str) + 1 + exclude_paths = self.exclude_paths + unignore_paths = self.unignore_paths + self._collected_dir_mtimes = {} + for dirpath, dirnames, filenames in os.walk(repo_str): + if len(dirpath) < repo_prefix_len: + rel_dir = "" + dir_parts: tuple[str, ...] 
= () + dir_key = cs.ROOT_DIR_KEY + else: + rel_dir = dirpath[repo_prefix_len:].replace(os.sep, "/") + dir_parts = tuple(rel_dir.split("/")) if rel_dir else () + dir_key = rel_dir or cs.ROOT_DIR_KEY + dir_prefix = f"{rel_dir}/" if rel_dir else "" + try: + self._collected_dir_mtimes[dir_key] = os.stat(dirpath).st_mtime + except OSError: + pass + dirnames[:] = sorted( + d for d in dirnames if self._should_keep_dir(d, dir_prefix) + ) + for fname in sorted(filenames): + if fname in (hash_name, dir_mtimes_name): + continue + dot = fname.rfind(".") + suffix = fname[dot:] if dot != -1 else "" + rel_path_str = f"{dir_prefix}{fname}" + if not should_skip_rel_file( + rel_path_str, + dir_parts, + suffix, + exclude_paths=exclude_paths, + unignore_paths=unignore_paths, ): - result = self.factory.definition_processor.process_file( - filepath, - lang_config.language, - self.queries, - self.factory.structure_processor.structural_elements, + eligible.append((Path(f"{dirpath}/{fname}"), rel_path_str)) + return eligible + + def _process_files(self, force: bool = False) -> None: + cache_path = self.repo_path / cs.HASH_CACHE_FILENAME + dir_mtimes_path = self.repo_path / cs.DIR_MTIMES_FILENAME + old_hashes = _load_hash_cache(cache_path) if not force else {} + cache_mtime = cache_path.stat().st_mtime if cache_path.is_file() else 0.0 + if force: + logger.info(ls.INCREMENTAL_FORCE) + + _touch_empty_json(cache_path) + _touch_empty_json(dir_mtimes_path) + + eligible_files = self._collect_eligible_files() + new_hashes: FileHashCache = {} + skipped_count = 0 + changed_count = 0 + unreadable_count = 0 + + current_file_keys: set[str] = set() + + processed_since_flush = 0 + + changed_entries: list[tuple[Path, str, bool, bytes]] = [] + for filepath, file_key in eligible_files: + if not force and file_key in old_hashes: + try: + file_mtime = filepath.stat().st_mtime + except OSError: + unreadable_count += 1 + continue + if file_mtime <= cache_mtime: + new_hashes[file_key] = old_hashes[file_key] + 
current_file_keys.add(file_key) + skipped_count += 1 + continue + + hashed = _hash_file_with_bytes(filepath) + if hashed is None: + unreadable_count += 1 + continue + current_hash, file_bytes = hashed + + current_file_keys.add(file_key) + new_hashes[file_key] = current_hash + + if ( + not force + and file_key in old_hashes + and old_hashes[file_key] == current_hash + ): + logger.debug(ls.FILE_HASH_UNCHANGED, path=file_key) + skipped_count += 1 + continue + + is_new = file_key not in old_hashes + if not is_new: + logger.debug(ls.FILE_HASH_CHANGED, path=file_key) + else: + logger.debug(ls.FILE_HASH_NEW, path=file_key) + changed_entries.append((filepath, file_key, is_new, file_bytes)) + + pre_parsed = self._pre_parse_changed_files(changed_entries) + + with Progress( + SpinnerColumn(), + TextColumn(ls.PROGRESS_INDEXING_LABEL), + TextColumn("[progress.description]{task.description}"), + transient=True, + disable=not sys.stderr.isatty(), + ) as progress: + task = progress.add_task("", total=len(eligible_files)) + if skipped_count or unreadable_count: + progress.advance(task, skipped_count + unreadable_count) + + for filepath, file_key, is_new, file_bytes in changed_entries: + if not is_new: + self.remove_file_from_state(filepath) + self._delete_module_entities(file_key) + + changed_count += 1 + self._process_single_file( + filepath, + file_bytes=file_bytes, + pre_parsed=pre_parsed.get(filepath), + ) + + processed_since_flush += 1 + if processed_since_flush >= settings.FILE_FLUSH_INTERVAL: + logger.info(ls.PERIODIC_FLUSH.format(count=processed_since_flush)) + self.ingestor.flush_all() + processed_since_flush = 0 + + progress.update( + task, + advance=1, + description=ls.PROGRESS_FILES_PROCESSED.format(count=changed_count), + ) + + deleted_keys = set(old_hashes.keys()) - current_file_keys + if deleted_keys: + logger.info(ls.INCREMENTAL_DELETED, count=len(deleted_keys)) + for deleted_key in deleted_keys: + deleted_path = self.repo_path / deleted_key + 
self.remove_file_from_state(deleted_path) + self._delete_module_entities(deleted_key) + if isinstance(self.ingestor, QueryProtocol): + self.ingestor.execute_write( + cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: deleted_key} ) - if result: - root_node, language = result - self.ast_cache[filepath] = (root_node, language) - elif self._is_dependency_file(filepath.name, filepath): - self.factory.definition_processor.process_dependencies(filepath) + if skipped_count > 0: + logger.info(ls.INCREMENTAL_SKIPPED, count=skipped_count) + if changed_count > 0: + logger.info(ls.INCREMENTAL_CHANGED, count=changed_count) + if unreadable_count > 0: + logger.info(ls.INCREMENTAL_UNREADABLE, count=unreadable_count) + + _save_hash_cache(cache_path, new_hashes) + _save_dir_mtimes(dir_mtimes_path, self._collected_dir_mtimes) + + def _pre_parse_changed_files( + self, + changed_entries: list[tuple[Path, str, bool, bytes]], + ) -> dict[Path, tuple[Node, dict[str, list] | None]]: + result: dict[Path, tuple[Node, dict[str, list] | None]] = {} + for filepath, _file_key, _is_new, file_bytes in changed_entries: + lang_config = get_language_spec(filepath.suffix) + if not ( + lang_config + and isinstance(lang_config.language, cs.SupportedLanguage) + and lang_config.language in self.parsers + ): + continue + language = lang_config.language + parser = self.queries[language].get(cs.KEY_PARSER) + if not parser: + continue + tree = parser.parse(file_bytes) + root_node = tree.root_node + combined_query = COMBINED_FUNC_CLASS_IMPORT_QUERIES.get(language) + combined_captures: dict[str, list] | None = None + if combined_query: + cursor = QueryCursor(combined_query) + combined_captures = sorted_captures(cursor, root_node) + result[filepath] = (root_node, combined_captures) + return result + + def _process_single_file( + self, + filepath: Path, + file_bytes: bytes | None = None, + pre_parsed: tuple[Node, dict[str, list] | None] | None = None, + ) -> None: + if self._cpp_frontend_covered: + rel = 
cached_relative_path(filepath, self.repo_path).as_posix() + if rel in self._cpp_frontend_covered: + # (H) The libclang frontend already emitted this file's + # (H) definitions; keep only the generic File node. self.factory.structure_processor.process_generic_file( filepath, filepath.name ) + return + + lang_config = get_language_spec(filepath.suffix) + if ( + lang_config + and isinstance(lang_config.language, cs.SupportedLanguage) + and lang_config.language in self.parsers + ): + result = self.factory.definition_processor.process_file( + filepath, + lang_config.language, + self.queries, + self.factory.structure_processor.structural_elements, + source_bytes=file_bytes, + pre_parsed=pre_parsed, + ) + if result: + root_node, language = result + self.ast_cache[filepath] = (root_node, language) + elif self._is_dependency_file(filepath.name, filepath): + self.factory.definition_processor.process_dependencies(filepath) + + self.factory.structure_processor.process_generic_file(filepath, filepath.name) def _process_function_calls(self) -> None: + captures_cache = self.factory._func_class_captures_cache ast_cache_items = list(self.ast_cache.items()) for file_path, (root_node, language) in ast_cache_items: + self.factory.call_processor.collect_callable_field_bindings( + file_path, + root_node, + language, + self.queries, + func_class_captures_cache=captures_cache, + ) + for file_path, (root_node, language) in ast_cache_items: + if captures_cache is not None and file_path in captures_cache: + cached = captures_cache[file_path] + if not cached.get(cs.CAPTURE_CALL) and not cached.get( + cs.CAPTURE_FUNCTION + ): + continue self.factory.call_processor.process_calls_in_file( - file_path, root_node, language, self.queries + file_path, + root_node, + language, + self.queries, + func_class_captures_cache=captures_cache, ) + self.factory.call_processor.finalize_callable_param_flow() + + def _prune_orphan_nodes(self) -> None: + """Remove graph nodes whose files/folders no longer exist 
on disk.""" + if not isinstance(self.ingestor, QueryProtocol): + return + + logger.info(ls.PRUNE_START) + total_pruned = 0 + + project_prefix = self.project_name + "." + repo_abs = self.repo_path.resolve().as_posix() + prune_specs: list[tuple[str, str, str]] = [ + (cs.CYPHER_ALL_FILE_PATHS, cs.CYPHER_DELETE_FILE, "File"), + ( + cs.CYPHER_ALL_MODULE_PATHS_INTERNAL, + cs.CYPHER_DELETE_MODULE, + "Module", + ), + (cs.CYPHER_ALL_FOLDER_PATHS, cs.CYPHER_DELETE_FOLDER, "Folder"), + ] + + for query_all, delete_query, label in prune_specs: + rows = self.ingestor.fetch_all(query_all) + orphans = [] + for r in rows: + path = r.get("path") + if not isinstance(path, str) or not path: + continue + if path.startswith(cs.INLINE_MODULE_PATH_PREFIX): + continue + abs_path = r.get("absolute_path") + qn = r.get("qualified_name", "") + if isinstance(abs_path, str) and not abs_path.startswith(repo_abs): + continue + if isinstance(qn, str) and qn and not qn.startswith(project_prefix): + continue + if not (self.repo_path / path).exists(): + orphans.append(path) + + if orphans: + logger.info(ls.PRUNE_FOUND, count=len(orphans), label=label) + for orphan_path in orphans: + logger.debug(ls.PRUNE_DELETING, label=label, path=orphan_path) + self.ingestor.execute_write( + delete_query, {cs.KEY_PATH: orphan_path} + ) + total_pruned += len(orphans) + + # (H) Drop external import-target modules that no module imports anymore, + # (H) e.g. an imported name renamed/removed on an incremental rebuild. 
+ self.ingestor.execute_write(cs.CYPHER_DELETE_ORPHAN_EXTERNAL_MODULES) + + if total_pruned: + logger.info(ls.PRUNE_COMPLETE, count=total_pruned) + else: + logger.info(ls.PRUNE_SKIP) def _generate_semantic_embeddings(self) -> None: if not has_semantic_dependencies(): @@ -363,22 +1046,55 @@ def _generate_semantic_embeddings(self) -> None: return try: - from .embedder import embed_code - from .vector_store import store_embedding + from .embedder import embed_code_batch, get_embedding_cache + from .vector_store import ( + close_qdrant_client, + store_embedding_batch, + verify_stored_ids, + ) logger.info(ls.PASS_4_EMBEDDINGS) results = self.ingestor.fetch_all( - cs.CYPHER_QUERY_EMBEDDINGS, {"project_name": self.project_name + "."} + cs.CYPHER_QUERY_EMBEDDINGS, {"project_name": self.project_name} ) if not results: logger.info(ls.NO_FUNCTIONS_FOR_EMBEDDING) return - logger.info(ls.GENERATING_EMBEDDINGS.format(count=len(results))) + logger.info(ls.GENERATING_EMBEDDINGS, count=len(results)) embedded_count = 0 + expected_ids: set[int] = set() + pending: list[tuple[int, str, str]] = [] + flush_at = settings.QDRANT_BATCH_SIZE + + def flush() -> int: + nonlocal pending + if not pending: + return 0 + snippets = [item[2] for item in pending] + try: + embeddings = embed_code_batch(snippets) + except Exception as e: + logger.warning( + ls.EMBEDDING_BATCH_COMPUTE_FAILED, + count=len(pending), + error=e, + ) + pending = [] + return 0 + points: list[tuple[int, list[float], str]] = [ + (node_id, emb, qname) + for (node_id, qname, _), emb in zip(pending, embeddings) + ] + for node_id, _qname, _src in pending: + expected_ids.add(node_id) + stored = store_embedding_batch(points) + pending = [] + return stored + for row in results: parsed = self._parse_embedding_result(row) if parsed is None: @@ -391,33 +1107,62 @@ def _generate_semantic_embeddings(self) -> None: file_path = parsed.get(cs.KEY_PATH) if start_line is None or end_line is None or file_path is None: - 
logger.debug(ls.NO_SOURCE_FOR.format(name=qualified_name)) + logger.debug(ls.NO_SOURCE_FOR, name=qualified_name) + continue - elif source_code := self._extract_source_code( + if source_code := self._extract_source_code( qualified_name, file_path, start_line, end_line ): - try: - embedding = embed_code(source_code) - store_embedding(node_id, embedding, qualified_name) - embedded_count += 1 - - if embedded_count % settings.EMBEDDING_PROGRESS_INTERVAL == 0: + pending.append((node_id, qualified_name, source_code)) + if len(pending) >= flush_at: + embedded_count += flush() + if ( + embedded_count % settings.EMBEDDING_PROGRESS_INTERVAL == 0 + and embedded_count > 0 + ): logger.debug( - ls.EMBEDDING_PROGRESS.format( - done=embedded_count, total=len(results) - ) + ls.EMBEDDING_PROGRESS, + done=embedded_count, + total=len(results), ) - - except Exception as e: - logger.warning( - ls.EMBEDDING_FAILED.format(name=qualified_name, error=e) - ) else: - logger.debug(ls.NO_SOURCE_FOR.format(name=qualified_name)) - logger.info(ls.EMBEDDINGS_COMPLETE.format(count=embedded_count)) + logger.debug(ls.NO_SOURCE_FOR, name=qualified_name) + + embedded_count += flush() + + logger.info(ls.EMBEDDINGS_COMPLETE, count=embedded_count) + + self._reconcile_embeddings(expected_ids, verify_stored_ids) + + get_embedding_cache().save() + close_qdrant_client() + + except Exception as e: + logger.warning(ls.EMBEDDING_GENERATION_FAILED, error=e) + def _reconcile_embeddings( + self, + expected_ids: set[int], + verify_fn: Callable[[set[int]], set[int]], + ) -> None: + if not expected_ids: + return + try: + stored_ids = verify_fn(expected_ids) + missing = expected_ids - stored_ids + if missing: + sample = sorted(missing)[:10] + logger.warning( + ls.EMBEDDING_RECONCILE_MISSING.format( + missing=len(missing), + expected=len(expected_ids), + sample_ids=sample, + ) + ) + else: + logger.info(ls.EMBEDDING_RECONCILE_OK.format(count=len(expected_ids))) except Exception as e: - 
logger.warning(ls.EMBEDDING_GENERATION_FAILED.format(error=e)) + logger.warning(ls.EMBEDDING_RECONCILE_FAILED.format(error=e)) def _extract_source_code( self, qualified_name: str, file_path: str, start_line: int, end_line: int diff --git a/codebase_rag/language_spec.py b/codebase_rag/language_spec.py index cf550ab08..a48b6442b 100644 --- a/codebase_rag/language_spec.py +++ b/codebase_rag/language_spec.py @@ -82,6 +82,14 @@ def _rust_get_name(node: Node) -> str | None: name_node = node.child_by_field_name(cs.FIELD_NAME) if name_node and name_node.type == cs.TS_IDENTIFIER and name_node.text: return name_node.text.decode(cs.ENCODING_UTF8) + elif node.type == cs.TS_IMPL_ITEM: + # (H) An `impl Foo` block is an FQN scope, but it has no `name` field; its + # (H) target type is the segment that anchors its methods' qns + # (H) (owner_module.Foo.method). Without this the scope walk drops `Foo`, so + # (H) a closure/nested fn in an impl method binds to a phantom parent qn. + from .parsers.rs import utils as rs_utils + + return rs_utils.extract_impl_target(node) return _generic_get_name(node) @@ -97,6 +105,38 @@ def _rust_file_to_module(file_path: Path, repo_root: Path) -> list[str]: return [] +def _php_file_to_module(file_path: Path, repo_root: Path) -> list[str]: + try: + rel = file_path.relative_to(repo_root) + parts = list(rel.with_suffix("").parts) + if parts and parts[0] in ("src", "app", "lib"): + parts = parts[1:] + return parts + except ValueError: + return [] + + +def _c_unwrap_declarator(declarator: Node | None) -> Node | None: + while declarator and declarator.type == cs.CppNodeType.POINTER_DECLARATOR: + declarator = declarator.child_by_field_name(cs.FIELD_DECLARATOR) + return declarator + + +def _c_get_name(node: Node) -> str | None: + if node.type in cs.C_NAME_NODE_TYPES: + name_node = node.child_by_field_name(cs.FIELD_NAME) + if name_node and name_node.text: + return name_node.text.decode(cs.ENCODING_UTF8) + elif node.type == cs.TS_CPP_FUNCTION_DEFINITION: + 
declarator = node.child_by_field_name(cs.FIELD_DECLARATOR) + declarator = _c_unwrap_declarator(declarator) + if declarator and declarator.type == cs.TS_CPP_FUNCTION_DECLARATOR: + name_node = declarator.child_by_field_name(cs.FIELD_DECLARATOR) + if name_node and name_node.type == cs.TS_IDENTIFIER and name_node.text: + return name_node.text.decode(cs.ENCODING_UTF8) + return _generic_get_name(node) + + def _cpp_get_name(node: Node) -> str | None: if node.type in cs.CPP_NAME_NODE_TYPES: name_node = node.child_by_field_name(cs.FIELD_NAME) @@ -154,6 +194,13 @@ def _cpp_get_name(node: Node) -> str | None: file_to_module_parts=_generic_file_to_module, ) +C_FQN_SPEC = FQNSpec( + scope_node_types=frozenset(cs.FQN_C_SCOPE_TYPES), + function_node_types=frozenset(cs.FQN_C_FUNCTION_TYPES), + get_name=_c_get_name, + file_to_module_parts=_generic_file_to_module, +) + LUA_FQN_SPEC = FQNSpec( scope_node_types=frozenset(cs.FQN_LUA_SCOPE_TYPES), function_node_types=frozenset(cs.FQN_LUA_FUNCTION_TYPES), @@ -175,18 +222,11 @@ def _cpp_get_name(node: Node) -> str | None: file_to_module_parts=_generic_file_to_module, ) -CSHARP_FQN_SPEC = FQNSpec( - scope_node_types=frozenset(cs.FQN_CS_SCOPE_TYPES), - function_node_types=frozenset(cs.FQN_CS_FUNCTION_TYPES), - get_name=_generic_get_name, - file_to_module_parts=_generic_file_to_module, -) - PHP_FQN_SPEC = FQNSpec( scope_node_types=frozenset(cs.FQN_PHP_SCOPE_TYPES), function_node_types=frozenset(cs.FQN_PHP_FUNCTION_TYPES), get_name=_generic_get_name, - file_to_module_parts=_generic_file_to_module, + file_to_module_parts=_php_file_to_module, ) LANGUAGE_FQN_SPECS: dict[cs.SupportedLanguage, FQNSpec] = { @@ -195,11 +235,11 @@ def _cpp_get_name(node: Node) -> str | None: cs.SupportedLanguage.TS: TS_FQN_SPEC, cs.SupportedLanguage.RUST: RUST_FQN_SPEC, cs.SupportedLanguage.JAVA: JAVA_FQN_SPEC, + cs.SupportedLanguage.C: C_FQN_SPEC, cs.SupportedLanguage.CPP: CPP_FQN_SPEC, cs.SupportedLanguage.LUA: LUA_FQN_SPEC, cs.SupportedLanguage.GO: GO_FQN_SPEC, 
cs.SupportedLanguage.SCALA: SCALA_FQN_SPEC, - cs.SupportedLanguage.CSHARP: CSHARP_FQN_SPEC, cs.SupportedLanguage.PHP: PHP_FQN_SPEC, } @@ -285,8 +325,14 @@ def _cpp_get_name(node: Node) -> str | None: function: (scoped_identifier "::" name: (identifier) @name)) @call + (call_expression + function: (generic_function) @name) @call (macro_invocation macro: (identifier) @name) @call + (token_tree + (identifier) @name @call + . + (token_tree . "(")) """, ), cs.SupportedLanguage.GO: LanguageSpec( @@ -343,6 +389,28 @@ def _cpp_get_name(node: Node) -> str | None: type: (type_identifier) @name) @call """, ), + cs.SupportedLanguage.C: LanguageSpec( + language=cs.SupportedLanguage.C, + file_extensions=cs.C_EXTENSIONS, + function_node_types=cs.SPEC_C_FUNCTION_TYPES, + class_node_types=cs.SPEC_C_CLASS_TYPES, + module_node_types=cs.SPEC_C_MODULE_TYPES, + call_node_types=cs.SPEC_C_CALL_TYPES, + import_node_types=cs.IMPORT_NODES_INCLUDE, + import_from_node_types=cs.IMPORT_NODES_INCLUDE, + package_indicators=cs.SPEC_C_PACKAGE_INDICATORS, + function_query=""" + (function_definition) @function + """, + class_query=""" + (struct_specifier) @class + (union_specifier) @class + (enum_specifier) @class + """, + call_query=""" + (call_expression) @call + """, + ), cs.SupportedLanguage.CPP: LanguageSpec( language=cs.SupportedLanguage.CPP, file_extensions=cs.CPP_EXTENSIONS, @@ -381,16 +449,6 @@ def _cpp_get_name(node: Node) -> str | None: (delete_expression) @call """, ), - cs.SupportedLanguage.CSHARP: LanguageSpec( - language=cs.SupportedLanguage.CSHARP, - file_extensions=cs.CS_EXTENSIONS, - function_node_types=cs.SPEC_CS_FUNCTION_TYPES, - class_node_types=cs.SPEC_CS_CLASS_TYPES, - module_node_types=cs.SPEC_CS_MODULE_TYPES, - call_node_types=cs.SPEC_CS_CALL_TYPES, - import_node_types=cs.IMPORT_NODES_USING, - import_from_node_types=cs.IMPORT_NODES_USING, - ), cs.SupportedLanguage.PHP: LanguageSpec( language=cs.SupportedLanguage.PHP, file_extensions=cs.PHP_EXTENSIONS, @@ -398,6 +456,42 @@ def 
_cpp_get_name(node: Node) -> str | None: class_node_types=cs.SPEC_PHP_CLASS_TYPES, module_node_types=cs.SPEC_PHP_MODULE_TYPES, call_node_types=cs.SPEC_PHP_CALL_TYPES, + import_node_types=cs.SPEC_PHP_IMPORT_TYPES, + import_from_node_types=cs.SPEC_PHP_IMPORT_FROM_TYPES, + function_query=""" + (function_definition + name: (name) @name) @function + (method_declaration + name: (name) @name) @function + (anonymous_function) @function + (arrow_function) @function + """, + class_query=""" + (class_declaration + name: (name) @name) @class + (interface_declaration + name: (name) @name) @class + (trait_declaration + name: (name) @name) @class + (enum_declaration + name: (name) @name) @class + """, + call_query=""" + (function_call_expression + function: (name) @name) @call + (function_call_expression + function: (qualified_name) @name) @call + (member_call_expression + name: (name) @name) @call + (scoped_call_expression + name: (name) @name) @call + (nullsafe_member_call_expression + name: (name) @name) @call + (object_creation_expression + (name) @name) @call + (object_creation_expression + (qualified_name) @name) @call + """, ), cs.SupportedLanguage.LUA: LanguageSpec( language=cs.SupportedLanguage.LUA, diff --git a/codebase_rag/logs.py b/codebase_rag/logs.py index 3e075c877..9fb10305b 100644 --- a/codebase_rag/logs.py +++ b/codebase_rag/logs.py @@ -13,6 +13,17 @@ ) PASS_3_CALLS = "--- Pass 3: Processing Function Calls from AST Cache ---" PASS_4_EMBEDDINGS = "--- Pass 4: Generating semantic embeddings ---" +CPP_FRONTEND_RUNNING = "--- C/C++ libclang frontend: {path} ---" +CPP_FRONTEND_UNAVAILABLE = ( + "CPP_FRONTEND=libclang but libclang is unavailable; using tree-sitter" +) +CPP_FRONTEND_NO_COMPDB = ( + "CPP_FRONTEND=libclang but no compile_commands.json found; using tree-sitter" +) +CPP_FRONTEND_COVERED = "C/C++ libclang frontend covered {count} file(s)" +GRAPH_ALREADY_IN_SYNC = ( + "Knowledge graph already in sync (hash cache matches every file). Skipping passes." 
+) # (H) Analysis logs FOUND_FUNCTIONS = "\n--- Found {count} functions/methods in codebase ---" @@ -45,14 +56,40 @@ GENERATING_EMBEDDINGS = "Generating embeddings for {count} functions/methods" EMBEDDING_PROGRESS = "Generated {done}/{total} embeddings" EMBEDDING_FAILED = "Failed to embed {name}: {error}" +EMBEDDING_BATCH_COMPUTE_FAILED = "Failed to embed batch of {count}: {error}" +CONTEXT_TOKEN_COUNT_FAILED = "Context token count failed: {error}" NO_SOURCE_FOR = "No source code found for {name}" EMBEDDINGS_COMPLETE = "Successfully generated {count} semantic embeddings" EMBEDDING_GENERATION_FAILED = "Failed to generate semantic embeddings: {error}" EMBEDDING_STORE_FAILED = "Failed to store embedding for {name}: {error}" +EMBEDDING_STORE_RETRY = "Qdrant upsert failed (attempt {attempt}/{max_attempts}), retrying in {delay:.1f}s: {error}" +EMBEDDING_BATCH_STORED = "Stored batch of {count} embeddings in Qdrant" +EMBEDDING_BATCH_FAILED = "Failed to store embedding batch: {error}" EMBEDDING_SEARCH_FAILED = "Failed to search embeddings: {error}" - -# (H) Image logs -IMAGE_COPIED = "Copied image to temporary path: {path}" +EMBEDDING_RECONCILE_OK = "Qdrant reconciliation: all {count} expected embeddings found" +EMBEDDING_RECONCILE_MISSING = "Qdrant reconciliation: {missing} of {expected} embeddings missing (IDs: {sample_ids})" +EMBEDDING_RECONCILE_FAILED = "Qdrant reconciliation check failed: {error}" +QDRANT_DELETE_PROJECT = "Deleting {count} Qdrant vectors for project '{project}'" +QDRANT_DELETE_PROJECT_DONE = "Deleted Qdrant vectors for project '{project}'" +QDRANT_DELETE_PROJECT_FAILED = ( + "Failed to delete Qdrant vectors for project '{project}': {error}" +) +QDRANT_LOCK_ERROR = ( + "Failed to open embedded Qdrant at '{path}': {error}. The storage folder is " + "locked by another process; look for the '.lock' sentinel inside it. Embedded " + "Qdrant allows only one process at a time, so a running MCP server and a CLI " + "indexing run cannot share it. 
Set QDRANT_URL to point at a shared Qdrant " + "server for concurrent access." +) +EMBEDDING_CACHE_HIT = "Embedding cache hit for {count} snippets" +EMBEDDING_CACHE_LOADED = "Loaded embedding cache with {count} entries from {path}" +EMBEDDING_CACHE_SAVE_FAILED = "Failed to save embedding cache to {path}: {error}" +EMBEDDING_CACHE_LOAD_FAILED = "Failed to load embedding cache from {path}: {error}" + +# (H) Multimodal attachment logs +MULTIMODAL_ATTACHED = "Attached multimodal content: {path}" +MULTIMODAL_NOT_FOUND = "Multimodal path referenced but not found: {path}" +MULTIMODAL_READ_FAILED = "Failed to read multimodal file '{path}': {error}" # (H) Protobuf service logs PROTOBUF_INIT = "ProtobufFileIngestor initialized to write to: {path}" @@ -95,10 +132,30 @@ ) CGRIGNORE_READ_FAILED = "Failed to read {path}: {error}" +CGR_INSTRUCTIONS_LOADED = "Loaded project instructions from {path} ({chars} chars)" +CGR_INSTRUCTIONS_READ_FAILED = "Failed to read project instructions {path}: {error}" + # (H) File watcher logs WATCHER_ACTIVE = "File watcher is now active." +WATCHER_DEBOUNCE_ACTIVE = ( + "File watcher active with debouncing (debounce={debounce}s, max_wait={max_wait}s)" +) WATCHER_SKIP_NO_QUERY = "Ingestor does not support querying, skipping real-time update." CHANGE_DETECTED = "Change detected: {event_type} on {path}. Updating graph." +CHANGE_DEBOUNCING = ( + "Change detected: {event_type} on {name} (debouncing for {debounce}s)" +) +DEBOUNCE_RESET = "Reset debounce timer for {path}" +DEBOUNCE_MAX_WAIT = "Max wait ({max_wait}s) exceeded for {path}, processing now" +DEBOUNCE_SCHEDULED = ( + "Scheduled update for {path} in {debounce}s (max wait: {remaining}s remaining)" +) +DEBOUNCE_PROCESSING = "Processing debounced change: {path}" +DEBOUNCE_NO_EVENT = "No pending event for {path}, skipping" +DEBOUNCE_MAX_WAIT_ADJUSTED = ( + "max_wait ({max_wait}s) is less than debounce ({debounce}s). " + "Setting max_wait to debounce value." 
+) DELETION_QUERY = "Ran deletion query for path: {path}" RECALC_CALLS = "Recalculating all function call relationships for consistency..." GRAPH_UPDATED = "Graph updated successfully for change in: {name}" @@ -155,7 +212,8 @@ # (H) Memgraph logs MG_CONNECTING = "Connecting to Memgraph at {host}:{port}..." MG_CONNECTED = "Successfully connected to Memgraph." -MG_EXCEPTION = "An exception occurred: {error}. Flushing remaining items..." +MG_EXCEPTION = "An exception occurred: {error}. Attempting best-effort flush..." +MG_FLUSH_ERROR = "Failed to flush during cleanup: {error}" MG_DISCONNECTED = "\nDisconnected from Memgraph." MG_CYPHER_ERROR = "!!! Cypher Error: {error}" MG_CYPHER_QUERY = " Query: {query}" @@ -177,7 +235,9 @@ "Relationship buffer reached batch size ({size}). Performing incremental flush." ) MG_NO_CONSTRAINT = "No unique constraint defined for label '{label}'. Skipping flush." -MG_MISSING_PROP = "Skipping {label} node missing required '{key}' property: {props}" +MG_MISSING_PROP = ( + "Skipping {label} node missing required '{key}' property (keys: {prop_keys})" +) MG_NODES_FLUSHED = "Flushed {flushed} of {total} buffered nodes." MG_NODES_SKIPPED = ( "Skipped {count} buffered nodes due to missing identifiers or constraints." @@ -189,6 +249,18 @@ ) MG_FLUSH_START = "--- Flushing all pending writes to database... ---" MG_FLUSH_COMPLETE = "--- Flushing complete. ---" +MG_PARALLEL_FLUSH_NODES = ( + "Parallel flushing {count} label groups with {workers} workers" +) +MG_PARALLEL_FLUSH_RELS = ( + "Parallel flushing {count} relationship groups with {workers} workers" +) +MG_LABEL_FLUSH_ERROR = "Error flushing label group '{label}': {error}" +MG_REL_FLUSH_ERROR = "Error flushing relationship group '{pattern}': {error}" +MG_NO_CONN_NODES = "No database connection for label '{label}', skipping flush." +MG_NO_CONN_RELS = ( + "No database connection for relationship group '{pattern}', skipping flush." 
+) MG_FETCH_QUERY = "Executing fetch query: {query} with params: {params}" MG_WRITE_QUERY = "Executing write query: {query} with params: {params}" MG_EXPORTING = "Exporting graph data..." @@ -215,6 +287,13 @@ ) TOOL_QUERY_RECEIVED = "[Tool:QueryGraph] Received NL query: '{query}'" TOOL_QUERY_ERROR = "[Tool:QueryGraph] Error during query execution: {error}" +TOOL_QUERY_TIMEOUT = ( + "[Tool:QueryGraph] Query exceeded {timeout:.1f}s and was cancelled: {query}" +) +QUERY_RESULTS_TRUNCATED = ( + "[Tool:QueryGraph] Results truncated: showing {kept} of {total} rows " + "({tokens} tokens, limit {max_tokens})" +) TOOL_SHELL_EXEC = "Executing shell command: {cmd}" TOOL_SHELL_RETURN = "Return code: {code}" TOOL_SHELL_STDOUT = "Stdout: {stdout}" @@ -224,7 +303,6 @@ "Process already terminated when timeout kill was attempted." ) TOOL_SHELL_ERROR = "An error occurred while executing command: {error}" -TOOL_DOC_ANALYZE = "[DocumentAnalyzer] Analyzing '{path}' with question: '{question}'" # (H) Shell timing log SHELL_TIMING = "'{func}' executed in {time:.2f}ms" @@ -276,15 +354,6 @@ SEMANTIC_TOOL_SEARCH = "[Tool:SemanticSearch] Searching for: '{query}'" SEMANTIC_TOOL_SOURCE = "[Tool:GetFunctionSource] Retrieving source for node ID: {id}" -# (H) Document analyzer logs -DOC_COPIED = "Copied external file to: {path}" -DOC_SUCCESS = "Successfully received analysis for '{path}'." -DOC_NO_TEXT = "No text found in response: {response}" -DOC_API_ERROR = "Google GenAI API error for '{path}': {error}" -DOC_FAILED = "Failed to analyze document '{path}': {error}" -DOC_RESULT = "[analyze_document] Result type: {type}, content: {preview}..." 
-DOC_EXCEPTION = "[analyze_document] Exception during analysis: {error}" - # (H) Code retrieval logs CODE_RETRIEVER_INIT = "CodeRetriever initialized with root: {root}" CODE_RETRIEVER_SEARCH = "[CodeRetriever] Searching for: {name}" @@ -295,14 +364,12 @@ FILE_EDITOR_INIT = "FileEditor initialized with root: {root}" FILE_READER_INIT = "FileReader initialized with root: {root}" SHELL_COMMANDER_INIT = "ShellCommander initialized with root: {root}" -DOC_ANALYZER_INIT = "DocumentAnalyzer initialized with root: {root}" # (H) Tool error logs FILE_EDITOR_WARN = "[FileEditor] {msg}" FILE_EDITOR_ERR = "[FileEditor] {msg}" FILE_EDITOR_ERR_EDIT = "[FileEditor] Error editing file {path}: {error}" FILE_READER_ERR = "Error reading file {path}: {error}" -DOC_ANALYZER_API_ERR = "[DocumentAnalyzer] API validation error: {error}" # (H) File writer logs FILE_WRITER_INIT = "FileWriter initialized with root: {root}" @@ -312,18 +379,20 @@ # (H) Error logs (used with logger.error/warning) UNEXPECTED = "An unexpected error occurred: {error}" EXPORT_ERROR = "Export error: {error}" +STATS_ERROR = "Stats error: {error}" +DEADCODE_SCANNING = "Scanning project '{project_name}' for dead code" +DEADCODE_ERROR = "Dead code scan error: {error}" INDEXING_FAILED = "Indexing failed" PATH_NOT_IN_QUESTION = ( - "Could not find original path in question for replacement: {path}" + "Could not locate path token in user message for attachment: {path}" ) -IMAGE_NOT_FOUND = "Image path found, but does not exist: {path}" -IMAGE_COPY_FAILED = "Failed to copy image to temporary directory: {error}" FILE_OUTSIDE_ROOT = "Security risk: Attempted to {action} file outside of project root." 
# (H) Call processor logs CALL_PROCESSING_FILE = "Processing calls in cached AST for: {path}" CALL_PROCESSING_FAILED = "Failed to process calls in {path}: {error}" CALL_FOUND_NODES = "Found {count} call nodes in {language} for {caller}" +CALL_SKIP_CLASS = "Skipping CALLS edge from {caller} to {call_name} (callee is Class node: {callee_qn})" CALL_FOUND = ( "Found call from {caller} to {call_name} (resolved as {callee_type}:{callee_qn})" ) @@ -593,6 +662,14 @@ MCP_ERROR_WRITE = "[MCP] Error writing file: {error}" MCP_LIST_DIR = "[MCP] list_directory: {path}" MCP_ERROR_LIST_DIR = "[MCP] Error listing directory: {error}" +MCP_SEMANTIC_NOT_AVAILABLE = ( + "[MCP] Semantic search not available. Install with: uv sync --extra semantic" +) +MCP_UPDATING_REPO = "[MCP] Updating repository at: {path}" +MCP_ERROR_UPDATING = "[MCP] Error updating repository: {error}" +MCP_SEMANTIC_SEARCH = "[MCP] semantic_search: {query}" +MCP_ASK_AGENT = "[MCP] ask_agent: {question}" +MCP_ASK_AGENT_ERROR = "[MCP] Error running ask_agent: {error}" # (H) MCP server logs MCP_SERVER_INFERRED_ROOT = "[GraphCode MCP] Using inferred project root: {path}" @@ -612,6 +689,35 @@ MCP_SERVER_CONNECTED = "[GraphCode MCP] Connected to Memgraph at {host}:{port}" MCP_SERVER_FATAL_ERROR = "[GraphCode MCP] Fatal error: {error}" MCP_SERVER_SHUTDOWN = "[GraphCode MCP] Shutting down server..." +MCP_HTTP_SERVER_STARTING = "[GraphCode MCP] Starting HTTP server on {host}:{port}..." +MCP_HTTP_SERVER_READY = ( + "[GraphCode MCP] HTTP server ready. 
MCP endpoint: http://{host}:{port}/mcp" +) + +# (H) Incremental update logs +HASH_CACHE_LOADED = "Loaded hash cache with {count} entries from {path}" +HASH_CACHE_LOAD_FAILED = "Failed to load hash cache from {path}: {error}" +HASH_CACHE_SAVED = "Saved hash cache with {count} entries to {path}" +HASH_CACHE_SAVE_FAILED = "Failed to save hash cache to {path}: {error}" +PERIODIC_FLUSH = "Periodic flush after {count} files processed" +INCREMENTAL_SKIPPED = "Skipped {count} unchanged files" +INCREMENTAL_CHANGED = "Re-indexing {count} changed files" +INCREMENTAL_DELETED = "Removed state for {count} deleted files" +INCREMENTAL_FORCE = "Force mode enabled, bypassing hash cache" + +# (H) Orphan pruning logs +PRUNE_START = "--- Pruning orphan nodes from graph ---" +PRUNE_FOUND = "Found {count} orphan {label} nodes to remove" +PRUNE_DELETING = "Pruning orphan {label}: {path}" +PRUNE_COMPLETE = "Pruning complete. Removed {count} orphan nodes." +PRUNE_SKIP = "No orphan nodes found. Graph is clean." +FILE_HASH_UNCHANGED = "File unchanged (hash match): {path}" +FILE_HASH_CHANGED = "File changed (hash mismatch): {path}" +FILE_HASH_NEW = "New file detected: {path}" +FILE_UNREADABLE = ( + "Skipping unreadable file (broken symlink or removed): {path} ({error})" +) +INCREMENTAL_UNREADABLE = "Skipped {count} unreadable files (broken symlinks or removed)" # (H) Exclude prompt logs EXCLUDE_INVALID_INDEX = "Invalid index: {index} (out of range)" @@ -621,3 +727,7 @@ MODEL_SWITCHED = "Model switched to: {model}" MODEL_SWITCH_FAILED = "Failed to switch model: {error}" MODEL_CURRENT = "Current model: {model}" + +# (H) Progress bar logs +PROGRESS_INDEXING_LABEL = "[bold blue]Indexing files..." 
+PROGRESS_FILES_PROCESSED = "{count} processed" diff --git a/codebase_rag/main.py b/codebase_rag/main.py index af58a84a4..9bb1ef791 100644 --- a/codebase_rag/main.py +++ b/codebase_rag/main.py @@ -3,26 +3,44 @@ import asyncio import difflib import json +import mimetypes import os import shlex import shutil +import subprocess import sys import uuid from collections import deque -from collections.abc import Coroutine +from collections.abc import Callable, Coroutine +from contextlib import contextmanager from dataclasses import replace +from html import escape as html_escape from pathlib import Path from typing import TYPE_CHECKING from loguru import logger -from prompt_toolkit import prompt +from prompt_toolkit import PromptSession, prompt from prompt_toolkit.formatted_text import HTML from prompt_toolkit.key_binding import KeyBindings from prompt_toolkit.shortcuts import print_formatted_text -from pydantic_ai import DeferredToolRequests, DeferredToolResults, ToolDenied -from rich.markdown import Markdown +from pydantic_ai import ( + BinaryContent, + DeferredToolRequests, + DeferredToolResults, + ToolDenied, +) +from pydantic_ai.messages import ( + ModelRequest, + ModelResponse, + ToolCallPart, + ToolReturnPart, + UserContent, +) +from rich.console import Group +from rich.live import Live from rich.panel import Panel -from rich.prompt import Confirm, Prompt +from rich.prompt import Prompt +from rich.spinner import Spinner from rich.table import Table from rich.text import Text @@ -39,7 +57,6 @@ from .tools.code_retrieval import CodeRetriever, create_code_retrieval_tool from .tools.codebase_query import create_query_tool from .tools.directory_lister import DirectoryLister, create_directory_lister_tool -from .tools.document_analyzer import DocumentAnalyzer, create_document_analyzer_tool from .tools.file_editor import FileEditor, create_file_editor_tool from .tools.file_reader import FileReader, create_file_reader_tool from .tools.file_writer import FileWriter, 
create_file_writer_tool @@ -57,11 +74,13 @@ ConfirmationToolNames, CreateFileArgs, GraphData, + QueryJsonOutput, RawToolArgs, ReplaceCodeArgs, ShellCommandArgs, ToolArgs, ) +from .utils.rich_markdown import LeftAlignedMarkdown if TYPE_CHECKING: from prompt_toolkit.key_binding import KeyPressEvent @@ -109,6 +128,50 @@ def get_session_context() -> str: return "" +def _autowrap_diff_blocks(text: str) -> str: + if cs.DIFF_GIT_HEADER not in text: + return text + lines = text.split("\n") + out: list[str] = [] + in_fence = False + in_diff = False + + def is_diff_continuation(line: str) -> bool: + if line == "": + return True + return line.startswith(cs.DIFF_CONTINUATION_PREFIXES) + + for line in lines: + if line.startswith(cs.MARKDOWN_FENCE): + if in_diff: + out.append(cs.MARKDOWN_FENCE) + in_diff = False + in_fence = not in_fence + out.append(line) + continue + if in_fence: + out.append(line) + continue + if not in_diff and line.startswith(cs.DIFF_GIT_HEADER): + out.append(cs.MARKDOWN_FENCE_DIFF) + in_diff = True + out.append(line) + continue + if in_diff: + if is_diff_continuation(line): + out.append(line) + else: + out.append(cs.MARKDOWN_FENCE) + in_diff = False + out.append(line) + continue + out.append(line) + + if in_diff: + out.append(cs.MARKDOWN_FENCE) + return "\n".join(out) + + def _print_unified_diff(target: str, replacement: str, path: str) -> None: separator = dim(cs.HORIZONTAL_SEPARATOR) app_context.console.print(f"\n{cs.UI_DIFF_FILE_HEADER.format(path=path)}") @@ -216,7 +279,7 @@ def _display_tool_call_diff( ) -def _process_tool_approvals( +async def _process_tool_approvals( requests: DeferredToolRequests, approval_prompt: str, denial_default: str, @@ -228,30 +291,102 @@ def _process_tool_approvals( tool_args = _to_tool_args( call.tool_name, RawToolArgs(**call.args_as_dict()), tool_names ) - app_context.console.print( - f"\n{cs.UI_TOOL_APPROVAL.format(tool_name=call.tool_name)}" + will_prompt = ( + app_context.session.confirm_edits and not 
app_context.session.is_yolo() ) + + if will_prompt: + app_context.console.print( + f"\n{cs.UI_TOOL_APPROVAL.format(tool_name=call.tool_name)}" + ) _display_tool_call_diff(call.tool_name, tool_args, tool_names) - if app_context.session.confirm_edits: - if Confirm.ask(style(approval_prompt, cs.Color.CYAN)): - deferred_results.approvals[call.tool_call_id] = True - else: - feedback = Prompt.ask( - cs.UI_FEEDBACK_PROMPT, - default="", - ) - denial_msg = feedback.strip() or denial_default - deferred_results.approvals[call.tool_call_id] = ToolDenied(denial_msg) - else: + if not will_prompt: + deferred_results.approvals[call.tool_call_id] = True + continue + + if await _confirm_with_toggle(approval_prompt): + deferred_results.approvals[call.tool_call_id] = True + elif app_context.session.is_yolo(): deferred_results.approvals[call.tool_call_id] = True + else: + feedback = await _prompt_with_toggle(cs.UI_FEEDBACK_PROMPT) + denial_msg = feedback.strip() or denial_default + deferred_results.approvals[call.tool_call_id] = ToolDenied(denial_msg) return deferred_results +def _approval_keybindings() -> KeyBindings: + bindings = KeyBindings() + + @bindings.add(cs.KeyBinding.SHIFT_TAB) + def _toggle(event: KeyPressEvent) -> None: + app_context.session.cycle_permission_mode() + if app_context.session.is_yolo(): + event.app.exit(result=cs.YES_ANSWER) + else: + event.app.invalidate() + + @bindings.add(cs.KeyBinding.CTRL_C) + def _interrupt(event: KeyPressEvent) -> None: + event.app.exit(exception=KeyboardInterrupt) + + return bindings + + +async def _confirm_with_toggle(question: str) -> bool: + bindings = _approval_keybindings() + prompt_text = HTML( + f' [y/n] (Y): ' + ) + session: PromptSession[str] = PromptSession() + while True: + try: + answer = await session.prompt_async( + prompt_text, + key_bindings=bindings, + style=ORANGE_STYLE, + bottom_toolbar=lambda: _status_bar_label(), + refresh_interval=0.5, + ) + except (KeyboardInterrupt, EOFError): + return False + if 
app_context.session.is_yolo(): + return True + normalized = (answer or "").strip().lower() + if normalized in cs.YES_ANSWERS: + return True + if normalized in cs.NO_ANSWERS: + return False + + +async def _prompt_with_toggle(question: str) -> str: + bindings = _approval_keybindings() + prompt_text = HTML( + f': ' + ) + session: PromptSession[str] = PromptSession() + try: + answer = await session.prompt_async( + prompt_text, + key_bindings=bindings, + style=ORANGE_STYLE, + bottom_toolbar=lambda: _status_bar_label(), + refresh_interval=0.5, + ) + except (KeyboardInterrupt, EOFError): + return "" + return answer or "" + + +def _rich_log_sink(message: object) -> None: + app_context.console.print(str(message), end="", markup=False, highlight=False) + + def _setup_common_initialization(repo_path: str) -> Path: logger.remove() - logger.add(sys.stdout, format=cs.LOG_FORMAT) + logger.add(_rich_log_sink, format=cs.LOG_FORMAT, colorize=False) project_root = Path(repo_path).resolve() tmp_dir = project_root / cs.TMP_DIR @@ -262,6 +397,7 @@ def _setup_common_initialization(repo_path: str) -> Path: tmp_dir.unlink() tmp_dir.mkdir() + app_context.session.target_repo = project_root return project_root @@ -385,46 +521,75 @@ async def run_with_cancellation[T]( return CancelledResult(cancelled=True) +def _cancel_orphaned_tool_calls(message_history: list[ModelMessage]) -> None: + if not message_history: + return + last = message_history[-1] + if not isinstance(last, ModelResponse): + return + tool_calls = [p for p in last.parts if isinstance(p, ToolCallPart)] + if not tool_calls: + return + message_history.append( + ModelRequest( + parts=[ + ToolReturnPart( + tool_name=p.tool_name, + content=cs.MSG_TOOL_CALL_CANCELLED, + tool_call_id=p.tool_call_id, + ) + for p in tool_calls + ] + ) + ) + + async def _run_agent_response_loop( rag_agent: Agent[None, str | DeferredToolRequests], message_history: list[ModelMessage], - question_with_context: str, + question_with_context: str | 
list[UserContent], config: AgentLoopUI, tool_names: ConfirmationToolNames, model_override: Model | None = None, ) -> None: deferred_results: DeferredToolResults | None = None + pending_prompt: str | list[UserContent] | None = question_with_context while True: - with app_context.console.status(config.status_message): + with _thinking_with_status_bar(config.status_message): response = await run_with_cancellation( rag_agent.run( - question_with_context, + pending_prompt, message_history=message_history, deferred_tool_results=deferred_results, model=model_override, ), ) + pending_prompt = None if isinstance(response, CancelledResult): log_session_event(config.cancelled_log) app_context.session.cancelled = True + _cancel_orphaned_tool_calls(message_history) break + message_history.extend(response.new_messages()) + if isinstance(response.output, DeferredToolRequests): - deferred_results = _process_tool_approvals( + deferred_results = await _process_tool_approvals( response.output, config.approval_prompt, config.denial_default, tool_names, ) - message_history.extend(response.new_messages()) continue + asyncio.create_task(_refresh_context_tokens(list(message_history))) + output_text = response.output if not isinstance(output_text, str): continue - markdown_response = Markdown(output_text) + markdown_response = LeftAlignedMarkdown(_autowrap_diff_blocks(output_text)) app_context.console.print( Panel( markdown_response, @@ -434,35 +599,29 @@ async def _run_agent_response_loop( ) log_session_event(f"{cs.SESSION_PREFIX_ASSISTANT}{output_text}") - message_history.extend(response.new_messages()) break -def _find_image_paths(question: str) -> list[Path]: +def _find_multimodal_paths(question: str) -> list[Path]: try: if os.name == "nt": - # (H) On Windows, shlex.split with posix=False to preserve backslashes tokens = shlex.split(question, posix=False) else: tokens = shlex.split(question) except ValueError: tokens = question.split() - image_paths: list[Path] = [] + paths: list[Path] 
= [] for token in tokens: - # (H) Strip quotes if they remain (shlex with posix=False might keep some) token = token.strip("'\"") - # (H) Check if it looks like an image path - if token.lower().endswith(cs.IMAGE_EXTENSIONS): - # (H) On Windows, could be C:\... or \... - # (H) On POSIX, starts with / + if token.lower().endswith(cs.MULTIMODAL_EXTENSIONS): p = Path(token) if p.is_absolute() or token.startswith("/") or token.startswith("\\"): - image_paths.append(p) - return image_paths + paths.append(p) + return paths -def _get_path_variants(path_str: str) -> tuple[str, ...]: +def _path_variants(path_str: str) -> tuple[str, ...]: return ( path_str.replace(" ", r"\ "), f"'{path_str}'", @@ -471,40 +630,417 @@ def _get_path_variants(path_str: str) -> tuple[str, ...]: ) -def _replace_path_in_question(question: str, old_path: str, new_path: str) -> str: - for variant in _get_path_variants(old_path): - if variant in question: - return question.replace(variant, new_path) - logger.warning(ls.PATH_NOT_IN_QUESTION.format(path=old_path)) - return question +def _guess_media_type(path: Path) -> str: + mime, _ = mimetypes.guess_type(str(path)) + return mime or cs.MIME_TYPE_FALLBACK -def _handle_chat_images(question: str, project_root: Path) -> str: - image_files = _find_image_paths(question) - if not image_files: +def _build_user_prompt(question: str) -> str | list[UserContent]: + paths = _find_multimodal_paths(question) + if not paths: return question - tmp_dir = project_root / cs.TMP_DIR - tmp_dir.mkdir(exist_ok=True) - updated_question = question - - for original_path in image_files: - if not original_path.exists() or not original_path.is_file(): - logger.warning(ls.IMAGE_NOT_FOUND.format(path=original_path)) + content: list[UserContent] = [] + remaining = question + for path in paths: + if not path.exists() or not path.is_file(): + logger.warning(ls.MULTIMODAL_NOT_FOUND.format(path=path)) continue - + match_token = next( + (v for v in _path_variants(str(path)) if v in 
remaining), None + ) + if match_token is None: + logger.warning(ls.PATH_NOT_IN_QUESTION.format(path=path)) + continue + before, _, after = remaining.partition(match_token) + if before.strip(): + content.append(before.rstrip()) try: - new_path = tmp_dir / f"{uuid.uuid4()}-{original_path.name}" - shutil.copy(original_path, new_path) - new_relative = str(new_path.relative_to(project_root)) - updated_question = _replace_path_in_question( - updated_question, str(original_path), new_relative + content.append( + BinaryContent( + data=path.read_bytes(), media_type=_guess_media_type(path) + ) ) - logger.info(ls.IMAGE_COPIED.format(path=new_relative)) + logger.info(ls.MULTIMODAL_ATTACHED.format(path=path)) except Exception as e: - logger.error(ls.IMAGE_COPY_FAILED.format(error=e)) + logger.error(ls.MULTIMODAL_READ_FAILED.format(path=path, error=e)) + content.append(match_token) + remaining = after + + if remaining.strip(): + content.append(remaining.lstrip()) + + return content or question + + +def _permission_mode_label() -> str: + return ( + cs.PERMISSION_MODE_YOLO_LABEL + if app_context.session.is_yolo() + else cs.PERMISSION_MODE_NORMAL_LABEL + ) + + +def _git_state() -> tuple[str, bool] | None: + repo = app_context.session.target_repo + if repo is None or not repo.exists(): + return None + try: + result = subprocess.run( + ["git", "status", "--porcelain", "--branch"], + capture_output=True, + text=True, + timeout=1.0, + check=True, + cwd=repo, + ) + except (subprocess.SubprocessError, FileNotFoundError): + return None + lines = result.stdout.splitlines() + if not lines or not lines[0].startswith("## "): + return None + header = lines[0][3:].split("...", 1)[0].split(" ", 1)[0] + if header in ("HEAD", "No"): + return None + is_dirty = any(line for line in lines[1:]) + return header, is_dirty + + +def _terminal_columns() -> int: + return shutil.get_terminal_size((80, 24)).columns + + +def _format_tokens(n: int) -> str: + if n >= 1_000_000: + return f"{n / 1_000_000:.1f}M" + 
if n >= 1_000: + return f"{n / 1_000:.1f}k" + return str(n) + + +def _token_color(pct: float) -> str: + if pct >= cs.TOKEN_THRESHOLD_CRITICAL: + return cs.TOKEN_COLOR_CRITICAL + if pct >= cs.TOKEN_THRESHOLD_WARNING: + return cs.TOKEN_COLOR_WARNING + return cs.TOKEN_COLOR_OK + - return updated_question +def _token_usage() -> tuple[int, int, float]: + try: + used = int(app_context.session.context_tokens) + except (TypeError, ValueError): + used = 0 + try: + model_id = settings.active_orchestrator_config.model_id or "" + except Exception: + model_id = "" + bare = model_id.split(":", 1)[-1] + max_ctx = cs.MODEL_CONTEXT_WINDOWS.get(bare, cs.DEFAULT_CONTEXT_WINDOW) + pct = (used / max_ctx * 100) if max_ctx > 0 else 0.0 + return used, max_ctx, pct + + +async def _refresh_context_tokens(messages: list[ModelMessage]) -> None: + try: + config = settings.active_orchestrator_config + except Exception: + return + if config.provider != cs.Provider.ANTHROPIC or not config.api_key: + return + try: + from .services.anthropic_token_counter import count_anthropic_context + + count = await count_anthropic_context(config.api_key, config.model_id, messages) + app_context.session.context_tokens = count + except Exception as e: + logger.debug(ls.CONTEXT_TOKEN_COUNT_FAILED.format(error=e)) + + +def _prime_context_token_counter(system_prompt: str) -> None: + if not system_prompt: + return + from pydantic_ai.messages import ModelRequest, SystemPromptPart + + baseline_messages: list[ModelMessage] = [ + ModelRequest(parts=[SystemPromptPart(content=system_prompt)]) + ] + asyncio.create_task(_refresh_context_tokens(baseline_messages)) + + +def _short_model_id() -> tuple[str, str]: + try: + orch = settings.active_orchestrator_config.model_id or "" + except Exception: + orch = "" + try: + cyph = settings.active_cypher_config.model_id or "" + except Exception: + cyph = "" + return orch.split(":", 1)[-1], cyph.split(":", 1)[-1] + + +def _abbreviated_repo(p: Path | None) -> str: + if p is None: + 
return "" + try: + home = Path.home() + return f"~/{p.relative_to(home)}" if p.is_relative_to(home) else str(p) + except (ValueError, OSError): + return str(p) + + +def _config_segments() -> list[tuple[str, str]]: + orch, cyph = _short_model_id() + segments: list[tuple[str, str]] = [] + if orch: + segments.append((cs.STATUS_BAR_CONFIG_LABEL_O, orch)) + if cyph: + segments.append((cs.STATUS_BAR_CONFIG_LABEL_C, cyph)) + segments.append( + ( + cs.STATUS_BAR_CONFIG_LABEL_EDIT, + cs.STATUS_BAR_EDIT_ON + if app_context.session.confirm_edits + else cs.STATUS_BAR_EDIT_OFF, + ) + ) + segments.append( + ( + cs.STATUS_BAR_CONFIG_LABEL_INSTRUCTIONS, + cs.STATUS_BAR_EDIT_ON + if app_context.session.load_cgr_instructions + else cs.STATUS_BAR_EDIT_OFF, + ) + ) + repo = _abbreviated_repo(app_context.session.target_repo) + if repo: + segments.append((cs.STATUS_BAR_CONFIG_LABEL_REPO, repo)) + return segments + + +def _config_status_html() -> str: + parts = [ + f'' + f'' + for label, value in _config_segments() + ] + return cs.STATUS_BAR_CONFIG_SEPARATOR.join(parts) + + +def _config_status_plain() -> str: + parts = [f"{label}:{value}" for label, value in _config_segments()] + return cs.STATUS_BAR_CONFIG_SEPARATOR.join(parts) + + +def _config_status_rich() -> Text: + line = Text() + segments = _config_segments() + for i, (label, value) in enumerate(segments): + if i > 0: + line.append(cs.STATUS_BAR_CONFIG_SEPARATOR, style="dim") + line.append(f"{label}:", style=f"bold {cs.STATUS_BAR_CONFIG_LABEL_COLOR}") + line.append(value, style=cs.STATUS_BAR_CONFIG_COLOR) + return line + + +def _branch_chip_html_and_plain(state: tuple[str, bool] | None) -> tuple[str, str]: + if state is None: + return "", "" + branch, is_dirty = state + html_template = ( + cs.STATUS_BAR_BRANCH_DIRTY_HTML if is_dirty else cs.STATUS_BAR_BRANCH_CLEAN_HTML + ) + plain_template = ( + cs.STATUS_BAR_BRANCH_DIRTY_PLAIN + if is_dirty + else cs.STATUS_BAR_BRANCH_CLEAN_PLAIN + ) + return ( + 
html_template.format(branch=html_escape(branch)), + plain_template.format(branch=branch), + ) + + +def _branch_chip_rich(state: tuple[str, bool] | None) -> Text: + if state is None: + return Text() + branch, is_dirty = state + marker = cs.STATUS_BAR_DIRTY_MARKER if is_dirty else "" + chip_style = cs.STATUS_BAR_DIRTY_STYLE if is_dirty else cs.STATUS_BAR_CLEAN_STYLE + chip = Text() + chip.append( + cs.STATUS_BAR_BRANCH_RICH_TEXT.format(branch=branch, marker=marker), + style=chip_style, + ) + return chip + + +def _status_bar_label() -> HTML | str: + mode = _permission_mode_label() + state = _git_state() + columns = _terminal_columns() + sep_html = ( + f'" + ) + + used, max_ctx, pct = _token_usage() + used_str = _format_tokens(used) + max_str = _format_tokens(max_ctx) + pct_str = f"{pct:.1f}%" + token_html = cs.STATUS_BAR_TOKEN_HTML.format( + color=_token_color(pct), + used=used_str, + max_ctx=max_str, + pct=pct_str, + ) + token_plain = f" {used_str} / {max_str} ({pct_str})" + body_html = html_escape(mode) + token_html + body_plain = mode + token_plain + + config_html = _config_status_html() + config_plain = _config_status_plain() + branch_html, branch_plain = _branch_chip_html_and_plain(state) + + config_with_branch_html = config_html + config_with_branch_plain = config_plain + if branch_html: + if config_html: + config_with_branch_html = f"{config_html} {branch_html}" + config_with_branch_plain = f"{config_plain} {branch_plain}" + else: + config_with_branch_html = branch_html + config_with_branch_plain = branch_plain + + if not config_with_branch_plain: + return HTML(f"{sep_html}\n{body_html}") + inline_sep = " " + if len(body_plain) + len(inline_sep) + len(config_with_branch_plain) <= columns: + return HTML(f"{sep_html}\n{body_html}{inline_sep}{config_with_branch_html}") + return HTML(f"{sep_html}\n{config_with_branch_html}\n{body_html}") + + +def _rich_status_bar() -> Text: + body = Text() + body.append(_permission_mode_label(), style="dim") + used, max_ctx, pct = 
_token_usage() + body.append(" ") + body.append( + f"{_format_tokens(used)} / {_format_tokens(max_ctx)} ({pct:.1f}%)", + style=_token_color(pct), + ) + + config_line = _config_status_rich() + branch_chip = _branch_chip_rich(_git_state()) + if config_line.plain and branch_chip.plain: + config_line.append(" ") + config_line.append_text(branch_chip) + elif branch_chip.plain: + config_line = branch_chip + + if not config_line.plain: + return body + + inline_sep = " " + if ( + len(body.plain) + len(inline_sep) + len(config_line.plain) + <= _terminal_columns() + ): + body.append(inline_sep) + body.append_text(config_line) + return body + return Text("\n").join([config_line, body]) + + +@contextmanager +def _shift_tab_listener(): + if sys.platform == "win32" or not sys.stdin.isatty(): + yield + return + try: + import termios + except ImportError: + yield + return + fd = sys.stdin.fileno() + try: + original = termios.tcgetattr(fd) + except (termios.error, OSError): + yield + return + try: + new_attrs = termios.tcgetattr(fd) + new_attrs[3] &= ~(termios.ICANON | termios.ECHO) + new_attrs[6][termios.VMIN] = 0 + new_attrs[6][termios.VTIME] = 0 + termios.tcsetattr(fd, termios.TCSANOW, new_attrs) + loop = asyncio.get_running_loop() + buffer = bytearray() + + def on_input() -> None: + try: + data = os.read(fd, 1024) + except OSError: + return + if not data: + return + buffer.extend(data) + while cs.SHIFT_TAB_ESCAPE in buffer: + idx = buffer.index(cs.SHIFT_TAB_ESCAPE) + del buffer[idx : idx + len(cs.SHIFT_TAB_ESCAPE)] + app_context.session.cycle_permission_mode() + + loop.add_reader(fd, on_input) + try: + yield + finally: + try: + loop.remove_reader(fd) + except Exception: + pass + finally: + try: + termios.tcsetattr(fd, termios.TCSADRAIN, original) + except (termios.error, OSError): + pass + + +@contextmanager +def _thinking_with_status_bar(message: str): + spinner = Spinner(cs.STATUS_BAR_SPINNER, text=Text.from_markup(message)) + separator = Text( + cs.STATUS_BAR_SEPARATOR_CHAR 
* _terminal_columns(), + style=cs.STATUS_BAR_SEPARATOR_COLOR, + ) + + def render() -> Group: + return Group(separator, spinner, _rich_status_bar()) + + with ( + Live( + render(), + console=app_context.console, + refresh_per_second=4, + transient=True, + ) as live, + _shift_tab_listener(), + ): + + async def _refresh_bar() -> None: + while True: + try: + live.update(render()) + await asyncio.sleep(0.25) + except asyncio.CancelledError: + return + + refresh_task = asyncio.get_running_loop().create_task(_refresh_bar()) + try: + yield live + finally: + refresh_task.cancel() def get_multiline_input(prompt_text: str = cs.PROMPT_ASK_QUESTION) -> str: @@ -514,6 +1050,10 @@ def get_multiline_input(prompt_text: str = cs.PROMPT_ASK_QUESTION) -> str: def submit(event: KeyPressEvent) -> None: event.app.exit(result=event.app.current_buffer.text) + @bindings.add(cs.KeyBinding.CTRL_E) + def submit_ctrl_e(event: KeyPressEvent) -> None: + event.app.exit(result=event.app.current_buffer.text) + @bindings.add(cs.KeyBinding.ENTER) def new_line(event: KeyPressEvent) -> None: event.current_buffer.insert_text("\n") @@ -522,6 +1062,11 @@ def new_line(event: KeyPressEvent) -> None: def keyboard_interrupt(event: KeyPressEvent) -> None: event.app.exit(exception=KeyboardInterrupt) + @bindings.add(cs.KeyBinding.SHIFT_TAB) + def toggle_permission_mode(event: KeyPressEvent) -> None: + app_context.session.cycle_permission_mode() + event.app.invalidate() + clean_prompt = Text.from_markup(prompt_text).plain print_formatted_text( @@ -538,6 +1083,8 @@ def keyboard_interrupt(event: KeyPressEvent) -> None: key_bindings=bindings, wrap_lines=True, style=ORANGE_STYLE, + bottom_toolbar=lambda: _status_bar_label(), + refresh_interval=0.5, ) if result is None: raise EOFError @@ -664,19 +1211,17 @@ async def _run_interactive_loop( log_session_event(f"{cs.SESSION_PREFIX_USER}{question}") if app_context.session.cancelled: - question_with_context = question + get_session_context() + question_text = question + 
get_session_context() app_context.session.reset_cancelled() else: - question_with_context = question + question_text = question - question_with_context = _handle_chat_images( - question_with_context, project_root - ) + user_prompt: str | list[UserContent] = _build_user_prompt(question_text) await _run_agent_response_loop( rag_agent, message_history, - question_with_context, + user_prompt, config, tool_names, model_override, @@ -752,6 +1297,8 @@ def connect_memgraph(batch_size: int) -> MemgraphIngestor: host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT, batch_size=batch_size, + username=settings.MEMGRAPH_USERNAME, + password=settings.MEMGRAPH_PASSWORD, ) @@ -969,8 +1516,10 @@ def _validate_provider_config(role: cs.ModelRole, config: ModelConfig) -> None: def _initialize_services_and_agent( - repo_path: str, ingestor: QueryProtocol -) -> tuple[Agent[None, str | DeferredToolRequests], ConfirmationToolNames]: + repo_path: str, + ingestor: QueryProtocol, + active_projects: list[str] | None = None, +) -> tuple[Agent[None, str | DeferredToolRequests], ConfirmationToolNames, str]: _validate_provider_config( cs.ModelRole.ORCHESTRATOR, settings.active_orchestrator_config ) @@ -982,10 +1531,11 @@ def _initialize_services_and_agent( file_writer = FileWriter(project_root=repo_path) file_editor = FileEditor(project_root=repo_path) shell_commander = ShellCommander( - project_root=repo_path, timeout=settings.SHELL_COMMAND_TIMEOUT + project_root=repo_path, + timeout=settings.SHELL_COMMAND_TIMEOUT, + is_yolo=app_context.session.is_yolo, ) directory_lister = DirectoryLister(project_root=repo_path) - document_analyzer = DocumentAnalyzer(project_root=repo_path) query_tool = create_query_tool(ingestor, cypher_generator, app_context.console) code_tool = create_code_retrieval_tool(code_retriever) @@ -994,7 +1544,6 @@ def _initialize_services_and_agent( file_editor_tool = create_file_editor_tool(file_editor) shell_command_tool = create_shell_command_tool(shell_commander) 
directory_lister_tool = create_directory_lister_tool(directory_lister) - document_analyzer_tool = create_document_analyzer_tool(document_analyzer) semantic_search_tool = create_semantic_search_tool() function_source_tool = create_get_function_source_tool() @@ -1004,7 +1553,7 @@ def _initialize_services_and_agent( shell_command=shell_command_tool.name, ) - rag_agent = create_rag_orchestrator( + rag_agent, system_prompt = create_rag_orchestrator( tools=[ query_tool, code_tool, @@ -1013,21 +1562,55 @@ def _initialize_services_and_agent( file_editor_tool, shell_command_tool, directory_lister_tool, - document_analyzer_tool, semantic_search_tool, function_source_tool, - ] + ], + project_root=Path(repo_path), + load_instructions=app_context.session.load_cgr_instructions, + active_projects=active_projects, ) - return rag_agent, confirmation_tool_names + return rag_agent, confirmation_tool_names, system_prompt -async def main_async(repo_path: str, batch_size: int) -> None: +def main_single_query( + repo_path: str, + batch_size: int, + question: str, + active_projects: list[str] | None = None, + output_format: cs.QueryFormat = cs.QueryFormat.TABLE, +) -> None: + _setup_common_initialization(repo_path) + # (H) Override logger to stderr so stdout is clean for scripted output + logger.remove() + logger.add(sys.stderr, level=cs.LOG_LEVEL_ERROR, format=cs.LOG_FORMAT) + + with connect_memgraph(batch_size) as ingestor: + rag_agent, _, _ = _initialize_services_and_agent( + repo_path, ingestor, active_projects=active_projects + ) + response = asyncio.run(rag_agent.run(question, message_history=[])) + if output_format == cs.QueryFormat.JSON: + payload = QueryJsonOutput(query=question, response=str(response.output)) + print(json.dumps(payload, ensure_ascii=False)) # noqa: T201 + else: + print(response.output) # noqa: T201 + + +async def main_async( + repo_path: str, + batch_size: int, + active_projects: list[str] | None = None, + show_config_table: bool = True, + pre_chat_sync: 
Callable[[], None] | None = None, + pre_chat_sync_message: str = cs.MSG_SYNCING_KNOWLEDGE_GRAPH, +) -> None: project_root = _setup_common_initialization(repo_path) - table = _create_configuration_table(repo_path) - app_context.console.print(table) + if show_config_table: + table = _create_configuration_table(repo_path) + app_context.console.print(table) - with connect_memgraph(batch_size) as ingestor: + async with connect_memgraph(batch_size) as ingestor: app_context.console.print(style(cs.MSG_CONNECTED_MEMGRAPH, cs.Color.GREEN)) app_context.console.print( Panel( @@ -1036,10 +1619,26 @@ async def main_async(repo_path: str, batch_size: int) -> None: ) ) - rag_agent, tool_names = _initialize_services_and_agent(repo_path, ingestor) + rag_agent, tool_names, system_prompt = _initialize_services_and_agent( + repo_path, ingestor, active_projects=active_projects + ) + _prime_context_token_counter(system_prompt) + + if pre_chat_sync is not None: + await _run_pre_chat_sync(pre_chat_sync, pre_chat_sync_message) + await run_chat_loop(rag_agent, [], project_root, tool_names) +async def _run_pre_chat_sync(task: Callable[[], None], message: str) -> None: + logger.disable("codebase_rag") + try: + with _thinking_with_status_bar(message): + await asyncio.to_thread(task) + finally: + logger.enable("codebase_rag") + + async def main_optimize_async( language: str, target_repo_path: str, @@ -1063,12 +1662,13 @@ async def main_optimize_async( effective_batch_size = settings.resolve_batch_size(batch_size) - with connect_memgraph(effective_batch_size) as ingestor: + async with connect_memgraph(effective_batch_size) as ingestor: app_context.console.print(style(cs.MSG_CONNECTED_MEMGRAPH, cs.Color.GREEN)) - rag_agent, tool_names = _initialize_services_and_agent( + rag_agent, tool_names, system_prompt = _initialize_services_and_agent( target_repo_path, ingestor ) + _prime_context_token_counter(system_prompt) await run_optimization_loop( rag_agent, [], project_root, language, tool_names, 
reference_document ) diff --git a/codebase_rag/mcp/__init__.py b/codebase_rag/mcp/__init__.py index 77c80d78a..f3a26b0b7 100644 --- a/codebase_rag/mcp/__init__.py +++ b/codebase_rag/mcp/__init__.py @@ -1 +1,2 @@ -from codebase_rag.mcp.server import main as main +from codebase_rag.mcp.server import serve_http as serve_http +from codebase_rag.mcp.server import serve_stdio as serve_stdio diff --git a/codebase_rag/mcp/client.py b/codebase_rag/mcp/client.py new file mode 100644 index 000000000..b6abb205d --- /dev/null +++ b/codebase_rag/mcp/client.py @@ -0,0 +1,65 @@ +import asyncio +import io +import json +import os +import sys + +import typer +from mcp import ClientSession +from mcp.client.stdio import StdioServerParameters, stdio_client + +from codebase_rag import constants as cs + +app = typer.Typer() + + +async def _query_with_errlog(question: str, errlog: io.TextIOWrapper) -> dict[str, str]: + server_params = StdioServerParameters( + command=sys.executable, + args=["-m", "codebase_rag.cli", "mcp-server"], + ) + + async with stdio_client(server=server_params, errlog=errlog) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + result = await session.call_tool( + cs.MCPToolName.ASK_AGENT, + {cs.MCPParamName.QUESTION: question}, + ) + + if result.content: + response_text = result.content[0].text + try: + parsed = json.loads(response_text) + if isinstance(parsed, dict): + return parsed + return {"output": str(parsed)} + except json.JSONDecodeError: + return {"output": response_text} + return {"output": "No response from server"} + + +def query_mcp_server(question: str) -> dict[str, str]: + with open(os.devnull, "w") as devnull: # noqa: SIM115 + return asyncio.run(_query_with_errlog(question, devnull)) + + +@app.command() +def main( + question: str = typer.Option( + ..., "--ask-agent", "-a", help="Question to ask about the codebase" + ), +) -> None: + try: + result = query_mcp_server(question) + if isinstance(result, dict) 
and "output" in result: + print(result["output"]) # noqa: T201 + else: + print(json.dumps(result)) # noqa: T201 + except Exception as e: + print(f"Error: {e}", file=sys.stderr) # noqa: T201 + sys.exit(1) + + +if __name__ == "__main__": + app() diff --git a/codebase_rag/mcp/server.py b/codebase_rag/mcp/server.py index 9218a2d93..6f59e4b67 100644 --- a/codebase_rag/mcp/server.py +++ b/codebase_rag/mcp/server.py @@ -1,6 +1,8 @@ +import contextlib import json import os import sys +from collections.abc import Iterator from pathlib import Path from loguru import logger @@ -16,6 +18,7 @@ from codebase_rag.services.graph_service import MemgraphIngestor from codebase_rag.services.llm import CypherGenerator from codebase_rag.types_defs import MCPToolArguments +from codebase_rag.vector_store import close_qdrant_client def setup_logging() -> None: @@ -71,6 +74,8 @@ def create_server() -> tuple[Server, MemgraphIngestor]: host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT, batch_size=settings.MEMGRAPH_BATCH_SIZE, + username=settings.MEMGRAPH_USERNAME, + password=settings.MEMGRAPH_PASSWORD, ) cypher_generator = CypherGenerator() @@ -135,18 +140,33 @@ async def call_tool(name: str, arguments: MCPToolArguments) -> list[TextContent] return server, ingestor -async def main() -> None: +@contextlib.contextmanager +def _service_lifecycle(ingestor: MemgraphIngestor) -> Iterator[None]: + """Manage shared service lifetimes for the MCP server. + + Opens the Memgraph ingestor connection and guarantees the embedded Qdrant + client lock is released on shutdown, so a CLI indexing run can reuse the + storage folder once the server stops. 
+ """ + try: + with ingestor: + logger.info( + lg.MCP_SERVER_CONNECTED.format( + host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT + ) + ) + yield + finally: + close_qdrant_client() + + +async def serve_stdio() -> None: logger.info(lg.MCP_SERVER_STARTING) server, ingestor = create_server() logger.info(lg.MCP_SERVER_CREATED) - with ingestor: - logger.info( - lg.MCP_SERVER_CONNECTED.format( - host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT - ) - ) + with _service_lifecycle(ingestor): try: async with stdio_server() as (read_stream, write_stream): await server.run( @@ -159,7 +179,45 @@ async def main() -> None: logger.info(lg.MCP_SERVER_SHUTDOWN) +async def serve_http( + host: str = settings.MCP_HTTP_HOST, + port: int = settings.MCP_HTTP_PORT, +) -> None: + import uvicorn + from mcp.server.streamable_http_manager import StreamableHTTPSessionManager + from starlette.applications import Starlette + from starlette.routing import Mount + + logger.info(lg.MCP_HTTP_SERVER_STARTING.format(host=host, port=port)) + + server, ingestor = create_server() + + session_manager = StreamableHTTPSessionManager( + app=server, + json_response=False, + stateless=False, + ) + + @contextlib.asynccontextmanager + async def lifespan(app: Starlette): + with _service_lifecycle(ingestor): + async with session_manager.run(): + logger.info(lg.MCP_HTTP_SERVER_READY.format(host=host, port=port)) + yield + + starlette_app = Starlette( + routes=[ + Mount(settings.MCP_HTTP_ENDPOINT_PATH, app=session_manager.handle_request), + ], + lifespan=lifespan, + ) + + config = uvicorn.Config(starlette_app, host=host, port=port, log_level="info") + uvicorn_server = uvicorn.Server(config) + await uvicorn_server.serve() + + if __name__ == "__main__": import asyncio - asyncio.run(main()) + asyncio.run(serve_stdio()) diff --git a/codebase_rag/mcp/tools.py b/codebase_rag/mcp/tools.py index 5d1d2f7f5..51ffb7923 100644 --- a/codebase_rag/mcp/tools.py +++ b/codebase_rag/mcp/tools.py @@ -1,7 +1,11 @@ +import 
asyncio import itertools +import sys from pathlib import Path from loguru import logger +from pydantic_ai import Agent +from rich.console import Console from codebase_rag import constants as cs from codebase_rag import logs as lg @@ -10,9 +14,12 @@ from codebase_rag.models import ToolMetadata from codebase_rag.parser_loader import load_parsers from codebase_rag.services.graph_service import MemgraphIngestor -from codebase_rag.services.llm import CypherGenerator +from codebase_rag.services.llm import CypherGenerator, create_rag_orchestrator from codebase_rag.tools import tool_descriptions as td -from codebase_rag.tools.code_retrieval import CodeRetriever, create_code_retrieval_tool +from codebase_rag.tools.code_retrieval import ( + CodeRetriever, + create_code_retrieval_tool, +) from codebase_rag.tools.codebase_query import create_query_tool from codebase_rag.tools.directory_lister import ( DirectoryLister, @@ -21,6 +28,7 @@ from codebase_rag.tools.file_editor import FileEditor, create_file_editor_tool from codebase_rag.tools.file_reader import FileReader, create_file_reader_tool from codebase_rag.tools.file_writer import FileWriter, create_file_writer_tool +from codebase_rag.tools.shell_command import ShellCommander, create_shell_command_tool from codebase_rag.types_defs import ( CodeSnippetResultDict, DeleteProjectErrorResult, @@ -35,6 +43,8 @@ MCPToolSchema, QueryResultDict, ) +from codebase_rag.utils.dependencies import has_semantic_dependencies +from codebase_rag.vector_store import delete_project_embeddings class MCPToolsRegistry: @@ -47,6 +57,7 @@ def __init__( self.project_root = project_root self.ingestor = ingestor self.cypher_gen = cypher_gen + self._ingestor_lock = asyncio.Lock() self.parsers, self.queries = load_parsers() @@ -55,9 +66,11 @@ def __init__( self.file_reader = FileReader(project_root=project_root) self.file_writer = FileWriter(project_root=project_root) self.directory_lister = DirectoryLister(project_root=project_root) + 
self.shell_commander = ShellCommander(project_root=project_root) + stderr_console = Console(file=sys.stderr, width=None, force_terminal=True) self._query_tool = create_query_tool( - ingestor=ingestor, cypher_gen=cypher_gen, console=None + ingestor=ingestor, cypher_gen=cypher_gen, console=stderr_console ) self._code_tool = create_code_retrieval_tool(code_retriever=self.code_retriever) self._file_editor_tool = create_file_editor_tool(file_editor=self.file_editor) @@ -66,6 +79,24 @@ def __init__( self._directory_lister_tool = create_directory_lister_tool( directory_lister=self.directory_lister ) + self._shell_command_tool = create_shell_command_tool( + shell_commander=self.shell_commander + ) + + self._rag_agent: Agent | None = None + + self._semantic_search_tool = None + self._semantic_search_available = False + + if has_semantic_dependencies(): + from codebase_rag.tools.semantic_search import ( + create_semantic_search_tool, + ) + + self._semantic_search_tool = create_semantic_search_tool() + self._semantic_search_available = True + else: + logger.info(lg.MCP_SEMANTIC_NOT_AVAILABLE) self._tools: dict[str, ToolMetadata] = { cs.MCPToolName.LIST_PROJECTS: ToolMetadata( @@ -122,6 +153,17 @@ def __init__( handler=self.index_repository, returns_json=False, ), + cs.MCPToolName.UPDATE_REPOSITORY: ToolMetadata( + name=cs.MCPToolName.UPDATE_REPOSITORY, + description=td.MCP_TOOLS[cs.MCPToolName.UPDATE_REPOSITORY], + input_schema=MCPInputSchema( + type=cs.MCPSchemaType.OBJECT, + properties={}, + required=[], + ), + handler=self.update_repository, + returns_json=False, + ), cs.MCPToolName.QUERY_CODE_GRAPH: ToolMetadata( name=cs.MCPToolName.QUERY_CODE_GRAPH, description=td.MCP_TOOLS[cs.MCPToolName.QUERY_CODE_GRAPH], @@ -247,33 +289,122 @@ def __init__( returns_json=False, ), } + if self._semantic_search_available: + self._tools[cs.MCPToolName.SEMANTIC_SEARCH] = ToolMetadata( + name=cs.MCPToolName.SEMANTIC_SEARCH, + description=td.MCP_TOOLS[cs.MCPToolName.SEMANTIC_SEARCH], + 
input_schema=MCPInputSchema( + type=cs.MCPSchemaType.OBJECT, + properties={ + cs.MCPParamName.NATURAL_LANGUAGE_QUERY: MCPInputSchemaProperty( + type=cs.MCPSchemaType.STRING, + description=td.MCP_PARAM_NATURAL_LANGUAGE_QUERY, + ), + cs.MCPParamName.TOP_K: MCPInputSchemaProperty( + type=cs.MCPSchemaType.INTEGER, + description=td.MCP_PARAM_TOP_K, + default=5, + ), + }, + required=[cs.MCPParamName.NATURAL_LANGUAGE_QUERY], + ), + handler=self.semantic_search, + returns_json=False, + ) + + self._tools[cs.MCPToolName.ASK_AGENT] = ToolMetadata( + name=cs.MCPToolName.ASK_AGENT, + description=td.MCP_TOOLS[cs.MCPToolName.ASK_AGENT], + input_schema=MCPInputSchema( + type=cs.MCPSchemaType.OBJECT, + properties={ + cs.MCPParamName.QUESTION: MCPInputSchemaProperty( + type=cs.MCPSchemaType.STRING, + description=td.MCP_PARAM_QUESTION, + ) + }, + required=[cs.MCPParamName.QUESTION], + ), + handler=self.ask_agent, + returns_json=True, + ) + + @property + def rag_agent(self) -> Agent: + if self._rag_agent is None: + from codebase_rag.tools.semantic_search import ( + create_get_function_source_tool, + ) + + tools = [ + self._query_tool, + self._code_tool, + self._file_reader_tool, + self._file_writer_tool, + self._file_editor_tool, + self._shell_command_tool, + self._directory_lister_tool, + create_get_function_source_tool(), + ] + if self._semantic_search_tool is not None: + tools.append(self._semantic_search_tool) + self._rag_agent, _ = create_rag_orchestrator( + tools=tools, project_root=Path(self.project_root) + ) + return self._rag_agent + + # (H) Setter allows tests to inject a mock agent without triggering LLM init + @rag_agent.setter + def rag_agent(self, value: Agent) -> None: + self._rag_agent = value async def list_projects(self) -> ListProjectsResult: logger.info(lg.MCP_LISTING_PROJECTS) try: - projects = self.ingestor.list_projects() + projects = await asyncio.to_thread(self.ingestor.list_projects) return ListProjectsSuccessResult(projects=projects, count=len(projects)) 
except Exception as e: logger.error(lg.MCP_ERROR_LIST_PROJECTS.format(error=e)) return ListProjectsErrorResult(error=str(e), projects=[], count=0) + def _get_project_node_ids(self, project_name: str) -> list[int]: + rows = self.ingestor.fetch_all( + cs.CYPHER_QUERY_PROJECT_NODE_IDS, + {cs.KEY_PROJECT_NAME: project_name}, + ) + result: list[int] = [] + for row in rows: + node_id = row.get(cs.KEY_NODE_ID) + if isinstance(node_id, int): + result.append(node_id) + return result + + def _cleanup_project_embeddings(self, project_name: str) -> None: + node_ids = self._get_project_node_ids(project_name) + delete_project_embeddings(project_name, node_ids) + + def _delete_project_sync(self, project_name: str) -> DeleteProjectResult: + projects = self.ingestor.list_projects() + if project_name not in projects: + return DeleteProjectErrorResult( + success=False, + error=te.MCP_PROJECT_NOT_FOUND.format( + project_name=project_name, projects=projects + ), + ) + self._cleanup_project_embeddings(project_name) + self.ingestor.delete_project(project_name) + return DeleteProjectSuccessResult( + success=True, + project=project_name, + message=cs.MCP_PROJECT_DELETED.format(project_name=project_name), + ) + async def delete_project(self, project_name: str) -> DeleteProjectResult: logger.info(lg.MCP_DELETING_PROJECT.format(project_name=project_name)) try: - projects = self.ingestor.list_projects() - if project_name not in projects: - return DeleteProjectErrorResult( - success=False, - error=te.MCP_PROJECT_NOT_FOUND.format( - project_name=project_name, projects=projects - ), - ) - self.ingestor.delete_project(project_name) - return DeleteProjectSuccessResult( - success=True, - project=project_name, - message=cs.MCP_PROJECT_DELETED.format(project_name=project_name), - ) + async with self._ingestor_lock: + return await asyncio.to_thread(self._delete_project_sync, project_name) except Exception as e: logger.error(lg.MCP_ERROR_DELETE_PROJECT.format(error=e)) return 
DeleteProjectErrorResult(success=False, error=str(e)) @@ -283,34 +414,88 @@ async def wipe_database(self, confirm: bool) -> str: return cs.MCP_WIPE_CANCELLED logger.warning(lg.MCP_WIPING_DATABASE) try: - self.ingestor.clean_database() + async with self._ingestor_lock: + await asyncio.to_thread(self.ingestor.clean_database) return cs.MCP_WIPE_SUCCESS except Exception as e: logger.error(lg.MCP_ERROR_WIPE.format(error=e)) return cs.MCP_WIPE_ERROR.format(error=e) + def _index_repository_sync(self) -> str: + project_name = Path(self.project_root).resolve().name + logger.info(lg.MCP_CLEARING_PROJECT.format(project_name=project_name)) + self._cleanup_project_embeddings(project_name) + self.ingestor.delete_project(project_name) + + self.ingestor.ensure_constraints() + self.ingestor.flush_all() + + updater = GraphUpdater( + ingestor=self.ingestor, + repo_path=Path(self.project_root), + parsers=self.parsers, + queries=self.queries, + project_name=project_name, + ) + updater.run() + self.ingestor.flush_all() + + return cs.MCP_INDEX_SUCCESS_PROJECT.format( + path=self.project_root, project_name=project_name + ) + async def index_repository(self) -> str: logger.info(lg.MCP_INDEXING_REPO.format(path=self.project_root)) - project_name = Path(self.project_root).resolve().name try: - logger.info(lg.MCP_CLEARING_PROJECT.format(project_name=project_name)) - self.ingestor.delete_project(project_name) - - updater = GraphUpdater( - ingestor=self.ingestor, - repo_path=Path(self.project_root), - parsers=self.parsers, - queries=self.queries, - ) - updater.run() - - return cs.MCP_INDEX_SUCCESS_PROJECT.format( - path=self.project_root, project_name=project_name - ) + async with self._ingestor_lock: + return await asyncio.to_thread(self._index_repository_sync) except Exception as e: logger.error(lg.MCP_ERROR_INDEXING.format(error=e)) return cs.MCP_INDEX_ERROR.format(error=e) + def _update_repository_sync(self) -> str: + project_name = Path(self.project_root).resolve().name + + 
self.ingestor.ensure_constraints() + self.ingestor.flush_all() + + updater = GraphUpdater( + ingestor=self.ingestor, + repo_path=Path(self.project_root), + parsers=self.parsers, + queries=self.queries, + project_name=project_name, + ) + updater.run() + self.ingestor.flush_all() + return cs.MCP_UPDATE_SUCCESS.format(path=self.project_root) + + async def update_repository(self) -> str: + logger.info(lg.MCP_UPDATING_REPO.format(path=self.project_root)) + try: + async with self._ingestor_lock: + return await asyncio.to_thread(self._update_repository_sync) + except Exception as e: + logger.error(lg.MCP_ERROR_UPDATING.format(error=e)) + return cs.MCP_UPDATE_ERROR.format(error=e) + + async def semantic_search(self, natural_language_query: str, top_k: int = 5) -> str: + assert self._semantic_search_tool is not None + logger.info(lg.MCP_SEMANTIC_SEARCH.format(query=natural_language_query)) + result = await self._semantic_search_tool.function( + query=natural_language_query, top_k=top_k + ) + return str(result) + + async def ask_agent(self, question: str) -> dict[str, str]: + logger.info(lg.MCP_ASK_AGENT.format(question=question)) + try: + response = await self.rag_agent.run(question, message_history=[]) + return {"output": str(response.output)} + except Exception as e: + logger.error(lg.MCP_ASK_AGENT_ERROR.format(error=e)) + return {"error": cs.MCP_ASK_AGENT_ERROR.format(error=e)} + async def query_code_graph(self, natural_language_query: str) -> QueryResultDict: logger.info(lg.MCP_QUERY_CODE_GRAPH.format(query=natural_language_query)) try: diff --git a/codebase_rag/models.py b/codebase_rag/models.py index e189dbde0..763371a16 100644 --- a/codebase_rag/models.py +++ b/codebase_rag/models.py @@ -5,7 +5,7 @@ from rich.console import Console -from .constants import SupportedLanguage +from .constants import PermissionMode, SupportedLanguage from .types_defs import MCPHandlerType, MCPInputSchema, PropertyValue if TYPE_CHECKING: @@ -15,12 +15,27 @@ @dataclass class SessionState: 
confirm_edits: bool = True + load_cgr_instructions: bool = True log_file: Path | None = None cancelled: bool = False + permission_mode: PermissionMode = PermissionMode.NORMAL + context_tokens: int = 0 + target_repo: Path | None = None def reset_cancelled(self) -> None: self.cancelled = False + def is_yolo(self) -> bool: + return self.permission_mode == PermissionMode.YOLO + + def cycle_permission_mode(self) -> PermissionMode: + self.permission_mode = ( + PermissionMode.YOLO + if self.permission_mode == PermissionMode.NORMAL + else PermissionMode.NORMAL + ) + return self.permission_mode + def _default_console() -> Console: return Console(width=None, force_terminal=True) diff --git a/codebase_rag/parser_loader.py b/codebase_rag/parser_loader.py index 69ddabda3..6e79353ce 100644 --- a/codebase_rag/parser_loader.py +++ b/codebase_rag/parser_loader.py @@ -33,7 +33,7 @@ def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader: setup_py_path = submodule_path / cs.SETUP_PY if setup_py_path.exists(): - logger.debug(ls.BUILDING_BINDINGS.format(lang=lang_name)) + logger.debug(ls.BUILDING_BINDINGS, lang=lang_name) result = subprocess.run( [sys.executable, cs.SETUP_PY, cs.BUILD_EXT_CMD, cs.INPLACE_FLAG], check=False, @@ -44,14 +44,15 @@ def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader: if result.returncode != 0: logger.debug( - ls.BUILD_FAILED.format( - lang=lang_name, stdout=result.stdout, stderr=result.stderr - ) + ls.BUILD_FAILED, + lang=lang_name, + stdout=result.stdout, + stderr=result.stderr, ) return None - logger.debug(ls.BUILD_SUCCESS.format(lang=lang_name)) + logger.debug(ls.BUILD_SUCCESS, lang=lang_name) - logger.debug(ls.IMPORTING_MODULE.format(module=module_name)) + logger.debug(ls.IMPORTING_MODULE, module=module_name) module = importlib.import_module(module_name) language_attrs: list[str] = [ @@ -63,21 +64,19 @@ def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader: for attr_name in 
language_attrs: if hasattr(module, attr_name): logger.debug( - ls.LOADED_FROM_SUBMODULE.format(lang=lang_name, attr=attr_name) + ls.LOADED_FROM_SUBMODULE, lang=lang_name, attr=attr_name ) loader: LanguageLoader = getattr(module, attr_name) return loader - logger.debug( - ls.NO_LANG_ATTR.format(module=module_name, available=dir(module)) - ) + logger.debug(ls.NO_LANG_ATTR, module=module_name, available=dir(module)) finally: if python_bindings_str in sys.path: sys.path.remove(python_bindings_str) except Exception as e: - logger.debug(ls.SUBMODULE_LOAD_FAILED.format(lang=lang_name, error=e)) + logger.debug(ls.SUBMODULE_LOAD_FAILED, lang=lang_name, error=e) return None @@ -137,6 +136,12 @@ def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]: cs.QUERY_LANGUAGE, cs.SupportedLanguage.JAVA, ), + LanguageImport( + cs.SupportedLanguage.C, + cs.TreeSitterModule.C, + cs.QUERY_LANGUAGE, + cs.SupportedLanguage.C, + ), LanguageImport( cs.SupportedLanguage.CPP, cs.TreeSitterModule.CPP, @@ -149,6 +154,12 @@ def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]: cs.QUERY_LANGUAGE, cs.SupportedLanguage.LUA, ), + LanguageImport( + cs.SupportedLanguage.PHP, + cs.TreeSitterModule.PHP, + cs.LANG_ATTR_PHP, + cs.SupportedLanguage.PHP, + ), ] loaders: dict[cs.SupportedLanguage, LanguageLoader] = { @@ -215,10 +226,14 @@ def _create_locals_query( try: return Query(language, locals_pattern) except Exception as e: - logger.debug(ls.LOCALS_QUERY_FAILED.format(lang=lang_name, error=e)) + logger.debug(ls.LOCALS_QUERY_FAILED, lang=lang_name, error=e) return None +COMBINED_FUNC_CLASS_QUERIES: dict[cs.SupportedLanguage, Query | None] = {} +COMBINED_FUNC_CLASS_IMPORT_QUERIES: dict[cs.SupportedLanguage, Query | None] = {} + + def _create_language_queries( language: Language, parser: Parser, @@ -236,6 +251,22 @@ def _create_language_queries( ) combined_import_patterns = _build_combined_import_pattern(lang_config) + combined_fc_pattern = 
f"{function_patterns} {class_patterns}".strip() + try: + COMBINED_FUNC_CLASS_QUERIES[lang_name] = ( + Query(language, combined_fc_pattern) if combined_fc_pattern else None + ) + except Exception: + COMBINED_FUNC_CLASS_QUERIES[lang_name] = None + + combined_fci_pattern = f"{function_patterns} {class_patterns} {combined_import_patterns} {call_patterns}".strip() + try: + COMBINED_FUNC_CLASS_IMPORT_QUERIES[lang_name] = ( + Query(language, combined_fci_pattern) if combined_fci_pattern else None + ) + except Exception: + COMBINED_FUNC_CLASS_IMPORT_QUERIES[lang_name] = None + return LanguageQueries( functions=_create_optional_query(language, function_patterns), classes=_create_optional_query(language, class_patterns), @@ -256,7 +287,7 @@ def _process_language( ) -> bool: lang_lib = LANGUAGE_LIBRARIES.get(lang_name) if not lang_lib: - logger.debug(ls.LIB_NOT_AVAILABLE.format(lang=lang_name)) + logger.debug(ls.LIB_NOT_AVAILABLE, lang=lang_name) return False try: diff --git a/codebase_rag/parsers/call_processor.py b/codebase_rag/parsers/call_processor.py index 0e53cbe73..3d459c2a4 100644 --- a/codebase_rag/parsers/call_processor.py +++ b/codebase_rag/parsers/call_processor.py @@ -1,6 +1,9 @@ from __future__ import annotations +from bisect import bisect_left, bisect_right +from collections import defaultdict from pathlib import Path +from typing import NamedTuple from loguru import logger from tree_sitter import Node, QueryCursor @@ -8,16 +11,61 @@ from .. import constants as cs from .. 
import logs as ls from ..language_spec import LanguageSpec +from ..parser_loader import COMBINED_FUNC_CLASS_QUERIES from ..services import IngestorProtocol from ..types_defs import FunctionRegistryTrieProtocol, LanguageQueries +from ..utils.path_utils import cached_relative_path from .call_resolver import CallResolver from .cpp import utils as cpp_utils from .import_processor import ImportProcessor from .type_inference import TypeInferenceEngine -from .utils import get_function_captures, is_method_node +from .utils import ( + get_function_captures, + is_method_node, + python_parameter_names, + safe_decode_text, + sorted_captures, +) + + +class _CallableFlowArg(NamedTuple): + # (H) One call-site argument that may carry a callable: bound either to a concrete + # (H) function (source_concrete) or to a parameter of the caller (source_caller + + # (H) source_param), keyed to the callee parameter by position or keyword. + callee_qn: str + position: int + keyword: str + source_concrete: str + source_caller: str + source_param: str + + +_TYPED_LANGUAGES = frozenset( + { + cs.SupportedLanguage.PYTHON, + cs.SupportedLanguage.JS, + cs.SupportedLanguage.TS, + cs.SupportedLanguage.JAVA, + cs.SupportedLanguage.LUA, + } +) + +# (H) C and C++ share the function_definition/declarator shape, so the callee +# (H) name lives in a nested declarator (no `name` field). Both need the libclang +# (H) declarator-aware extractor rather than a plain child_by_field_name("name"). 
+_C_FAMILY_LANGUAGES = frozenset({cs.SupportedLanguage.C, cs.SupportedLanguage.CPP}) class CallProcessor: + __slots__ = ( + "ingestor", + "repo_path", + "project_name", + "_resolver", + "_flow_param_names", + "_flow_args", + ) + def __init__( self, ingestor: IngestorProtocol, @@ -38,6 +86,10 @@ def __init__( type_inference=type_inference, class_inheritance=class_inheritance, ) + # (H) Inter-procedural callable-parameter flow: ordered params per function and + # (H) the per-call-site argument bindings, resolved to a fixpoint in finalize. + self._flow_param_names: dict[str, list[str]] = {} + self._flow_args: list[_CallableFlowArg] = [] def _get_node_name(self, node: Node, field: str = cs.FIELD_NAME) -> str | None: name_node = node.child_by_field_name(field) @@ -46,31 +98,299 @@ def _get_node_name(self, node: Node, field: str = cs.FIELD_NAME) -> str | None: text = name_node.text return None if text is None else text.decode(cs.ENCODING_UTF8) + def _collect_all_call_nodes( + self, + root_node: Node, + language: cs.SupportedLanguage, + queries: dict[cs.SupportedLanguage, LanguageQueries], + ) -> tuple[list[Node], list[int]]: + calls_query = queries[language].get(cs.QUERY_CALLS) + if not calls_query: + return [], [] + cursor = QueryCursor(calls_query) + captures = sorted_captures(cursor, root_node) + call_nodes = captures.get(cs.CAPTURE_CALL, []) + call_starts = [n.start_byte for n in call_nodes] + return call_nodes, call_starts + + def _filter_calls_in_node( + self, + all_call_nodes: list[Node], + call_starts: list[int], + container: Node, + ) -> list[Node]: + start = container.start_byte + end = container.end_byte + lo = bisect_left(call_starts, start) + hi = bisect_right(call_starts, end) + return [n for n in all_call_nodes[lo:hi] if n.end_byte <= end] + + def _filter_top_level_calls( + self, + all_call_nodes: list[Node], + call_starts: list[int], + func_nodes: list[Node], + ) -> list[Node]: + # (H) Calls inside a function's BODY belong to that function, not the + # 
(H) module; only genuine top-level calls are module-attributed. The body + # (H) (not the whole node) is the boundary so def-time calls in the + # (H) signature -- default args like `def f(x=make_default())` and + # (H) decorators -- run at module load and stay module-attributed. A node + # (H) with no body is not a real function scope (e.g. a file-scope + # (H) declaration `int x = top();` that the grammar captures as a + # (H) function); its calls run at load time, so it excludes nothing. + nested_starts: set[int] = set() + for func_node in func_nodes: + body = func_node.child_by_field_name(cs.FIELD_BODY) + if body is None: + continue + for call in self._filter_calls_in_node(all_call_nodes, call_starts, body): + nested_starts.add(call.start_byte) + return [c for c in all_call_nodes if c.start_byte not in nested_starts] + + def _bare_decorator_name(self, decorator_node: Node) -> str | None: + # (H) A bare decorator `@task` / `@pkg.deco` (no call parens) is not a + # (H) `call` node, so the normal call pass misses it even though applying + # (H) it runs `task(func)` at module load. A call decorator `@deco(...)` + # (H) IS a call node and is already captured, so skip it here. + named = decorator_node.named_children + if not named: + return None + expr = named[0] + if expr.type in (cs.TS_IDENTIFIER, cs.TS_ATTRIBUTE) and expr.text is not None: + return expr.text.decode(cs.ENCODING_UTF8) + return None + + def _runs_at_module_load(self, node: Node) -> bool: + # (H) A definition runs at module load only when it is at module or + # (H) class-body scope; nested inside a function body it runs at that + # (H) function's call time, so its decorator is not a module-load call. 
+ ancestor = node.parent + while ancestor is not None: + if ancestor.type == cs.TS_PY_FUNCTION_DEFINITION: + return False + ancestor = ancestor.parent + return True + + def _ingest_decorator_calls( + self, + nodes: list[Node], + module_qn: str, + root_node: Node, + lang_config: LanguageSpec, + ) -> None: + # (H) Emit `(Module)->decorator` CALLS for bare decorators on functions, + # (H) methods, AND classes: the decoration executes at module-load time, + # (H) so the module is the caller. Only first-party callables get an edge. + resolver = self._resolver + ensure_rel = self.ingestor.ensure_relationship_batch + qn_key = cs.KEY_QUALIFIED_NAME + module_spec = (cs.NodeLabel.MODULE, qn_key, module_qn) + callable_labels = (cs.NodeLabel.FUNCTION, cs.NodeLabel.METHOD) + alias_map: dict[str, str] | None = None + for node in nodes: + parent = node.parent + if parent is None or parent.type != cs.TS_PY_DECORATED_DEFINITION: + continue + if not self._runs_at_module_load(parent): + continue + for child in parent.children: + if child.type != cs.TS_PY_DECORATOR: + continue + name = self._bare_decorator_name(child) + if not name: + continue + callee = resolver.resolve_function_call(name, module_qn) + if not callee and cs.SEPARATOR_DOT not in name: + # (H) `@alias` where `alias = task` still calls task at load; + # (H) reuse the local-alias fallback the call pass uses. 
+ if alias_map is None: + alias_map = self._build_local_alias_map( + root_node, lang_config, module_qn + ) + if (rhs := alias_map.get(name)) is not None: + callee = resolver.resolve_function_call(rhs, module_qn) + if callee and callee[0] in callable_labels: + ensure_rel( + module_spec, + cs.RelationshipType.CALLS, + (callee[0], qn_key, callee[1]), + ) + + def _module_qn(self, relative_path: Path, file_name: str) -> str: + if file_name in (cs.INIT_PY, cs.MOD_RS): + return cs.SEPARATOR_DOT.join( + [self.project_name] + list(relative_path.parent.parts) + ) + return cs.SEPARATOR_DOT.join( + [self.project_name] + list(relative_path.with_suffix("").parts) + ) + + def collect_callable_field_bindings( + self, + file_path: Path, + root_node: Node, + language: cs.SupportedLanguage, + queries: dict[cs.SupportedLanguage, LanguageQueries], + func_class_captures_cache: dict[Path, dict] | None = None, + ) -> None: + # (H) Pre-pass: record which functions are bound to a class's callable + # (H) fields (FQNSpec(get_name=_python_get_name, ...)). Runs before call + # (H) resolution so a field invocation can resolve regardless of which + # (H) file the construction site lives in. Keyword bindings only; + # (H) positional callable args would need declared field order. 
+ if language != cs.SupportedLanguage.PYTHON: + return + try: + module_qn = self._module_qn( + cached_relative_path(file_path, self.repo_path), file_path.name + ) + if ( + func_class_captures_cache is not None + and file_path in func_class_captures_cache + ): + call_nodes = func_class_captures_cache[file_path].get(cs.CAPTURE_CALL) + else: + call_nodes = None + if call_nodes is None: + call_nodes, _ = self._collect_all_call_nodes( + root_node, language, queries + ) + resolver = self._resolver + registry = resolver.function_registry + callable_labels = (cs.NodeLabel.FUNCTION, cs.NodeLabel.METHOD) + for call_node in call_nodes: + _positional, keyword = self._parse_call_arguments(call_node) + if not keyword: + continue + name = self._get_call_target_name(call_node) + if not name: + continue + callee = resolver.resolve_function_call(name, module_qn) + if not callee or callee[0] != cs.NodeLabel.CLASS: + continue + for field, value_node in keyword.items(): + if not (value_text := safe_decode_text(value_node)): + continue + bound = resolver.resolve_function_call(value_text, module_qn) + if bound and bound[0] in callable_labels and bound[1] in registry: + resolver.record_callable_field_binding( + callee[1], field, bound[1] + ) + except Exception as e: + logger.error(ls.CALL_PROCESSING_FAILED, path=file_path, error=e) + def process_calls_in_file( self, file_path: Path, root_node: Node, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + func_class_captures_cache: dict[Path, dict] | None = None, ) -> None: - relative_path = file_path.relative_to(self.repo_path) - logger.debug(ls.CALL_PROCESSING_FILE.format(path=relative_path)) + relative_path = cached_relative_path(file_path, self.repo_path) + logger.debug(ls.CALL_PROCESSING_FILE, path=relative_path) try: - module_qn = cs.SEPARATOR_DOT.join( - [self.project_name] + list(relative_path.with_suffix("").parts) - ) - if file_path.name in (cs.INIT_PY, cs.MOD_RS): - module_qn = 
cs.SEPARATOR_DOT.join( - [self.project_name] + list(relative_path.parent.parts) + module_qn = self._module_qn(relative_path, file_path.name) + + call_name_cache: dict[int, str | None] = {} + + if ( + func_class_captures_cache is not None + and file_path in func_class_captures_cache + ): + combined_captures = func_class_captures_cache[file_path] + else: + combined_query = COMBINED_FUNC_CLASS_QUERIES.get(language) + if combined_query: + cursor = QueryCursor(combined_query) + combined_captures = sorted_captures(cursor, root_node) + else: + combined_captures = {} + + cached_calls = combined_captures.get(cs.CAPTURE_CALL) + if cached_calls is not None: + all_call_nodes = cached_calls + call_starts: list[int] | None = None + else: + all_call_nodes, call_starts = self._collect_all_call_nodes( + root_node, language, queries ) - self._process_calls_in_functions(root_node, module_qn, language, queries) - self._process_calls_in_classes(root_node, module_qn, language, queries) - self._process_module_level_calls(root_node, module_qn, language, queries) + sorted_func_nodes = combined_captures.get(cs.CAPTURE_FUNCTION) + if sorted_func_nodes or combined_captures.get(cs.CAPTURE_CLASS): + if cached_calls is not None: + call_starts = [n.start_byte for n in all_call_nodes] + func_node_starts = ( + [n.start_byte for n in sorted_func_nodes] + if sorted_func_nodes + else None + ) + else: + call_starts = None + func_node_starts = None + + self._process_calls_in_functions( + root_node, + module_qn, + language, + queries, + all_call_nodes, + call_starts, + call_name_cache=call_name_cache, + combined_captures=combined_captures or None, + ) + # (H) Bare decorators (`@task`) are not call nodes; emit their + # (H) module-load CALLS before the empty-`all_call_nodes` early return, + # (H) since a file may have decorators but no other calls. Classes can + # (H) be decorated too, so include captured class nodes. 
+ if language == cs.SupportedLanguage.PYTHON: + decorator_targets = list(sorted_func_nodes or []) + if combined_captures and ( + class_nodes := combined_captures.get(cs.CAPTURE_CLASS) + ): + decorator_targets.extend(class_nodes) + if decorator_targets: + self._ingest_decorator_calls( + decorator_targets, + module_qn, + root_node, + queries[language][cs.QUERY_CONFIG], + ) + if not all_call_nodes: + return + self._process_calls_in_classes( + root_node, + module_qn, + language, + queries, + all_call_nodes, + call_starts, + call_name_cache=call_name_cache, + combined_captures=combined_captures, + sorted_func_nodes=sorted_func_nodes, + func_node_starts=func_node_starts, + ) + if sorted_func_nodes and call_starts is not None: + module_calls = self._filter_top_level_calls( + all_call_nodes, call_starts, sorted_func_nodes + ) + else: + module_calls = all_call_nodes + self._ingest_function_calls( + root_node, + module_qn, + cs.NodeLabel.MODULE, + module_qn, + language, + queries, + call_nodes=module_calls, + call_name_cache=call_name_cache, + ) except Exception as e: - logger.error(ls.CALL_PROCESSING_FAILED.format(path=file_path, error=e)) + logger.error(ls.CALL_PROCESSING_FAILED, path=file_path, error=e) def _process_calls_in_functions( self, @@ -78,28 +398,68 @@ def _process_calls_in_functions( module_qn: str, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + all_call_nodes: list[Node] | None = None, + call_starts: list[int] | None = None, + call_name_cache: dict[int, str | None] | None = None, + combined_captures: dict[str, list[Node]] | None = None, ) -> None: - result = get_function_captures(root_node, language, queries) - if not result: - return - - lang_config, captures = result - func_nodes = captures.get(cs.CAPTURE_FUNCTION, []) + if combined_captures is not None: + lang_config = queries[language][cs.QUERY_CONFIG] + func_nodes = combined_captures.get(cs.CAPTURE_FUNCTION, []) + has_classes = 
bool(combined_captures.get(cs.CAPTURE_CLASS)) + else: + result = get_function_captures(root_node, language, queries) + if not result: + return + lang_config, captures = result + func_nodes = captures.get(cs.CAPTURE_FUNCTION, []) + has_classes = bool(captures.get(cs.CAPTURE_CLASS)) for func_node in func_nodes: - if not isinstance(func_node, Node): - continue - if self._is_method(func_node, lang_config): + if has_classes and self._is_method(func_node, lang_config): continue - if language == cs.SupportedLanguage.CPP: + if language in _C_FAMILY_LANGUAGES: func_name = cpp_utils.extract_function_name(func_node) else: func_name = self._get_node_name(func_node) if not func_name: continue + # (H) An out-of-line C++ method definition (`Ret Class::method() {...}` + # (H) at namespace/file scope) is bound by the definition pass to its + # (H) class node (qn `class_qn.method`). Attribute its body's calls to + # (H) that method node, not a phantom module-rooted free-function qn, + # (H) so the CALLS edges join to a real node. 
+ if language == cs.SupportedLanguage.CPP and ( + bound := self._cpp_out_of_class_method_caller( + func_node, func_name, module_qn + ) + ): + caller_qn, class_qn = bound + filtered = ( + self._filter_calls_in_node(all_call_nodes, call_starts, func_node) + if all_call_nodes is not None and call_starts is not None + else None + ) + self._ingest_function_calls( + func_node, + caller_qn, + cs.NodeLabel.METHOD, + module_qn, + language, + queries, + class_qn, + call_nodes=filtered, + call_name_cache=call_name_cache, + ) + continue if func_qn := self._build_nested_qualified_name( func_node, module_qn, func_name, lang_config ): + filtered = ( + self._filter_calls_in_node(all_call_nodes, call_starts, func_node) + if all_call_nodes is not None and call_starts is not None + else None + ) self._ingest_function_calls( func_node, func_qn, @@ -107,8 +467,34 @@ def _process_calls_in_functions( module_qn, language, queries, + call_nodes=filtered, + call_name_cache=call_name_cache, ) + def _cpp_out_of_class_method_caller( + self, func_node: Node, method_name: str, module_qn: str + ) -> tuple[str, str] | None: + # (H) Resolve an out-of-line C++ method definition to its (method_qn, + # (H) class_qn), mirroring the definition pass's class binding. The leaf + # (H) class name resolves the class across files (header-declared classes); + # (H) `endswith(normalized)` guards against a leaf collision binding to the + # (H) wrong class, and the registry membership check ensures the method node + # (H) actually exists before overriding the default attribution. 
+ if not cpp_utils.is_out_of_class_method_definition(func_node): + return None + class_name = cpp_utils.extract_class_name_from_out_of_class_method(func_node) + if not class_name: + return None + normalized = class_name.replace(cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT) + leaf = normalized.rsplit(cs.SEPARATOR_DOT, 1)[-1] + class_qn = self._resolver._resolve_class_name(leaf, module_qn) + if not class_qn or not class_qn.endswith(normalized): + return None + caller_qn = f"{class_qn}{cs.SEPARATOR_DOT}{method_name}" + if caller_qn in self._resolver.function_registry: + return caller_qn, class_qn + return None + def _get_rust_impl_class_name(self, class_node: Node) -> str | None: class_name = self._get_node_name(class_node, cs.FIELD_TYPE) if class_name: @@ -136,20 +522,40 @@ def _process_methods_in_class( module_qn: str, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + all_call_nodes: list[Node] | None = None, + call_starts: list[int] | None = None, + call_name_cache: dict[int, str | None] | None = None, + sorted_func_nodes: list[Node] | None = None, + func_node_starts: list[int] | None = None, ) -> None: - method_query = queries[language][cs.QUERY_FUNCTIONS] - if not method_query: - return - method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) - method_nodes = method_captures.get(cs.CAPTURE_FUNCTION, []) + if sorted_func_nodes is not None and func_node_starts is not None: + body_start = body_node.start_byte + body_end = body_node.end_byte + lo = bisect_left(func_node_starts, body_start) + hi = bisect_right(func_node_starts, body_end) + method_nodes = [ + n for n in sorted_func_nodes[lo:hi] if n.end_byte <= body_end + ] + else: + method_query = queries[language][cs.QUERY_FUNCTIONS] + if not method_query: + return + method_cursor = QueryCursor(method_query) + method_captures = sorted_captures(method_cursor, body_node) + method_nodes = method_captures.get(cs.CAPTURE_FUNCTION, []) for 
method_node in method_nodes: - if not isinstance(method_node, Node): - continue - method_name = self._get_node_name(method_node) + if language in _C_FAMILY_LANGUAGES: + method_name = cpp_utils.extract_function_name(method_node) + else: + method_name = self._get_node_name(method_node) if not method_name: continue method_qn = f"{class_qn}{cs.SEPARATOR_DOT}{method_name}" + filtered = ( + self._filter_calls_in_node(all_call_nodes, call_starts, method_node) + if all_call_nodes is not None and call_starts is not None + else None + ) self._ingest_function_calls( method_node, method_qn, @@ -158,6 +564,8 @@ def _process_methods_in_class( language, queries, class_qn, + call_nodes=filtered, + call_name_cache=call_name_cache, ) def _process_calls_in_classes( @@ -166,38 +574,47 @@ def _process_calls_in_classes( module_qn: str, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + all_call_nodes: list[Node] | None = None, + call_starts: list[int] | None = None, + call_name_cache: dict[int, str | None] | None = None, + combined_captures: dict[str, list] | None = None, + sorted_func_nodes: list[Node] | None = None, + func_node_starts: list[int] | None = None, ) -> None: - query = queries[language][cs.QUERY_CLASSES] - if not query: - return - cursor = QueryCursor(query) - captures = cursor.captures(root_node) - class_nodes = captures.get(cs.CAPTURE_CLASS, []) + if combined_captures is not None: + class_nodes = combined_captures.get(cs.CAPTURE_CLASS, []) + else: + query = queries[language][cs.QUERY_CLASSES] + if not query: + return + cursor = QueryCursor(query) + captures = sorted_captures(cursor, root_node) + class_nodes = captures.get(cs.CAPTURE_CLASS, []) for class_node in class_nodes: - if not isinstance(class_node, Node): - continue class_name = self._get_class_name_for_node(class_node, language) if not class_name: continue class_qn = f"{module_qn}{cs.SEPARATOR_DOT}{class_name}" if body_node := class_node.child_by_field_name(cs.FIELD_BODY): 
self._process_methods_in_class( - body_node, class_qn, module_qn, language, queries + body_node, + class_qn, + module_qn, + language, + queries, + all_call_nodes, + call_starts, + call_name_cache=call_name_cache, + sorted_func_nodes=sorted_func_nodes, + func_node_starts=func_node_starts, ) - def _process_module_level_calls( - self, - root_node: Node, - module_qn: str, - language: cs.SupportedLanguage, - queries: dict[cs.SupportedLanguage, LanguageQueries], - ) -> None: - self._ingest_function_calls( - root_node, module_qn, cs.NodeLabel.MODULE, module_qn, language, queries - ) - def _get_call_target_name(self, call_node: Node) -> str | None: + # (H) A macro-internal call (Rust `name(args)` inside a token_tree) is + # (H) captured as the bare identifier node; its text is the callee name. + if call_node.type == cs.TS_IDENTIFIER and call_node.text is not None: + return call_node.text.decode(cs.ENCODING_UTF8) if func_child := call_node.child_by_field_name(cs.TS_FIELD_FUNCTION): match func_child.type: case ( @@ -208,11 +625,16 @@ def _get_call_target_name(self, call_node: Node) -> str | None: | cs.TS_SCOPED_IDENTIFIER ): if func_child.text is not None: - return str(func_child.text.decode(cs.ENCODING_UTF8)) + return func_child.text.decode(cs.ENCODING_UTF8) + case cs.TS_GENERIC_FUNCTION: + # (H) turbofish: unwrap to the underlying callee identifier + inner = func_child.child_by_field_name(cs.TS_FIELD_FUNCTION) + if inner and inner.text: + return inner.text.decode(cs.ENCODING_UTF8) case cs.TS_CPP_FIELD_EXPRESSION: field_node = func_child.child_by_field_name(cs.FIELD_FIELD) if field_node and field_node.text: - return str(field_node.text.decode(cs.ENCODING_UTF8)) + return field_node.text.decode(cs.ENCODING_UTF8) case cs.TS_PARENTHESIZED_EXPRESSION: return self._get_iife_target_name(func_child) @@ -230,15 +652,15 @@ def _get_call_target_name(self, call_node: Node) -> str | None: object_node = call_node.child_by_field_name(cs.FIELD_OBJECT) name_node = 
call_node.child_by_field_name(cs.FIELD_NAME) if name_node and name_node.text: - method_name = str(name_node.text.decode(cs.ENCODING_UTF8)) + method_name = name_node.text.decode(cs.ENCODING_UTF8) if not object_node or not object_node.text: return method_name - object_text = str(object_node.text.decode(cs.ENCODING_UTF8)) + object_text = object_node.text.decode(cs.ENCODING_UTF8) return f"{object_text}{cs.SEPARATOR_DOT}{method_name}" if name_node := call_node.child_by_field_name(cs.FIELD_NAME): if name_node.text is not None: - return str(name_node.text.decode(cs.ENCODING_UTF8)) + return name_node.text.decode(cs.ENCODING_UTF8) return None @@ -260,70 +682,774 @@ def _ingest_function_calls( language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], class_context: str | None = None, + call_nodes: list[Node] | None = None, + call_name_cache: dict[int, str | None] | None = None, ) -> None: - calls_query = queries[language].get(cs.QUERY_CALLS) - if not calls_query: - return + if language in _TYPED_LANGUAGES: + local_var_types = ( + self._resolver.type_inference.build_local_variable_type_map( + caller_node, module_qn, language + ) + ) + else: + local_var_types = None - local_var_types = self._resolver.type_inference.build_local_variable_type_map( - caller_node, module_qn, language - ) + caller_spec = (caller_type, cs.KEY_QUALIFIED_NAME, caller_qn) - cursor = QueryCursor(calls_query) - captures = cursor.captures(caller_node) - call_nodes = captures.get(cs.CAPTURE_CALL, []) + caller_params: frozenset[str] = frozenset() + if language == cs.SupportedLanguage.PYTHON: + ordered_params = python_parameter_names(caller_node) + self._flow_param_names[caller_qn] = ordered_params + caller_params = frozenset(ordered_params) - logger.debug( - ls.CALL_FOUND_NODES.format( - count=len(call_nodes), language=language, caller=caller_qn + # (H) Runs independently of call_nodes: a getter access is an attribute, not + # (H) a call, so callers that read a property but make 
no other call must + # (H) still reach this pass before the early return below. + if language == cs.SupportedLanguage.PYTHON and ( + prop_names := self._resolver.function_registry.property_names() + ): + self._ingest_property_accesses( + caller_node, + caller_spec, + caller_qn, + module_qn, + local_var_types, + class_context, + queries[language][cs.QUERY_CONFIG], + prop_names, ) - ) - for call_node in call_nodes: - if not isinstance(call_node, Node): - continue + # (H) Operator syntax (k in r, r[k], r[k]=v, len(r)) dispatches to dunder + # (H) methods; emit those edges when the operand is a first-party type. + if language == cs.SupportedLanguage.PYTHON: + self._ingest_operator_dispatch_calls( + caller_node, caller_spec, module_qn, local_var_types + ) + + if call_nodes is None: + calls_query = queries[language].get(cs.QUERY_CALLS) + if not calls_query: + return + cursor = QueryCursor(calls_query) + captures = sorted_captures(cursor, caller_node) + call_nodes = captures.get(cs.CAPTURE_CALL, []) + + if not call_nodes: + return - # (H) tree-sitter finds ALL call nodes including nested; no recursive processing needed + is_java = language == cs.SupportedLanguage.JAVA + is_js_ts = language in (cs.SupportedLanguage.JS, cs.SupportedLanguage.TS) + is_cpp = language == cs.SupportedLanguage.CPP + method_invocation_type = cs.TS_METHOD_INVOCATION + resolver = self._resolver + resolve_func = resolver.resolve_function_call + resolve_builtin = resolver.resolve_builtin_call if is_js_ts else None + resolve_cpp_op = resolver.resolve_cpp_operator_call if is_cpp else None + get_target = self._get_call_target_name + class_label = cs.NodeLabel.CLASS + ensure_rel = self.ingestor.ensure_relationship_batch + calls_rel = cs.RelationshipType.CALLS + qn_key = cs.KEY_QUALIFIED_NAME + _id = id + is_python = language == cs.SupportedLanguage.PYTHON + alias_map: dict[str, str] | None = None - call_name = self._get_call_target_name(call_node) + for call_node in call_nodes: + node_id = _id(call_node) 
+ if call_name_cache is not None and node_id in call_name_cache: + call_name = call_name_cache[node_id] + else: + call_name = get_target(call_node) + if call_name_cache is not None: + call_name_cache[node_id] = call_name if not call_name: continue - if ( - language == cs.SupportedLanguage.JAVA - and call_node.type == cs.TS_METHOD_INVOCATION - ): - callee_info = self._resolver.resolve_java_method_call( + if is_java and call_node.type == method_invocation_type: + callee_info = resolver.resolve_java_method_call( call_node, module_qn, local_var_types ) else: - callee_info = self._resolver.resolve_function_call( + callee_info = resolve_func( call_name, module_qn, local_var_types, class_context ) - if callee_info: - callee_type, callee_qn = callee_info - elif builtin_info := self._resolver.resolve_builtin_call(call_name): - callee_type, callee_qn = builtin_info - elif operator_info := self._resolver.resolve_cpp_operator_call( - call_name, module_qn + if not callee_info and resolve_builtin is not None: + callee_info = resolve_builtin(call_name) + if not callee_info and resolve_cpp_op is not None: + callee_info = resolve_cpp_op(call_name, module_qn) + if not callee_info and is_python and cs.SEPARATOR_DOT not in call_name: + # (H) A bare name that resolves to nothing may be a local alias of a + # (H) callable (do = self._start; do()). Resolve the assignment's + # (H) right-hand side and treat the alias call as a call to it. + if alias_map is None: + alias_map = self._build_local_alias_map( + caller_node, queries[language][cs.QUERY_CONFIG], module_qn + ) + if (rhs := alias_map.get(call_name)) is not None: + callee_info = resolve_func( + rhs, module_qn, local_var_types, class_context + ) + + if not callee_info and is_python and cs.SEPARATOR_DOT in call_name: + # (H) recv.field(...) where field is a callable struct field: + # (H) resolve to the functions bound to it at construction sites. 
+ self._ingest_callable_field_calls( + call_name, caller_spec, local_var_types, ensure_rel + ) + + if is_python and call_name.rsplit(cs.SEPARATOR_DOT, 1)[-1] in ( + cs.HIGHER_ORDER_BUILTINS + ): + # (H) sorted(xs, key=f) and friends invoke f synchronously in this + # (H) frame, so the trace attributes the call to the enclosing fn. + self._ingest_higher_order_builtin_calls( + call_node, + caller_spec, + module_qn, + local_var_types, + class_context, + resolve_func, + ensure_rel, + ) + + if not callee_info: + continue + + callee_type, callee_qn = callee_info + + if is_python: + self._collect_callable_flow( + call_node, + callee_qn, + caller_qn, + caller_params, + module_qn, + local_var_types, + class_context, + ) + + if is_python and ( + dispatch_targets := resolver.protocol_dispatch_targets(callee_qn) ): - callee_type, callee_qn = operator_info + # (H) The call resolved to a Protocol stub; the stub never runs, so emit + # (H) edges to the method on every conformer instead of the stub. + for conformer_type, conformer_qn in dispatch_targets: + for target_qn in resolver.function_registry.variants(conformer_qn): + ensure_rel( + caller_spec, + calls_rel, + (conformer_type, qn_key, target_qn), + ) + continue + + if is_python: + # (H) f(...) invoked through a parameter: the edge runs from the + # (H) callee to whatever each call site binds to that parameter. + self._ingest_callable_param_calls( + call_node, + callee_type, + callee_qn, + module_qn, + local_var_types, + class_context, + resolve_func, + ensure_rel, + ) + + if callee_type == class_label: + # (H) Record construction as INSTANTIATES -> the class node (keeps + # (H) CALLS function/method-only). When the class defines __init__, + # (H) ALSO redirect a CALLS edge to it (the constructor runs); when + # (H) it does not (dataclass/NamedTuple/pydantic), INSTANTIATES is + # (H) the only edge. 
+ for class_variant in resolver.function_registry.variants(callee_qn): + ensure_rel( + caller_spec, + cs.RelationshipType.INSTANTIATES, + (class_label, qn_key, class_variant), + ) + init_qn = f"{callee_qn}{cs.SEPARATOR_DOT}{cs.PY_METHOD_INIT}" + if init_qn not in resolver.function_registry: + continue + callee_type = cs.NodeLabel.METHOD + callee_qn = init_qn + + for target_qn in resolver.function_registry.variants(callee_qn): + ensure_rel( + caller_spec, + calls_rel, + (callee_type, qn_key, target_qn), + ) + + def _ingest_operator_dispatch_calls( + self, + caller_node: Node, + caller_spec: tuple[str, str, str], + module_qn: str, + local_var_types: dict[str, str] | None, + ) -> None: + boundary = (cs.TS_PY_FUNCTION_DEFINITION, cs.TS_PY_CLASS_DEFINITION) + stack: list[Node] = list(caller_node.children) + while stack: + node = stack.pop() + if node.type in boundary: + continue + match node.type: + case cs.TS_PY_SUBSCRIPT: + parent = node.parent + left = ( + parent.child_by_field_name(cs.TS_FIELD_LEFT) + if parent is not None and parent.type == cs.TS_PY_ASSIGNMENT + else None + ) + is_write = left is not None and left.id == node.id + self._emit_operator_dunder( + node.child_by_field_name(cs.FIELD_VALUE), + cs.PY_DUNDER_SETITEM if is_write else cs.PY_DUNDER_GETITEM, + caller_spec, + module_qn, + local_var_types, + ) + case cs.TS_PY_COMPARISON_OPERATOR: + operators = node.child_by_field_name(cs.TS_FIELD_OPERATORS) + if ( + operators is not None + and (op_text := safe_decode_text(operators)) + and cs.PY_OP_IN in op_text.split() + and node.named_children + ): + self._emit_operator_dunder( + node.named_children[-1], + cs.PY_DUNDER_CONTAINS, + caller_spec, + module_qn, + local_var_types, + ) + case cs.TS_PY_CALL: + func = node.child_by_field_name(cs.TS_FIELD_FUNCTION) + args = node.child_by_field_name(cs.FIELD_ARGUMENTS) + if ( + func is not None + and safe_decode_text(func) == cs.PY_BUILTIN_LEN + and args is not None + and len(args.named_children) == 1 + ): + 
self._emit_operator_dunder( + args.named_children[0], + cs.PY_DUNDER_LEN, + caller_spec, + module_qn, + local_var_types, + ) + case cs.TS_PY_BOOLEAN_OPERATOR: + self._emit_truthiness( + node.child_by_field_name(cs.TS_FIELD_LEFT), + caller_spec, + module_qn, + local_var_types, + ) + self._emit_truthiness( + node.child_by_field_name(cs.TS_FIELD_RIGHT), + caller_spec, + module_qn, + local_var_types, + ) + case cs.TS_PY_NOT_OPERATOR: + self._emit_truthiness( + node.child_by_field_name(cs.TS_FIELD_ARGUMENT), + caller_spec, + module_qn, + local_var_types, + ) + case ( + cs.TS_PY_IF_STATEMENT + | cs.TS_PY_WHILE_STATEMENT + | cs.TS_PY_ELIF_CLAUSE + | cs.TS_PY_CONDITIONAL_EXPRESSION + ): + # (H) A bare object as a condition is tested for truthiness; nested + # (H) boolean/not operators are handled when the walk reaches them. + self._emit_truthiness( + node.child_by_field_name(cs.TS_FIELD_CONDITION), + caller_spec, + module_qn, + local_var_types, + ) + stack.extend(node.children) + + def _emit_truthiness( + self, + operand: Node | None, + caller_spec: tuple[str, str, str], + module_qn: str, + local_var_types: dict[str, str] | None, + ) -> None: + # (H) Truthiness of an object calls __bool__ if defined, else __len__. Only a + # (H) bare name/attribute operand names an object (a comparison/call is already + # (H) a bool and is handled elsewhere); try __bool__ first, then __len__. 
+ if operand is None or operand.type not in ( + cs.TS_PY_IDENTIFIER, + cs.TS_PY_ATTRIBUTE, + ): + return + for dunder in (cs.PY_DUNDER_BOOL, cs.PY_DUNDER_LEN): + if self._emit_operator_dunder( + operand, dunder, caller_spec, module_qn, local_var_types + ): + return + + def _emit_operator_dunder( + self, + operand: Node | None, + dunder: str, + caller_spec: tuple[str, str, str], + module_qn: str, + local_var_types: dict[str, str] | None, + ) -> bool: + # (H) Resolve the implied .__dunder__ call; resolution only succeeds + # (H) for a first-party class that defines the dunder, so builtin containers + # (H) (dict/list) yield no edge. Restrict to simple attribute/name operands. + # (H) Returns whether an edge was emitted (truthiness tries __bool__ then __len__). + if operand is None or not (operand_text := safe_decode_text(operand)): + return False + if any(ch in operand_text for ch in cs.PY_OPERAND_REJECT_CHARS): + return False + targets = self._resolver.operator_dunder_targets( + operand_text, dunder, module_qn, local_var_types + ) + if not targets: + return False + for callee_type, callee_qn in targets: + for target_qn in self._resolver.function_registry.variants(callee_qn): + self.ingestor.ensure_relationship_batch( + caller_spec, + cs.RelationshipType.CALLS, + (callee_type, cs.KEY_QUALIFIED_NAME, target_qn), + ) + return True + + def _parse_call_arguments( + self, call_node: Node + ) -> tuple[list[Node], dict[str, Node]]: + positional: list[Node] = [] + keyword: dict[str, Node] = {} + args_node = call_node.child_by_field_name(cs.FIELD_ARGUMENTS) + if args_node is None: + return positional, keyword + for child in args_node.named_children: + if child.type == cs.TS_PY_KEYWORD_ARGUMENT: + name_node = child.child_by_field_name(cs.FIELD_NAME) + value_node = child.child_by_field_name(cs.FIELD_VALUE) + if ( + name_node is not None + and value_node is not None + and (name := safe_decode_text(name_node)) is not None + ): + keyword[name] = value_node + else: + 
positional.append(child) + return positional, keyword + + def _emit_callback_edge( + self, + source_spec: tuple[str, str, str], + arg_node: Node, + module_qn: str, + local_var_types: dict[str, str] | None, + class_context: str | None, + resolve_func, + ensure_rel, + ) -> None: + if not (arg_text := safe_decode_text(arg_node)): + return + if not ( + resolved := resolve_func( + arg_text, module_qn, local_var_types, class_context + ) + ): + return + res_type, res_qn = resolved + registry = self._resolver.function_registry + if res_type == cs.NodeLabel.CLASS: + init_qn = f"{res_qn}{cs.SEPARATOR_DOT}{cs.PY_METHOD_INIT}" + if init_qn not in registry: + return + res_type = cs.NodeLabel.METHOD + res_qn = init_qn + for target_qn in registry.variants(res_qn): + ensure_rel( + source_spec, + cs.RelationshipType.CALLS, + (res_type, cs.KEY_QUALIFIED_NAME, target_qn), + ) + + def _ingest_callable_param_calls( + self, + call_node: Node, + callee_type: str, + callee_qn: str, + module_qn: str, + local_var_types: dict[str, str] | None, + class_context: str | None, + resolve_func, + ensure_rel, + ) -> None: + if not (params := self._resolver.function_registry.callable_params(callee_qn)): + return + positional, keyword = self._parse_call_arguments(call_node) + source_spec = (callee_type, cs.KEY_QUALIFIED_NAME, callee_qn) + for param_name, index in params.items(): + arg_node = keyword.get(param_name) + if arg_node is None and index < len(positional): + arg_node = positional[index] + if arg_node is not None: + self._emit_callback_edge( + source_spec, + arg_node, + module_qn, + local_var_types, + class_context, + resolve_func, + ensure_rel, + ) + + def _collect_callable_flow( + self, + call_node: Node, + callee_qn: str, + caller_qn: str, + caller_params: frozenset[str], + module_qn: str, + local_var_types: dict[str, str] | None, + class_context: str | None, + ) -> None: + # (H) Record, for each call-site argument that names a callable, whether it is a + # (H) concrete function or a 
parameter of the caller (a pass-through). The + # (H) fixpoint in finalize propagates concretes through pass-through params to + # (H) the functions that actually invoke them. + positional, keyword = self._parse_call_arguments(call_node) + items: list[tuple[int, str, Node]] = [ + (index, "", node) for index, node in enumerate(positional) + ] + items.extend((-1, name, node) for name, node in keyword.items()) + callable_labels = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, + ) + for position, keyword_name, arg_node in items: + if arg_node.type not in (cs.TS_PY_IDENTIFIER, cs.TS_PY_ATTRIBUTE): + continue + arg_text = safe_decode_text(arg_node) + if not arg_text: + continue + if arg_node.type == cs.TS_PY_IDENTIFIER and arg_text in caller_params: + self._flow_args.append( + _CallableFlowArg( + callee_qn, position, keyword_name, "", caller_qn, arg_text + ) + ) + continue + resolved = self._resolver.resolve_function_call( + arg_text, module_qn, local_var_types, class_context + ) + if resolved is not None and resolved[0] in callable_labels: + self._flow_args.append( + _CallableFlowArg( + callee_qn, position, keyword_name, resolved[1], "", "" + ) + ) + + def finalize_callable_param_flow(self) -> None: + # (H) Resolve the recorded call-site argument bindings to a fixpoint and emit a + # (H) CALLS edge from every function that invokes a callable parameter to each + # (H) concrete function that can reach it (directly or via pass-through params). 
+ registry = self._resolver.function_registry + seeds: dict[tuple[str, str], set[str]] = defaultdict(set) + edges: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) + for arg in self._flow_args: + if arg.keyword: + param_name = arg.keyword + else: + callee_params = self._flow_param_names.get(arg.callee_qn) + if callee_params is None or not ( + 0 <= arg.position < len(callee_params) + ): + continue + param_name = callee_params[arg.position] + slot = (arg.callee_qn, param_name) + if arg.source_concrete: + seeds[slot].add(arg.source_concrete) else: + edges[slot].add((arg.source_caller, arg.source_param)) + + bindings: dict[tuple[str, str], set[str]] = { + k: set(v) for k, v in seeds.items() + } + for slot in edges: + bindings.setdefault(slot, set()) + changed = True + while changed: + changed = False + for slot, sources in edges.items(): + for source in sources: + if (reachable := bindings.get(source)) and not reachable.issubset( + bindings[slot] + ): + bindings[slot] |= reachable + changed = True + + ensure_rel = self.ingestor.ensure_relationship_batch + for func_qn, invoked in ( + (qn, registry.callable_params(qn)) for qn in self._flow_param_names + ): + if not invoked or (func_type := registry.get(func_qn)) is None: continue - logger.debug( - ls.CALL_FOUND.format( - caller=caller_qn, - call_name=call_name, - callee_type=callee_type, - callee_qn=callee_qn, + source_spec = (func_type, cs.KEY_QUALIFIED_NAME, func_qn) + for param_name in invoked: + for target_qn in bindings.get((func_qn, param_name), ()): + target_type = registry.get(target_qn) + if target_type is None: + continue + for variant in registry.variants(target_qn): + ensure_rel( + source_spec, + cs.RelationshipType.CALLS, + (target_type, cs.KEY_QUALIFIED_NAME, variant), + ) + + def _ingest_callable_field_calls( + self, + call_name: str, + caller_spec: tuple[str, str, str], + local_var_types: dict[str, str] | None, + ensure_rel, + ) -> None: + recv, sep, field = 
call_name.rpartition(cs.SEPARATOR_DOT) + if not sep: + return + recv_type = local_var_types.get(recv) if local_var_types else None + targets = self._resolver.callable_field_targets(field, recv_type) + if not targets: + return + registry = self._resolver.function_registry + for target_qn in targets: + if target_qn in registry: + ensure_rel( + caller_spec, + cs.RelationshipType.CALLS, + (registry[target_qn], cs.KEY_QUALIFIED_NAME, target_qn), ) + + def _ingest_higher_order_builtin_calls( + self, + call_node: Node, + caller_spec: tuple[str, str, str], + module_qn: str, + local_var_types: dict[str, str] | None, + class_context: str | None, + resolve_func, + ensure_rel, + ) -> None: + positional, keyword = self._parse_call_arguments(call_node) + for arg_node in (*positional, *keyword.values()): + self._emit_callback_edge( + caller_spec, + arg_node, + module_qn, + local_var_types, + class_context, + resolve_func, + ensure_rel, ) - self.ingestor.ensure_relationship_batch( - (caller_type, cs.KEY_QUALIFIED_NAME, caller_qn), - cs.RelationshipType.CALLS, - (callee_type, cs.KEY_QUALIFIED_NAME, callee_qn), + def _build_local_alias_map( + self, caller_node: Node, lang_config: LanguageSpec, module_qn: str + ) -> dict[str, str]: + identifier = cs.TS_PY_IDENTIFIER + attribute = cs.TS_PY_ATTRIBUTE + assignment = cs.TS_PY_ASSIGNMENT + left_field = cs.TS_FIELD_LEFT + right_field = cs.TS_FIELD_RIGHT + function_types = lang_config.function_node_types + class_types = lang_config.class_node_types + aliases: dict[str, str] = {} + stack = list(caller_node.children) + while stack: + node = stack.pop() + node_type = node.type + if node_type in function_types or node_type in class_types: + continue + if node_type == assignment: + left = node.child_by_field_name(left_field) + right = node.child_by_field_name(right_field) + if ( + left is not None + and left.type == identifier + and (left_text := left.text) is not None + and right is not None + and ( + target := self._alias_reference_text( + 
right, identifier, attribute, module_qn + ) + ) + is not None + ): + aliases.setdefault(left_text.decode(cs.ENCODING_UTF8), target) + stack.extend(node.children) + return aliases + + def _alias_reference_text( + self, right: Node, identifier: str, attribute: str, module_qn: str + ) -> str | None: + # (H) An alias rhs is a plain name/attribute, a conditional that picks one + # (H) (resolve_builtin_call if is_js_ts else None), or getattr(recv, name) + # (H) dynamic dispatch. Take the name/attribute branch (consequence or + # (H) alternative, never the condition) or build recv. for getattr. + if right.type in (identifier, attribute): + return right.text.decode(cs.ENCODING_UTF8) if right.text else None + if right.type == cs.TS_PY_CONDITIONAL_EXPRESSION and right.named_children: + for branch in (right.named_children[0], right.named_children[-1]): + if branch.type in (identifier, attribute) and branch.text: + return branch.text.decode(cs.ENCODING_UTF8) + if right.type == cs.TS_PY_CALL: + return self._getattr_reference_text(right, identifier, attribute, module_qn) + return None + + def _getattr_reference_text( + self, call: Node, identifier: str, attribute: str, module_qn: str + ) -> str | None: + func = call.child_by_field_name(cs.TS_FIELD_FUNCTION) + args = call.child_by_field_name(cs.FIELD_ARGUMENTS) + if ( + func is None + or safe_decode_text(func) != cs.PY_BUILTIN_GETATTR + or args is None + or len(args.named_children) < 2 + ): + return None + receiver, name_node = args.named_children[0], args.named_children[1] + if receiver.type not in (identifier, attribute): + return None + if (name := self._resolve_str_const(name_node, module_qn)) is None: + return None + return f"{safe_decode_text(receiver)}{cs.SEPARATOR_DOT}{name}" + + def _resolve_str_const(self, node: Node, module_qn: str) -> str | None: + # (H) Resolve a getattr name argument to its string value: a string literal + # (H) directly, or a module-level constant (cs.METHOD_X / METHOD_X) read from + # (H) the 
defining module's AST. + if node.type == cs.TS_PY_STRING: + content = next( + (c for c in node.children if c.type == cs.TS_PY_STRING_CONTENT), None + ) + return safe_decode_text(content) if content is not None else None + if node.type not in (cs.TS_PY_IDENTIFIER, cs.TS_PY_ATTRIBUTE): + return None + name_text = safe_decode_text(node) + if not name_text: + return None + import_map = self._resolver.import_processor.import_mapping.get(module_qn, {}) + prefix, _, const_name = name_text.rpartition(cs.SEPARATOR_DOT) + if not prefix: + mapped = import_map.get(const_name) + const_module_qn = ( + mapped.rsplit(cs.SEPARATOR_DOT, 1)[0] if mapped else module_qn ) + elif (mapped_module := import_map.get(prefix)) is not None: + const_module_qn = mapped_module + else: + const_module_qn = prefix + return self._module_string_constant(const_module_qn, const_name) + + def _module_string_constant(self, module_qn: str, const_name: str) -> str | None: + type_inference = self._resolver.type_inference + file_path = type_inference.module_qn_to_file_path.get(module_qn) + if file_path is None or file_path not in type_inference.ast_cache: + return None + root_node, _ = type_inference.ast_cache[file_path] + for child in root_node.children: + if child.type != cs.TS_PY_EXPRESSION_STATEMENT or not child.children: + continue + assignment = child.children[0] + if assignment.type != cs.TS_PY_ASSIGNMENT: + continue + left = assignment.child_by_field_name(cs.TS_FIELD_LEFT) + right = assignment.child_by_field_name(cs.TS_FIELD_RIGHT) + if ( + left is not None + and left.type == cs.TS_PY_IDENTIFIER + and safe_decode_text(left) == const_name + and right is not None + and right.type == cs.TS_PY_STRING + ): + return self._resolve_str_const(right, module_qn) + return None + + def _ingest_property_accesses( + self, + caller_node: Node, + caller_spec: tuple[str, str, str], + caller_qn: str, + module_qn: str, + local_var_types: dict[str, str] | None, + class_context: str | None, + lang_config: LanguageSpec, + 
prop_names: set[str], + ) -> None: + # (H) Accessing an @property getter invokes the getter method at runtime, but + # (H) tree-sitter sees a plain attribute, not a call. Resolve attribute + # (H) accesses whose tail names a known property and emit a CALLS edge to the + # (H) getter (skipping the attribute that is itself a call's function, which + # (H) the call path above already resolves). + resolver = self._resolver + resolve_func = resolver.resolve_function_call + registry = resolver.function_registry + ensure_rel = self.ingestor.ensure_relationship_batch + calls_rel = cs.RelationshipType.CALLS + qn_key = cs.KEY_QUALIFIED_NAME + method_label = cs.NodeLabel.METHOD + attr_type = cs.TS_PY_ATTRIBUTE + call_type = cs.TS_PY_CALL + func_field = cs.TS_FIELD_FUNCTION + function_types = lang_config.function_node_types + class_types = lang_config.class_node_types + seen: set[str] = set() + + stack = list(caller_node.children) + while stack: + node = stack.pop() + node_type = node.type + if node_type in function_types or node_type in class_types: + continue + if node_type == attr_type and (text := node.text) is not None: + attr_text = text.decode(cs.ENCODING_UTF8) + if attr_text.rsplit(cs.SEPARATOR_DOT, 1)[-1] in prop_names: + parent = node.parent + is_call_target = ( + parent is not None + and parent.type == call_type + and parent.child_by_field_name(func_field) is node + ) + if not is_call_target and ( + callee_info := resolve_func( + attr_text, module_qn, local_var_types, class_context + ) + ): + callee_qn = callee_info[1] + if ( + registry.is_property(callee_qn) + and callee_qn != caller_qn + and callee_qn not in seen + ): + seen.add(callee_qn) + for target_qn in registry.variants(callee_qn): + ensure_rel( + caller_spec, + calls_rel, + (method_label, qn_key, target_qn), + ) + stack.extend(node.children) def _build_nested_qualified_name( self, @@ -337,9 +1463,7 @@ def _build_nested_qualified_name( if not isinstance(current, Node): logger.warning( - 
ls.CALL_UNEXPECTED_PARENT.format( - node=func_node, parent_type=type(current) - ) + ls.CALL_UNEXPECTED_PARENT, node=func_node, parent_type=type(current) ) return None diff --git a/codebase_rag/parsers/call_resolver.py b/codebase_rag/parsers/call_resolver.py index 322a583a3..3815ff344 100644 --- a/codebase_rag/parsers/call_resolver.py +++ b/codebase_rag/parsers/call_resolver.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from collections import defaultdict, deque from loguru import logger from tree_sitter import Node @@ -12,8 +13,28 @@ from .py import resolve_class_name from .type_inference import TypeInferenceEngine +_SEPARATOR_PATTERN = re.compile(r"[.:]|::") +_SEARCH_NAME_CACHE: dict[str, str] = {} +_CHAINED_METHOD_PATTERN = re.compile(r"\.([^.()]+)$") +_QN_SPLIT_CACHE: dict[str, tuple[list[str], int]] = {} + class CallResolver: + __slots__ = ( + "function_registry", + "import_processor", + "type_inference", + "class_inheritance", + "_simple_resolution_cache", + "_wildcard_cache", + "_protocol_impl_cache", + "_field_bindings", + "_field_to_classes", + "_subclass_map_cache", + "_protocol_classes_cache", + "_struct_impl_cache", + ) + def __init__( self, function_registry: FunctionRegistryTrieProtocol, @@ -25,16 +46,95 @@ def __init__( self.import_processor = import_processor self.type_inference = type_inference self.class_inheritance = class_inheritance + self._simple_resolution_cache: dict[ + tuple[str, str], tuple[str, str] | None + ] = {} + self._wildcard_cache: dict[int, list[tuple[str, str]]] = {} + self._protocol_impl_cache: dict[str, str] | None = None + self._field_bindings: dict[tuple[str, str], set[str]] = {} + self._field_to_classes: dict[str, set[str]] = {} + self._subclass_map_cache: dict[str, set[str]] | None = None + self._protocol_classes_cache: set[str] | None = None + self._struct_impl_cache: dict[str, set[str]] = {} + + def record_callable_field_binding( + self, class_qn: str, field: str, func_qn: str + ) -> None: + # (H) A 
NamedTuple/dataclass field holding a function reference: every + # (H) function bound to it at any construction site is a possible callee + # (H) when the field is invoked. Recording all of them is a sound call + # (H) graph (each runs for its own configuration), so recall is complete. + self._field_bindings.setdefault((class_qn, field), set()).add(func_qn) + self._field_to_classes.setdefault(field, set()).add(class_qn) + + def callable_field_targets( + self, field: str, recv_type: str | None = None + ) -> set[str]: + classes = self._field_to_classes.get(field) + if not classes: + return set() + if recv_type: + simple = recv_type.rsplit(cs.SEPARATOR_DOT, 1)[-1] + matched = [ + qn + for qn in classes + if qn == recv_type or qn.rsplit(cs.SEPARATOR_DOT, 1)[-1] == simple + ] + if len(matched) == 1: + return self._field_bindings.get((matched[0], field), set()) + # (H) Receiver type unknown or ambiguous: only resolve when exactly one + # (H) class declares this callable field, so the targets are unambiguous. + if len(classes) == 1: + return self._field_bindings.get((next(iter(classes)), field), set()) + return set() def _resolve_class_qn_from_type( self, var_type: str, import_map: dict[str, str], module_qn: str ) -> str: + var_type = self._strip_optional(var_type) if cs.SEPARATOR_DOT in var_type: - return var_type + return self._follow_reexports(var_type) if var_type in import_map: - return import_map[var_type] + return self._follow_reexports(import_map[var_type]) return self._resolve_class_name(var_type, module_qn) or "" + def _strip_optional(self, var_type: str) -> str: + # (H) An Optional annotation (X | None) names a single concrete class; reduce it + # (H) so attribute/operator resolution can find that class. Genuine multi-type + # (H) unions stay unresolved (ambiguous). 
+ if cs.PY_UNION_SEPARATOR not in var_type: + return var_type + non_none = [ + member + for part in var_type.split(cs.PY_UNION_SEPARATOR) + if (member := part.strip()) and member != cs.PY_NONE + ] + return non_none[0] if len(non_none) == 1 else var_type + + def _follow_reexports(self, class_qn: str) -> str: + # (H) `from .pkg import Cls` records the importer's name against the re-export + # (H) module (pkg.Cls), not the class's real definition (pkg.mod.Cls), so a + # (H) class_qn that is not itself registered may be a re-export. Follow the + # (H) module's own import map one hop at a time until a registered class is + # (H) reached, guarding against cycles. + seen: set[str] = set() + current = class_qn + while ( + current + and current not in seen + and current not in self.function_registry + and cs.SEPARATOR_DOT in current + ): + seen.add(current) + module_qn, _, name = current.rpartition(cs.SEPARATOR_DOT) + following = self.import_processor.import_mapping.get(module_qn, {}).get( + name + ) + if not following or following == current: + break + current = following + return current + def _try_resolve_method( self, class_qn: str, method_name: str, separator: str = cs.SEPARATOR_DOT ) -> tuple[str, str] | None: @@ -50,6 +150,95 @@ def resolve_function_call( local_var_types: dict[str, str] | None = None, class_context: str | None = None, ) -> tuple[str, str] | None: + return self._redirect_protocol_method( + self._resolve_function_call( + call_name, module_qn, local_var_types, class_context + ) + ) + + def _protocol_impl_map(self) -> dict[str, str]: + # (H) A Protocol stub never runs; the concrete implementer does. Map each + # (H) XxxProtocol to a unique non-Protocol class named Xxx (the suffix + # (H) convention disambiguates the real impl from test mocks or other + # (H) structural conformers, which structural matching alone cannot). 
+ if self._protocol_impl_cache is not None: + return self._protocol_impl_cache + sep = cs.SEPARATOR_DOT + protocols: set[str] = set() + classes_by_simple: dict[str, list[str]] = defaultdict(list) + for qn, bases in self.class_inheritance.items(): + classes_by_simple[qn.rsplit(sep, 1)[-1]].append(qn) + if any(base.rsplit(sep, 1)[-1] == cs.PY_PROTOCOL for base in bases): + protocols.add(qn) + impl: dict[str, str] = {} + for protocol_qn in protocols: + simple = protocol_qn.rsplit(sep, 1)[-1] + if simple == cs.PY_PROTOCOL or not simple.endswith(cs.PY_PROTOCOL): + continue + base_name = simple[: -len(cs.PY_PROTOCOL)] + candidates = [ + qn for qn in classes_by_simple.get(base_name, []) if qn not in protocols + ] + if len(candidates) == 1: + impl[protocol_qn] = candidates[0] + self._protocol_impl_cache = impl + return impl + + def _protocol_classes(self) -> set[str]: + if self._protocol_classes_cache is None: + sep = cs.SEPARATOR_DOT + self._protocol_classes_cache = { + qn + for qn, bases in self.class_inheritance.items() + if any(base.rsplit(sep, 1)[-1] == cs.PY_PROTOCOL for base in bases) + } + return self._protocol_classes_cache + + def protocol_dispatch_targets(self, callee_qn: str) -> set[tuple[str, str]]: + # (H) A call resolved to a Protocol stub method (P.M) never runs the stub: the + # (H) runtime receiver is some conformer, so the sound call graph emits an edge + # (H) to M on every non-Protocol class that defines it. Gating on the resolved + # (H) target being a Protocol method keeps this from firing on ordinary calls. 
+ class_qn, sep, method_name = callee_qn.rpartition(cs.SEPARATOR_DOT) + if not sep or class_qn not in self._protocol_classes(): + return set() + protocols = self._protocol_classes() + targets: set[tuple[str, str]] = set() + for qn in self.function_registry.find_ending_with(method_name): + definer, dot, name = qn.rpartition(cs.SEPARATOR_DOT) + if dot and name == method_name and definer not in protocols: + targets.add((self.function_registry[qn], qn)) + return targets + + def _redirect_protocol_method( + self, result: tuple[str, str] | None + ) -> tuple[str, str] | None: + if result is None: + return result + class_qn, sep, method_name = result[1].rpartition(cs.SEPARATOR_DOT) + if not sep: + return result + impl_qn = self._protocol_impl_map().get(class_qn) + if impl_qn is None: + return result + redirected = f"{impl_qn}{cs.SEPARATOR_DOT}{method_name}" + if redirected in self.function_registry: + return self.function_registry[redirected], redirected + return result + + def _resolve_function_call( + self, + call_name: str, + module_qn: str, + local_var_types: dict[str, str] | None = None, + class_context: str | None = None, + ) -> tuple[str, str] | None: + use_cache = not local_var_types + if use_cache: + cache_key = (call_name, module_qn) + if cache_key in self._simple_resolution_cache: + return self._simple_resolution_cache[cache_key] + if result := self._try_resolve_iife(call_name, module_qn): return result @@ -62,12 +251,24 @@ def resolve_function_call( if result := self._try_resolve_via_imports( call_name, module_qn, local_var_types ): + if use_cache: + self._simple_resolution_cache[cache_key] = result return result if result := self._try_resolve_same_module(call_name, module_qn): + if use_cache: + self._simple_resolution_cache[cache_key] = result return result - return self._try_resolve_via_trie(call_name, module_qn) + if class_context and ( + result := self._resolve_self_sibling_method(call_name, class_context) + ): + return result + + result = 
self._try_resolve_via_trie(call_name, module_qn) + if use_cache: + self._simple_resolution_cache[cache_key] = result + return result def _try_resolve_iife( self, call_name: str, module_qn: str @@ -119,9 +320,7 @@ def _try_resolve_direct_import( return None imported_qn = import_map[call_name] if imported_qn in self.function_registry: - logger.debug( - ls.CALL_DIRECT_IMPORT.format(call_name=call_name, qn=imported_qn) - ) + logger.debug(ls.CALL_DIRECT_IMPORT, call_name=call_name, qn=imported_qn) return self.function_registry[imported_qn], imported_qn return None @@ -132,10 +331,15 @@ def _try_resolve_qualified_call( module_qn: str, local_var_types: dict[str, str] | None, ) -> tuple[str, str] | None: - if not self._has_separator(call_name): + if cs.SEPARATOR_DOUBLE_COLON in call_name: + separator = cs.SEPARATOR_DOUBLE_COLON + elif cs.SEPARATOR_COLON in call_name: + separator = cs.SEPARATOR_COLON + elif cs.SEPARATOR_DOT in call_name: + separator = cs.SEPARATOR_DOT + else: return None - separator = self._get_separator(call_name) parts = call_name.split(separator) if len(parts) == 2: @@ -170,9 +374,17 @@ def _get_separator(self, call_name: str) -> str: def _try_resolve_wildcard_imports( self, call_name: str, import_map: dict[str, str] ) -> tuple[str, str] | None: - for local_name, imported_qn in import_map.items(): - if not local_name.startswith("*"): - continue + map_id = id(import_map) + if map_id not in self._wildcard_cache: + self._wildcard_cache[map_id] = ( + [(k, v) for k, v in import_map.items() if k[0] == "*"] + if import_map + else [] + ) + wildcards = self._wildcard_cache[map_id] + if not wildcards: + return None + for _, imported_qn in wildcards: if result := self._try_wildcard_qns(call_name, imported_qn): return result return None @@ -187,9 +399,7 @@ def _try_wildcard_qns( for wildcard_qn in potential_qns: if wildcard_qn in self.function_registry: - logger.debug( - ls.CALL_WILDCARD.format(call_name=call_name, qn=wildcard_qn) - ) + 
logger.debug(ls.CALL_WILDCARD, call_name=call_name, qn=wildcard_qn) return self.function_registry[wildcard_qn], wildcard_qn return None @@ -199,7 +409,7 @@ def _try_resolve_same_module( same_module_func_qn = f"{module_qn}.{call_name}" if same_module_func_qn in self.function_registry: logger.debug( - ls.CALL_SAME_MODULE.format(call_name=call_name, qn=same_module_func_qn) + ls.CALL_SAME_MODULE, call_name=call_name, qn=same_module_func_qn ) return self.function_registry[same_module_func_qn], same_module_func_qn return None @@ -207,19 +417,39 @@ def _try_resolve_same_module( def _try_resolve_via_trie( self, call_name: str, module_qn: str ) -> tuple[str, str] | None: - search_name = re.split(r"[.:]|::", call_name)[-1] + search_name = _SEARCH_NAME_CACHE.get(call_name) + if search_name is None: + search_name = _SEPARATOR_PATTERN.split(call_name)[-1] + _SEARCH_NAME_CACHE[call_name] = search_name possible_matches = self.function_registry.find_ending_with(search_name) if not possible_matches: - logger.debug(ls.CALL_UNRESOLVED.format(call_name=call_name)) + logger.debug(ls.CALL_UNRESOLVED, call_name=call_name) return None - possible_matches.sort( - key=lambda qn: self._calculate_import_distance(qn, module_qn) - ) - best_candidate_qn = possible_matches[0] - logger.debug( - ls.CALL_TRIE_FALLBACK.format(call_name=call_name, qn=best_candidate_qn) - ) + if len(possible_matches) == 1: + best_candidate_qn = possible_matches[0] + else: + caller_parts = module_qn.split(cs.SEPARATOR_DOT) + caller_len = len(caller_parts) + caller_parent_prefix = ( + cs.SEPARATOR_DOT.join(caller_parts[:-1]) + cs.SEPARATOR_DOT + if caller_len > 1 + else "" + ) + best_candidate_qn = min( + possible_matches, + key=lambda qn: ( + # (H) An @abstractmethod stub never runs when a concrete override + # (H) exists, so prefer concrete candidates over abstract ones + # (H) even when the abstract stub is closer by import distance. 
+ self.function_registry.is_abstract(qn), + self._import_distance_fast( + qn, caller_parts, caller_len, caller_parent_prefix + ), + qn, + ), + ) + logger.debug(ls.CALL_TRIE_FALLBACK, call_name=call_name, qn=best_candidate_qn) return self.function_registry[best_candidate_qn], best_candidate_qn def _resolve_two_part_call( @@ -293,23 +523,21 @@ def _try_method_on_class( method_qn = f"{class_qn}{separator}{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_TYPE_INFERRED.format( - call_name=call_name, - method_qn=method_qn, - obj=object_name, - var_type=var_type, - ) + ls.CALL_TYPE_INFERRED, + call_name=call_name, + method_qn=method_qn, + obj=object_name, + var_type=var_type, ) return self.function_registry[method_qn], method_qn if inherited := self._resolve_inherited_method(class_qn, method_name): logger.debug( - ls.CALL_TYPE_INFERRED_INHERITED.format( - call_name=call_name, - method_qn=inherited[1], - obj=object_name, - var_type=var_type, - ) + ls.CALL_TYPE_INFERRED_INHERITED, + call_name=call_name, + method_qn=inherited[1], + obj=object_name, + var_type=var_type, ) return inherited return None @@ -336,7 +564,7 @@ def _try_resolve_via_import( if method_qn in self.function_registry: logger.debug( - ls.CALL_IMPORT_STATIC.format(call_name=call_name, method_qn=method_qn) + ls.CALL_IMPORT_STATIC, call_name=call_name, method_qn=method_qn ) return self.function_registry[method_qn], method_qn return None @@ -377,7 +605,7 @@ def _try_resolve_module_method( method_qn = f"{module_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_OBJECT_METHOD.format(call_name=call_name, method_qn=method_qn) + ls.CALL_OBJECT_METHOD, call_name=call_name, method_qn=method_qn ) return self.function_registry[method_qn], method_qn return None @@ -401,12 +629,11 @@ def _resolve_self_attribute_call( method_qn = f"{class_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_INSTANCE_ATTR.format( - 
call_name=call_name, - method_qn=method_qn, - attr_ref=attribute_ref, - var_type=var_type, - ) + ls.CALL_INSTANCE_ATTR, + call_name=call_name, + method_qn=method_qn, + attr_ref=attribute_ref, + var_type=var_type, ) return self.function_registry[method_qn], method_qn @@ -414,12 +641,11 @@ def _resolve_self_attribute_call( class_qn, method_name ): logger.debug( - ls.CALL_INSTANCE_ATTR_INHERITED.format( - call_name=call_name, - method_qn=inherited_method[1], - attr_ref=attribute_ref, - var_type=var_type, - ) + ls.CALL_INSTANCE_ATTR_INHERITED, + call_name=call_name, + method_qn=inherited_method[1], + attr_ref=attribute_ref, + var_type=var_type, ) return inherited_method @@ -441,9 +667,9 @@ def _resolve_multi_part_call( method_qn = f"{class_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_IMPORT_QUALIFIED.format( - call_name=call_name, method_qn=method_qn - ) + ls.CALL_IMPORT_QUALIFIED, + call_name=call_name, + method_qn=method_qn, ) return self.function_registry[method_qn], method_qn @@ -455,12 +681,11 @@ def _resolve_multi_part_call( method_qn = f"{class_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_INSTANCE_QUALIFIED.format( - call_name=call_name, - method_qn=method_qn, - class_name=class_name, - var_type=var_type, - ) + ls.CALL_INSTANCE_QUALIFIED, + call_name=call_name, + method_qn=method_qn, + class_name=class_name, + var_type=var_type, ) return self.function_registry[method_qn], method_qn @@ -468,17 +693,75 @@ def _resolve_multi_part_call( class_qn, method_name ): logger.debug( - ls.CALL_INSTANCE_INHERITED.format( - call_name=call_name, - method_qn=inherited_method[1], - class_name=class_name, - var_type=var_type, - ) + ls.CALL_INSTANCE_INHERITED, + call_name=call_name, + method_qn=inherited_method[1], + class_name=class_name, + var_type=var_type, ) return inherited_method return None + def operator_dunder_targets( + self, + operand_text: str, + dunder: str, + module_qn: str, + local_var_types: 
dict[str, str] | None, + ) -> set[tuple[str, str]]: + # (H) Operator syntax dispatches to a dunder on the operand's type. Resolve only + # (H) when the operand type is known; never via the name-only trie fallback, so a + # (H) builtin container does not borrow a first-party dunder. A Protocol-typed + # (H) operand dispatches to the dunder on each structural implementer (which may + # (H) define the dunder even when the Protocol stub does not, e.g. __len__). + if not local_var_types or not (var_type := local_var_types.get(operand_text)): + return set() + import_map = self.import_processor.import_mapping.get(module_qn, {}) + class_qn = self._resolve_class_qn_from_type(var_type, import_map, module_qn) + if not class_qn: + return set() + if class_qn in self._protocol_classes(): + # (H) Naming convention (XxxProtocol -> Xxx) is robust when it applies; + # (H) structural conformance covers protocols whose implementer is named + # (H) differently. Union both so neither gap drops a concrete target. + classes = set(self._protocol_structural_implementers(class_qn)) + if named_impl := self._protocol_impl_map().get(class_qn): + classes.add(named_impl) + else: + classes = {class_qn} + targets: set[tuple[str, str]] = set() + for candidate in classes: + if resolved := self._try_resolve_method(candidate, dunder): + targets.add(resolved) + return targets + + def _protocol_structural_implementers(self, protocol_qn: str) -> set[str]: + # (H) Classes that define every method declared on the Protocol (own or + # (H) inherited). Used to dispatch operator dunders to the concrete type when the + # (H) Protocol/implementer names don't follow the XxxProtocol convention. 
+ if protocol_qn in self._struct_impl_cache: + return self._struct_impl_cache[protocol_qn] + sep = cs.SEPARATOR_DOT + protocol_methods = { + qn.rsplit(sep, 1)[-1] + for qn, node_type in self.function_registry.find_with_prefix(protocol_qn) + if node_type == NodeType.METHOD and qn.rsplit(sep, 1)[0] == protocol_qn + } + result: set[str] = set() + if protocol_methods: + protocols = self._protocol_classes() + for candidate in self.class_inheritance: + if candidate in protocols: + continue + if all( + self._try_resolve_method(candidate, method) + for method in protocol_methods + ): + result.add(candidate) + self._struct_impl_cache[protocol_qn] = result + return result + def resolve_builtin_call(self, call_name: str) -> tuple[str, str] | None: if call_name in cs.JS_BUILTIN_PATTERNS: return (cs.NodeLabel.FUNCTION, f"{cs.BUILTIN_PREFIX}.{call_name}") @@ -536,7 +819,7 @@ def _resolve_chained_call( module_qn: str, local_var_types: dict[str, str] | None = None, ) -> tuple[str, str] | None: - match = re.search(r"\.([^.()]+)$", call_name) + match = _CHAINED_METHOD_PATTERN.search(call_name) if not match: return None @@ -559,12 +842,11 @@ def _resolve_chained_call( if method_qn in self.function_registry: logger.debug( - ls.CALL_CHAINED.format( - call_name=call_name, - method_qn=method_qn, - obj_expr=object_expr, - obj_type=object_type, - ) + ls.CALL_CHAINED, + call_name=call_name, + method_qn=method_qn, + obj_expr=object_expr, + obj_type=object_type, ) return self.function_registry[method_qn], method_qn @@ -572,12 +854,11 @@ def _resolve_chained_call( full_object_type, final_method ): logger.debug( - ls.CALL_CHAINED_INHERITED.format( - call_name=call_name, - method_qn=inherited_method[1], - obj_expr=object_expr, - obj_type=object_type, - ) + ls.CALL_CHAINED_INHERITED, + call_name=call_name, + method_qn=inherited_method[1], + obj_expr=object_expr, + obj_type=object_type, ) return inherited_method @@ -596,45 +877,118 @@ def _resolve_super_call( current_class_qn = class_context if 
not current_class_qn: - logger.debug(ls.CALL_SUPER_NO_CONTEXT.format(call_name=call_name)) + logger.debug(ls.CALL_SUPER_NO_CONTEXT, call_name=call_name) return None if current_class_qn not in self.class_inheritance: - logger.debug(ls.CALL_SUPER_NO_INHERITANCE.format(class_qn=current_class_qn)) + logger.debug(ls.CALL_SUPER_NO_INHERITANCE, class_qn=current_class_qn) return None parent_classes = self.class_inheritance[current_class_qn] if not parent_classes: - logger.debug(ls.CALL_SUPER_NO_PARENTS.format(class_qn=current_class_qn)) + logger.debug(ls.CALL_SUPER_NO_PARENTS, class_qn=current_class_qn) return None if result := self._resolve_inherited_method(current_class_qn, method_name): callee_type, parent_method_qn = result logger.debug( - ls.CALL_SUPER_RESOLVED.format( - call_name=call_name, method_qn=parent_method_qn - ) + ls.CALL_SUPER_RESOLVED, + call_name=call_name, + method_qn=parent_method_qn, ) return callee_type, parent_method_qn logger.debug( - ls.CALL_SUPER_UNRESOLVED.format( - call_name=call_name, class_qn=current_class_qn - ) + ls.CALL_SUPER_UNRESOLVED, + call_name=call_name, + class_qn=current_class_qn, ) return None + def _resolve_self_sibling_method( + self, call_name: str, class_context: str + ) -> tuple[str, str] | None: + # (H) self.method() in a mixin may call a method defined on a SIBLING mixin + # (H) (neither is the other's base); both are combined into a concrete class. + # (H) Resolve through the concrete subclasses' MRO and accept the target only + # (H) when it is unambiguous, so an unrelated same-named method cannot win. 
+ parts = call_name.split(cs.SEPARATOR_DOT) + if len(parts) != 2 or parts[0] != cs.KEYWORD_SELF: + return None + method_name = parts[1] + candidates: set[str] = set() + for subclass_qn in self._concrete_subclasses(class_context): + candidates |= self._mro_method_qns(subclass_qn, method_name) + if not candidates: + return None + # (H) An @abstractmethod stub never runs when a concrete sibling implements the + # (H) method, so prefer concrete candidates; resolve only when unambiguous. + chosen = { + qn for qn in candidates if not self.function_registry.is_abstract(qn) + } or candidates + if len(chosen) != 1: + return None + method_qn = next(iter(chosen)) + logger.debug( + ls.CALL_INSTANCE_ATTR_INHERITED, + call_name=call_name, + method_qn=method_qn, + attr_ref=cs.KEYWORD_SELF, + var_type=class_context, + ) + return self.function_registry[method_qn], method_qn + + def _mro_method_qns(self, class_qn: str, method_name: str) -> set[str]: + results: set[str] = set() + visited: set[str] = set() + queue: deque[str] = deque([class_qn]) + while queue: + current = self._follow_reexports(queue.popleft()) + if current in visited: + continue + visited.add(current) + method_qn = f"{current}.{method_name}" + if method_qn in self.function_registry: + results.add(method_qn) + queue.extend(self.class_inheritance.get(current, ())) + return results + + def _subclass_map(self) -> dict[str, set[str]]: + if self._subclass_map_cache is None: + mapping: dict[str, set[str]] = defaultdict(set) + for subclass_qn, bases in self.class_inheritance.items(): + for base in bases: + mapping[self._follow_reexports(base)].add(subclass_qn) + self._subclass_map_cache = mapping + return self._subclass_map_cache + + def _concrete_subclasses(self, class_qn: str) -> set[str]: + subclass_map = self._subclass_map() + found: set[str] = set() + stack = list(subclass_map.get(class_qn, ())) + while stack: + current = stack.pop() + if current in found: + continue + found.add(current) + 
stack.extend(subclass_map.get(current, ())) + return found + def _resolve_inherited_method( self, class_qn: str, method_name: str ) -> tuple[str, str] | None: if class_qn not in self.class_inheritance: return None - queue = list(self.class_inheritance.get(class_qn, [])) - visited = set(queue) + bfs_queue = deque(self.class_inheritance.get(class_qn, [])) + visited = set(bfs_queue) - while queue: - parent_class_qn = queue.pop(0) + while bfs_queue: + # (H) Base classes are recorded by the name the subclass imported, which + # (H) may be a package re-export (class_ingest.ClassIngestMixin) rather than + # (H) the real definition (class_ingest.mixin.ClassIngestMixin); follow the + # (H) re-export so the inherited method qn matches the registry. + parent_class_qn = self._follow_reexports(bfs_queue.popleft()) parent_method_qn = f"{parent_class_qn}.{method_name}" if parent_method_qn in self.function_registry: @@ -647,7 +1001,7 @@ def _resolve_inherited_method( for grandparent_qn in self.class_inheritance[parent_class_qn]: if grandparent_qn not in visited: visited.add(grandparent_qn) - queue.append(grandparent_qn) + bfs_queue.append(grandparent_qn) return None @@ -673,6 +1027,30 @@ def _calculate_import_distance( return base_distance + def _import_distance_fast( + self, + candidate_qn: str, + caller_parts: list[str], + caller_len: int, + caller_parent_prefix: str, + ) -> int: + if candidate_qn in _QN_SPLIT_CACHE: + candidate_parts, candidate_len = _QN_SPLIT_CACHE[candidate_qn] + else: + candidate_parts = candidate_qn.split(cs.SEPARATOR_DOT) + candidate_len = len(candidate_parts) + _QN_SPLIT_CACHE[candidate_qn] = (candidate_parts, candidate_len) + common_prefix = 0 + for i in range(min(caller_len, candidate_len)): + if caller_parts[i] == candidate_parts[i]: + common_prefix += 1 + else: + break + base_distance = max(caller_len, candidate_len) - common_prefix + if caller_parent_prefix and candidate_qn.startswith(caller_parent_prefix): + base_distance -= 1 + return base_distance 
+ def _resolve_class_name(self, class_name: str, module_qn: str) -> str | None: return resolve_class_name( class_name, module_qn, self.import_processor, self.function_registry @@ -682,7 +1060,7 @@ def resolve_java_method_call( self, call_node: Node, module_qn: str, - local_var_types: dict[str, str], + local_var_types: dict[str, str] | None, ) -> tuple[str, str] | None: java_engine = self.type_inference.java_type_inference @@ -697,7 +1075,7 @@ def resolve_java_method_call( else cs.TEXT_UNKNOWN ) logger.debug( - ls.CALL_JAVA_RESOLVED.format(call_text=call_text, method_qn=result[1]) + ls.CALL_JAVA_RESOLVED, call_text=call_text, method_qn=result[1] ) return result diff --git a/codebase_rag/parsers/class_ingest/cpp_modules.py b/codebase_rag/parsers/class_ingest/cpp_modules.py index a5db9bc47..afae6d901 100644 --- a/codebase_rag/parsers/class_ingest/cpp_modules.py +++ b/codebase_rag/parsers/class_ingest/cpp_modules.py @@ -8,6 +8,7 @@ from ... import constants as cs from ... import logs +from ...utils.path_utils import cached_relative_path, cached_resolve_posix from ..utils import safe_decode_text, safe_decode_with_fallback from .utils import decode_node_stripped @@ -41,7 +42,7 @@ def ingest_cpp_module_declarations( def _find_module_declarations(root_node: Node) -> list[tuple[Node, str]]: module_declarations: list[tuple[Node, str]] = [] - def find_declarations(node: Node) -> None: + for node in root_node.children: if node.type == cs.TS_MODULE_DECLARATION: module_declarations.append((node, decode_node_stripped(node))) elif node.type == cs.CppNodeType.DECLARATION: @@ -56,10 +57,6 @@ def find_declarations(node: Node) -> None: if has_module: module_declarations.append((node, decode_node_stripped(node))) - for child in node.children: - find_declarations(child) - - find_declarations(root_node) return module_declarations @@ -83,7 +80,8 @@ def _process_export_module( { cs.KEY_QUALIFIED_NAME: interface_qn, cs.KEY_NAME: module_name, - cs.KEY_PATH: 
str(file_path.relative_to(repo_path)), + cs.KEY_PATH: cached_relative_path(file_path, repo_path).as_posix(), + cs.KEY_ABSOLUTE_PATH: cached_resolve_posix(file_path), cs.KEY_MODULE_TYPE: cs.CPP_MODULE_TYPE_INTERFACE, }, ) @@ -117,7 +115,8 @@ def _process_module_implementation( { cs.KEY_QUALIFIED_NAME: impl_qn, cs.KEY_NAME: f"{module_name}{cs.CPP_IMPL_SUFFIX}", - cs.KEY_PATH: str(file_path.relative_to(repo_path)), + cs.KEY_PATH: cached_relative_path(file_path, repo_path).as_posix(), + cs.KEY_ABSOLUTE_PATH: cached_resolve_posix(file_path), cs.KEY_IMPLEMENTS_MODULE: module_name, cs.KEY_MODULE_TYPE: cs.CPP_MODULE_TYPE_IMPLEMENTATION, }, @@ -141,27 +140,27 @@ def _process_module_implementation( def find_cpp_exported_classes(root_node: Node) -> list[Node]: exported_class_nodes: list[Node] = [] + stack = list(root_node.children) - def traverse(node: Node) -> None: + while stack: + node = stack.pop() if node.type == cs.CppNodeType.FUNCTION_DEFINITION: node_text = decode_node_stripped(node) if node_text.startswith(cs.CPP_EXPORT_PREFIXES): + found = False for child in node.children: if child.type == cs.TS_ERROR and child.text: error_text = safe_decode_text(child) if error_text in cs.CPP_EXPORTED_CLASS_KEYWORDS: exported_class_nodes.append(node) + found = True break - else: - if ( - cs.CPP_EXPORT_CLASS_PREFIX in node_text - or cs.CPP_EXPORT_STRUCT_PREFIX in node_text - ): - exported_class_nodes.append(node) - - for child in node.children: - traverse(child) + if not found and ( + cs.CPP_EXPORT_CLASS_PREFIX in node_text + or cs.CPP_EXPORT_STRUCT_PREFIX in node_text + ): + exported_class_nodes.append(node) + stack.extend(node.children) - traverse(root_node) return exported_class_nodes diff --git a/codebase_rag/parsers/class_ingest/identity.py b/codebase_rag/parsers/class_ingest/identity.py index 85f670444..fc5ba13c6 100644 --- a/codebase_rag/parsers/class_ingest/identity.py +++ b/codebase_rag/parsers/class_ingest/identity.py @@ -7,7 +7,6 @@ from ... 
import constants as cs from ...language_spec import LANGUAGE_FQN_SPECS -from ...utils.fqn_resolver import resolve_fqn_from_ast from ..cpp import utils as cpp_utils from ..rs import utils as rs_utils from ..utils import safe_decode_text @@ -22,18 +21,23 @@ def resolve_class_identity( language: cs.SupportedLanguage, lang_config: LanguageSpec, file_path: Path | None, - repo_path: Path, - project_name: str, ) -> tuple[str, str, bool] | None: if (fqn_config := LANGUAGE_FQN_SPECS.get(language)) and file_path: - if class_qn := resolve_fqn_from_ast( - class_node, - file_path, - repo_path, - project_name, - fqn_config, - ): - class_name = class_qn.split(cs.SEPARATOR_DOT)[-1] + class_name = fqn_config.get_name(class_node) + if class_name: + parts = [class_name] + current = class_node.parent + while current: + if current.type in fqn_config.scope_node_types: + if scope_name := fqn_config.get_name(current): + parts.append(scope_name) + current = current.parent + parts.reverse() + + # (H) Use the module's already-resolved (and collision-disambiguated) + # (H) qualified name as the prefix rather than recomputing from the path, + # (H) so same-stem cross-language siblings get distinct class/method qns. 
+ class_qn = module_qn + cs.SEPARATOR_DOT + cs.SEPARATOR_DOT.join(parts) is_exported = language == cs.SupportedLanguage.CPP and ( class_node.type == cs.CppNodeType.FUNCTION_DEFINITION or cpp_utils.is_exported(class_node) diff --git a/codebase_rag/parsers/class_ingest/method_override.py b/codebase_rag/parsers/class_ingest/method_override.py index 686ff26e6..9dfc8bedf 100644 --- a/codebase_rag/parsers/class_ingest/method_override.py +++ b/codebase_rag/parsers/class_ingest/method_override.py @@ -66,9 +66,9 @@ def check_method_overrides( (cs.NodeLabel.METHOD, cs.KEY_QUALIFIED_NAME, parent_method_qn), ) logger.debug( - logs.CLASS_METHOD_OVERRIDE.format( - method_qn=method_qn, parent_method_qn=parent_method_qn - ) + logs.CLASS_METHOD_OVERRIDE, + method_qn=method_qn, + parent_method_qn=parent_method_qn, ) return diff --git a/codebase_rag/parsers/class_ingest/mixin.py b/codebase_rag/parsers/class_ingest/mixin.py index 2ba3f8f8c..d2f189e00 100644 --- a/codebase_rag/parsers/class_ingest/mixin.py +++ b/codebase_rag/parsers/class_ingest/mixin.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import abstractmethod +from bisect import bisect_left, bisect_right from pathlib import Path from typing import TYPE_CHECKING @@ -9,11 +10,14 @@ from ... import constants as cs from ... import logs +from ...config import settings +from ...language_spec import LanguageSpec from ...types_defs import ASTNode, PropertyDict +from ...utils.path_utils import cached_relative_path, cached_resolve_posix from ..java import utils as java_utils from ..py import resolve_class_name from ..rs import utils as rs_utils -from ..utils import ingest_method, safe_decode_text +from ..utils import ingest_method, safe_decode_text, sorted_captures from . import cpp_modules from . import identity as id_ from . import method_override as mo @@ -21,7 +25,6 @@ from . 
import relationships as rel if TYPE_CHECKING: - from ...language_spec import LanguageSpec from ...services import IngestorProtocol from ...types_defs import ( FunctionRegistryTrieProtocol, @@ -31,7 +34,46 @@ from ..import_processor import ImportProcessor +def _is_nested_inside_function( + node: Node, class_body: Node, lang_config: LanguageSpec +) -> bool: + current = node.parent + while current and current is not class_body: + if ( + current.type in lang_config.function_node_types + and current.child_by_field_name(cs.FIELD_BODY) is not None + ): + return True + current = current.parent + return False + + +def _method_belongs_directly( + method_node: Node, class_node: Node, lang_config: LanguageSpec +) -> bool: + current = method_node.parent + while current is not None: + if current == class_node: + return True + if current.type in lang_config.class_node_types or ( + current.type in lang_config.function_node_types + and current.child_by_field_name(cs.FIELD_BODY) is not None + ): + return False + current = current.parent + return False + + +def _skip_method( + method_node: Node, class_node: Node, class_body: Node, lang_config: LanguageSpec +) -> bool: + if settings.CAPTURE_FUNCTION_LOCAL_DEFINITIONS: + return not _method_belongs_directly(method_node, class_node, lang_config) + return _is_nested_inside_function(method_node, class_body, lang_config) + + class ClassIngestMixin: + __slots__ = () ingestor: IngestorProtocol repo_path: Path project_name: str @@ -47,6 +89,16 @@ def _get_docstring(self, node: ASTNode) -> str | None: ... @abstractmethod def _extract_decorators(self, node: ASTNode) -> list[str]: ... + @abstractmethod + def _determine_function_parent( + self, + func_node: Node, + func_qn: str, + module_qn: str, + lang_config: LanguageSpec, + language: cs.SupportedLanguage | None = None, + ) -> tuple[str, str]: ... 
+ def _resolve_to_qn(self, name: str, module_qn: str) -> str: return self._resolve_class_name(name, module_qn) or f"{module_qn}.{name}" @@ -74,32 +126,44 @@ def _ingest_classes_and_methods( module_qn: str, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + combined_captures: dict[str, list] | None = None, ) -> None: lang_queries = queries[language] - if not (query := lang_queries[cs.QUERY_CLASSES]): - return - lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] - cursor = QueryCursor(query) - captures = cursor.captures(root_node) - class_nodes = captures.get(cs.CAPTURE_CLASS, []) - module_nodes = captures.get(cs.ONEOF_MODULE, []) + + if combined_captures is not None: + class_nodes = list(combined_captures.get(cs.CAPTURE_CLASS, [])) + module_nodes = combined_captures.get(cs.ONEOF_MODULE, []) + else: + if not (query := lang_queries[cs.QUERY_CLASSES]): + return + cursor = QueryCursor(query) + captures = sorted_captures(cursor, root_node) + class_nodes = captures.get(cs.CAPTURE_CLASS, []) + module_nodes = captures.get(cs.ONEOF_MODULE, []) if language == cs.SupportedLanguage.CPP: class_nodes.extend(self._find_cpp_exported_classes(root_node)) file_path = self.module_qn_to_file_path.get(module_qn) + sorted_func_nodes: list[Node] | None = None + func_node_starts: list[int] | None = None + if combined_captures is not None and cs.CAPTURE_FUNCTION in combined_captures: + sorted_func_nodes = combined_captures[cs.CAPTURE_FUNCTION] + func_node_starts = [n.start_byte for n in sorted_func_nodes] + for class_node in class_nodes: - if isinstance(class_node, Node): - self._process_class_node( - class_node, - module_qn, - language, - lang_queries, - lang_config, - file_path, - ) + self._process_class_node( + class_node, + module_qn, + language, + lang_queries, + lang_config, + file_path, + sorted_func_nodes=sorted_func_nodes, + func_node_starts=func_node_starts, + ) self._process_inline_modules(module_nodes, module_qn, lang_config) @@ -111,10 
+175,17 @@ def _process_class_node( lang_queries: LanguageQueries, lang_config: LanguageSpec, file_path: Path | None, + sorted_func_nodes: list[Node] | None = None, + func_node_starts: list[int] | None = None, ) -> None: if language == cs.SupportedLanguage.RUST and class_node.type == cs.TS_IMPL_ITEM: self._ingest_rust_impl_methods( - class_node, module_qn, language, lang_queries + class_node, + module_qn, + language, + lang_queries, + sorted_func_nodes=sorted_func_nodes, + func_node_starts=func_node_starts, ) return @@ -124,13 +195,14 @@ def _process_class_node( language, lang_config, file_path, - self.repo_path, - self.project_name, ) if not identity: return class_qn, class_name, is_exported = identity + class_qn = self.function_registry.register_unique_qn( + class_qn, class_node.start_point[0] + 1 + ) node_type = nt.determine_node_type(class_node, class_name, class_qn, language) class_props: PropertyDict = { @@ -142,15 +214,25 @@ def _process_class_node( cs.KEY_DOCSTRING: self._get_docstring(class_node), cs.KEY_IS_EXPORTED: is_exported, } + if file_path is not None: + class_props[cs.KEY_PATH] = cached_relative_path( + file_path, self.repo_path + ).as_posix() + class_props[cs.KEY_ABSOLUTE_PATH] = cached_resolve_posix(file_path) self.ingestor.ensure_node_batch(node_type, class_props) self.function_registry[class_qn] = node_type if class_name: self.simple_name_lookup[class_name].add(class_qn) + parent_label, parent_qn = self._determine_function_parent( + class_node, class_qn, module_qn, lang_config, language + ) rel.create_class_relationships( class_node, class_qn, module_qn, + parent_label, + parent_qn, node_type, is_exported, language, @@ -160,7 +242,15 @@ def _process_class_node( self._resolve_to_qn, self.function_registry, ) - self._ingest_class_methods(class_node, class_qn, language, lang_queries) + self._ingest_class_methods( + class_node, + class_qn, + language, + lang_queries, + file_path, + sorted_func_nodes=sorted_func_nodes, + 
func_node_starts=func_node_starts, + ) def _ingest_rust_impl_methods( self, @@ -168,31 +258,83 @@ def _ingest_rust_impl_methods( module_qn: str, language: cs.SupportedLanguage, lang_queries: LanguageQueries, + sorted_func_nodes: list[Node] | None = None, + func_node_starts: list[int] | None = None, ) -> None: if not (impl_target := rs_utils.extract_impl_target(class_node)): return - class_qn = f"{module_qn}.{impl_target}" + # (H) An impl block inside `mod inner` targets a type whose node lives + # (H) under the module path (proj...inner.Widget). Resolve the impl target + # (H) against its enclosing module so the method binds to the real type + # (H) node instead of a phantom under the file module. + mod_parts = rs_utils.build_module_path(class_node) + owner_module_qn = ( + f"{module_qn}{cs.SEPARATOR_DOT}{cs.SEPARATOR_DOT.join(mod_parts)}" + if mod_parts + else module_qn + ) + class_qn = f"{owner_module_qn}.{impl_target}" + + # (H) `impl Trait for Type` means Type IMPLEMENTS Trait. The target type's + # (H) node label may be Class/Enum/Type, so match the relationship source + # (H) to its registered label (else the IMPLEMENTS edge never resolves). 
+ if trait_name := rs_utils.extract_impl_trait(class_node): + owner_type = self.function_registry.get(class_qn) + owner_label = ( + cs.NodeLabel(owner_type.value) + if owner_type is not None + else cs.NodeLabel.CLASS + ) + self.ingestor.ensure_relationship_batch( + (owner_label, cs.KEY_QUALIFIED_NAME, class_qn), + cs.RelationshipType.IMPLEMENTS, + ( + cs.NodeLabel.INTERFACE, + cs.KEY_QUALIFIED_NAME, + self._resolve_to_qn(trait_name, owner_module_qn), + ), + ) + body_node = class_node.child_by_field_name("body") - method_query = lang_queries[cs.QUERY_FUNCTIONS] - if not body_node or not method_query: + if not body_node: return - method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) - for method_node in method_captures.get(cs.CAPTURE_FUNCTION, []): - if isinstance(method_node, Node): - ingest_method( - method_node, - class_qn, - cs.NodeLabel.CLASS, - self.ingestor, - self.function_registry, - self.simple_name_lookup, - self._get_docstring, - language, - ) + file_path = self.module_qn_to_file_path.get(module_qn) + lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] + + if sorted_func_nodes is not None and func_node_starts is not None: + body_start = body_node.start_byte + body_end = body_node.end_byte + lo = bisect_left(func_node_starts, body_start) + hi = bisect_right(func_node_starts, body_end) + method_nodes = [ + n for n in sorted_func_nodes[lo:hi] if n.end_byte <= body_end + ] + else: + method_query = lang_queries[cs.QUERY_FUNCTIONS] + if not method_query: + return + method_cursor = QueryCursor(method_query) + method_captures = sorted_captures(method_cursor, body_node) + method_nodes = method_captures.get(cs.CAPTURE_FUNCTION, []) + + for method_node in method_nodes: + if _skip_method(method_node, class_node, body_node, lang_config): + continue + ingest_method( + method_node, + class_qn, + cs.NodeLabel.CLASS, + self.ingestor, + self.function_registry, + self.simple_name_lookup, + self._get_docstring, + language, + 
file_path=file_path, + repo_path=self.repo_path, + ) def _ingest_class_methods( self, @@ -200,16 +342,34 @@ def _ingest_class_methods( class_qn: str, language: cs.SupportedLanguage, lang_queries: LanguageQueries, + file_path: Path | None = None, + sorted_func_nodes: list[Node] | None = None, + func_node_starts: list[int] | None = None, ) -> None: body_node = class_node.child_by_field_name("body") - method_query = lang_queries[cs.QUERY_FUNCTIONS] - if not body_node or not method_query: + if not body_node: return - method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) - for method_node in method_captures.get(cs.CAPTURE_FUNCTION, []): - if not isinstance(method_node, Node): + lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] + + if sorted_func_nodes is not None and func_node_starts is not None: + body_start = body_node.start_byte + body_end = body_node.end_byte + lo = bisect_left(func_node_starts, body_start) + hi = bisect_right(func_node_starts, body_end) + method_nodes = [ + n for n in sorted_func_nodes[lo:hi] if n.end_byte <= body_end + ] + else: + method_query = lang_queries[cs.QUERY_FUNCTIONS] + if not method_query: + return + method_cursor = QueryCursor(method_query) + method_captures = sorted_captures(method_cursor, body_node) + method_nodes = method_captures.get(cs.CAPTURE_FUNCTION, []) + + for method_node in method_nodes: + if _skip_method(method_node, class_node, body_node, lang_config): continue method_qualified_name = None @@ -233,6 +393,8 @@ def _ingest_class_methods( language, self._extract_decorators, method_qualified_name, + file_path=file_path, + repo_path=self.repo_path, ) def _process_inline_modules( @@ -249,6 +411,13 @@ def _process_inline_modules( if not module_name_node.text: continue + # (H) A bodyless `mod foo;` only declares that the file module foo.rs + # (H) belongs here; foo.rs already yields its own real-path Module node + # (H) with the same qn. 
Emitting a second synthetic-path node collides + # (H) on that qn and clobbers the file's real path, so skip it. + if module_node.child_by_field_name(cs.FIELD_BODY) is None: + continue + module_name = safe_decode_text(module_name_node) nested_qn = id_.build_nested_qualified_name_for_class( module_node, module_qn, module_name or "", lang_config @@ -259,7 +428,17 @@ def _process_inline_modules( cs.KEY_QUALIFIED_NAME: inline_module_qn, cs.KEY_NAME: module_name, cs.KEY_PATH: f"{cs.INLINE_MODULE_PATH_PREFIX}{module_name}", + cs.KEY_START_LINE: module_node.start_point[0] + 1, + cs.KEY_END_LINE: module_node.end_point[0] + 1, } + # (H) A bodied inline module is physically located in this file; give + # (H) it the real path so it joins containment on (file, line). + file_path = self.module_qn_to_file_path.get(module_qn) + if file_path is not None: + module_props[cs.KEY_PATH] = cached_relative_path( + file_path, self.repo_path + ).as_posix() + module_props[cs.KEY_ABSOLUTE_PATH] = cached_resolve_posix(file_path) logger.info( logs.CLASS_FOUND_INLINE_MODULE.format( name=module_name, qn=inline_module_qn @@ -267,6 +446,17 @@ def _process_inline_modules( ) self.ingestor.ensure_node_batch(cs.NodeLabel.MODULE, module_props) + # (H) Link the inline module into the containment tree: its enclosing + # (H) module (file module, or an outer mod) DEFINES it. Without this the + # (H) inline Module node is an orphan defining nothing. 
+ parent_module_qn = inline_module_qn.rsplit(cs.SEPARATOR_DOT, 1)[0] + if parent_module_qn and parent_module_qn != inline_module_qn: + self.ingestor.ensure_relationship_batch( + (cs.NodeLabel.MODULE, cs.KEY_QUALIFIED_NAME, parent_module_qn), + cs.RelationshipType.DEFINES, + (cs.NodeLabel.MODULE, cs.KEY_QUALIFIED_NAME, inline_module_qn), + ) + def process_all_method_overrides(self) -> None: mo.process_all_method_overrides( self.function_registry, diff --git a/codebase_rag/parsers/class_ingest/node_type.py b/codebase_rag/parsers/class_ingest/node_type.py index 8cdf66d78..7485ab66b 100644 --- a/codebase_rag/parsers/class_ingest/node_type.py +++ b/codebase_rag/parsers/class_ingest/node_type.py @@ -16,19 +16,28 @@ def determine_node_type( language: cs.SupportedLanguage, ) -> NodeType: match class_node.type: - case cs.TS_INTERFACE_DECLARATION: + case cs.TS_GO_TYPE_SPEC | cs.TS_GO_TYPE_ALIAS if ( + language == cs.SupportedLanguage.GO + ): + return _go_type_node_type(class_node, class_name, class_qn) + case cs.TS_INTERFACE_DECLARATION | cs.TS_RS_TRAIT_ITEM: logger.info(logs.CLASS_FOUND_INTERFACE.format(name=class_name, qn=class_qn)) return NodeType.INTERFACE - case cs.TS_ENUM_DECLARATION | cs.TS_ENUM_SPECIFIER | cs.TS_ENUM_CLASS_SPECIFIER: + case ( + cs.TS_ENUM_DECLARATION + | cs.TS_ENUM_SPECIFIER + | cs.TS_ENUM_CLASS_SPECIFIER + | cs.TS_RS_ENUM_ITEM + ): logger.info(logs.CLASS_FOUND_ENUM.format(name=class_name, qn=class_qn)) return NodeType.ENUM - case cs.TS_TYPE_ALIAS_DECLARATION: + case cs.TS_TYPE_ALIAS_DECLARATION | cs.TS_RS_TYPE_ITEM: logger.info(logs.CLASS_FOUND_TYPE.format(name=class_name, qn=class_qn)) return NodeType.TYPE - case cs.TS_STRUCT_SPECIFIER: + case cs.TS_STRUCT_SPECIFIER | cs.TS_RS_STRUCT_ITEM: logger.info(logs.CLASS_FOUND_STRUCT.format(name=class_name, qn=class_qn)) return NodeType.CLASS - case cs.TS_UNION_SPECIFIER: + case cs.TS_UNION_SPECIFIER | cs.TS_RS_UNION_ITEM: logger.info(logs.CLASS_FOUND_UNION.format(name=class_name, qn=class_qn)) return 
NodeType.UNION case cs.CppNodeType.TEMPLATE_DECLARATION: @@ -47,6 +56,22 @@ def determine_node_type( return NodeType.CLASS +def _go_type_node_type( + class_node: Node, class_name: str | None, class_qn: str +) -> NodeType: + underlying = class_node.child_by_field_name(cs.FIELD_TYPE) + match underlying.type if underlying else None: + case cs.TS_GO_STRUCT_TYPE: + logger.info(logs.CLASS_FOUND_STRUCT.format(name=class_name, qn=class_qn)) + return NodeType.CLASS + case cs.TS_GO_INTERFACE_TYPE: + logger.info(logs.CLASS_FOUND_INTERFACE.format(name=class_name, qn=class_qn)) + return NodeType.INTERFACE + case _: + logger.info(logs.CLASS_FOUND_TYPE.format(name=class_name, qn=class_qn)) + return NodeType.TYPE + + def log_exported_class_type( class_node: Node, class_name: str | None, class_qn: str ) -> None: diff --git a/codebase_rag/parsers/class_ingest/parent_extraction.py b/codebase_rag/parsers/class_ingest/parent_extraction.py index 289e82c35..fd8673748 100644 --- a/codebase_rag/parsers/class_ingest/parent_extraction.py +++ b/codebase_rag/parsers/class_ingest/parent_extraction.py @@ -16,6 +16,21 @@ from ..import_processor import ImportProcessor +def php_base_simple_name(node: Node) -> str | None: + # (H) A PHP base type is a plain `name` (`Base`) or a `qualified_name` + # (H) (`\Exception`, `\App\Base`) whose trailing `name` child is the simple + # (H) name; cgr resolves bases by simple name. + if node.type == cs.TS_PHP_NAME and node.text: + return safe_decode_text(node) + if node.type == cs.TS_PHP_QUALIFIED_NAME: + last: Node | None = None + for child in node.children: + if child.type == cs.TS_PHP_NAME: + last = child + return safe_decode_text(last) if last and last.text else None + return None + + def extract_parent_classes( class_node: Node, module_qn: str, @@ -52,6 +67,22 @@ def extract_parent_classes( ) ) + # (H) PHP `extends` (a class's superclass or an interface's superinterfaces) + # (H) is a base_clause listing `name` nodes; both are inheritance. 
+ if base_clause := find_child_by_type(class_node, cs.TS_PHP_BASE_CLAUSE): + for child in base_clause.children: + if parent_name := php_base_simple_name(child): + parent_classes.append(resolve_to_qn(parent_name, module_qn)) + + # (H) Rust supertrait bound (`trait Sub: Super`) is inheritance between traits. + if class_node.type == cs.TS_RS_TRAIT_ITEM: + if bounds := class_node.child_by_field_name(cs.FIELD_BOUNDS): + for child in bounds.children: + base = java_base_type_identifier(child) + if base is not None and base.text: + if name := safe_decode_text(base): + parent_classes.append(resolve_to_qn(name, module_qn)) + return parent_classes @@ -90,9 +121,9 @@ def parse_cpp_base_classes( ) parent_classes.append(parent_qn) logger.debug( - logs.CLASS_CPP_INHERITANCE.format( - parent_name=parent_name, parent_qn=parent_qn - ) + logs.CLASS_CPP_INHERITANCE, + parent_name=parent_name, + parent_qn=parent_qn, ) return parent_classes @@ -108,13 +139,38 @@ def extract_cpp_base_class_name(parent_text: str) -> str: return parent_text +def java_base_type_identifier(type_node: Node) -> Node | None: + # (H) The base type in a Java extends/implements clause may be plain + # (H) (`Base`), generic (`Base` -> generic_type), or qualified + # (H) (`pkg.Base` -> scoped_type_identifier). Unwrap to the base type's + # (H) type_identifier so generic/qualified bases are captured, not dropped. + if type_node.type == cs.TS_TYPE_IDENTIFIER: + return type_node + if type_node.type == cs.TS_GENERIC_TYPE: + for child in type_node.children: + if child.type in ( + cs.TS_TYPE_IDENTIFIER, + cs.TS_RS_SCOPED_TYPE_IDENTIFIER, + ): + return java_base_type_identifier(child) + if type_node.type == cs.TS_RS_SCOPED_TYPE_IDENTIFIER: + # (H) `a.b.Base` -> the trailing type_identifier is the simple name. 
+ last: Node | None = None + for child in type_node.children: + if child.type == cs.TS_TYPE_IDENTIFIER: + last = child + return last + return None + + def resolve_superclass_from_type_identifier( type_identifier_node: Node, module_qn: str, resolve_to_qn: Callable[[str, str], str], ) -> str | None: - if type_identifier_node.text: - if parent_name := safe_decode_text(type_identifier_node): + base = java_base_type_identifier(type_identifier_node) + if base is not None and base.text: + if parent_name := safe_decode_text(base): return resolve_to_qn(parent_name, module_qn) return None @@ -128,7 +184,12 @@ def extract_java_superclass( if not superclass_node: return [] - if superclass_node.type == cs.TS_TYPE_IDENTIFIER: + _JAVA_BASE_TYPES = ( + cs.TS_TYPE_IDENTIFIER, + cs.TS_GENERIC_TYPE, + cs.TS_RS_SCOPED_TYPE_IDENTIFIER, + ) + if superclass_node.type in _JAVA_BASE_TYPES: if resolved := resolve_superclass_from_type_identifier( superclass_node, module_qn, resolve_to_qn ): @@ -136,7 +197,7 @@ def extract_java_superclass( return [] for child in superclass_node.children: - if child.type == cs.TS_TYPE_IDENTIFIER: + if child.type in _JAVA_BASE_TYPES: if resolved := resolve_superclass_from_type_identifier( child, module_qn, resolve_to_qn ): @@ -158,17 +219,19 @@ def extract_python_superclasses( import_map = import_processor.import_mapping.get(module_qn) for child in superclasses_node.children: - if child.type != cs.TS_IDENTIFIER or not child.text: + if child.type not in (cs.TS_IDENTIFIER, cs.TS_PY_ATTRIBUTE) or not child.text: continue if not (parent_name := safe_decode_text(child)): continue - if import_map and parent_name in import_map: - parent_classes.append(import_map[parent_name]) + head, sep, tail = parent_name.partition(cs.SEPARATOR_DOT) + if import_map and head in import_map: + resolved_head = import_map[head] elif import_map: - parent_classes.append(resolve_to_qn(parent_name, module_qn)) + resolved_head = resolve_to_qn(head, module_qn) else: - 
parent_classes.append(f"{module_qn}.{parent_name}") + resolved_head = f"{module_qn}.{head}" + parent_classes.append(f"{resolved_head}{sep}{tail}") return parent_classes @@ -238,6 +301,13 @@ def extract_interface_parents( import_processor: ImportProcessor, resolve_to_qn: Callable[[str, str], str], ) -> list[str]: + # (H) Java interface `extends A, B` is an `extends_interfaces` clause holding a + # (H) type_list; superinterfaces are inheritance, so emit them as INHERITS. + if java_extends := find_child_by_type(class_node, cs.TS_JAVA_EXTENDS_INTERFACES): + parents: list[str] = [] + extract_java_interface_names(java_extends, parents, module_qn, resolve_to_qn) + return parents + extends_clause = find_child_by_type(class_node, cs.TS_EXTENDS_TYPE_CLAUSE) if not extends_clause: return [] @@ -310,6 +380,23 @@ def extract_implemented_interfaces( interfaces_node, implemented_interfaces, module_qn, resolve_to_qn ) + # (H) TypeScript `class C implements I, J` lives in class_heritage > + # (H) implements_clause (no `interfaces` field), holding type_identifiers. + if class_heritage := find_child_by_type(class_node, cs.TS_CLASS_HERITAGE): + if implements_clause := find_child_by_type( + class_heritage, cs.TS_IMPLEMENTS_CLAUSE + ): + for child in implements_clause.children: + if child.type == cs.TS_TYPE_IDENTIFIER and child.text: + if name := safe_decode_text(child): + implemented_interfaces.append(resolve_to_qn(name, module_qn)) + + # (H) PHP `class C implements I, J` is a class_interface_clause of `name` nodes. 
+ if php_impl := find_child_by_type(class_node, cs.TS_PHP_CLASS_INTERFACE_CLAUSE): + for child in php_impl.children: + if name := php_base_simple_name(child): + implemented_interfaces.append(resolve_to_qn(name, module_qn)) + return implemented_interfaces @@ -322,6 +409,10 @@ def extract_java_interface_names( for child in interfaces_node.children: if child.type == cs.TS_TYPE_LIST: for type_child in child.children: - if type_child.type == cs.TS_TYPE_IDENTIFIER and type_child.text: - if interface_name := safe_decode_text(type_child): + # (H) Unwrap generic/qualified bases (`TBase`, `pkg.IScheme`) to + # (H) the base type_identifier; plain identifiers pass straight + # (H) through. Skips list punctuation (commas). + base = java_base_type_identifier(type_child) + if base is not None and base.text: + if interface_name := safe_decode_text(base): interface_list.append(resolve_to_qn(interface_name, module_qn)) diff --git a/codebase_rag/parsers/class_ingest/relationships.py b/codebase_rag/parsers/class_ingest/relationships.py index 6af794fac..07daccf37 100644 --- a/codebase_rag/parsers/class_ingest/relationships.py +++ b/codebase_rag/parsers/class_ingest/relationships.py @@ -19,6 +19,8 @@ def create_class_relationships( class_node: Node, class_qn: str, module_qn: str, + parent_label: str, + parent_qn: str, node_type: NodeType, is_exported: bool, language: cs.SupportedLanguage, @@ -34,7 +36,7 @@ def create_class_relationships( class_inheritance[class_qn] = parent_classes ingestor.ensure_relationship_batch( - (cs.NodeLabel.MODULE, cs.KEY_QUALIFIED_NAME, module_qn), + (parent_label, cs.KEY_QUALIFIED_NAME, parent_qn), cs.RelationshipType.DEFINES, (node_type, cs.KEY_QUALIFIED_NAME, class_qn), ) @@ -51,7 +53,9 @@ def create_class_relationships( node_type, class_qn, parent_class_qn, function_registry, ingestor ) - if class_node.type == cs.TS_CLASS_DECLARATION: + # (H) A class OR an enum can `implements` interfaces; both expose them via the + # (H) `interfaces` field (a 
super_interfaces clause), so handle both. + if class_node.type in (cs.TS_CLASS_DECLARATION, cs.TS_ENUM_DECLARATION): for interface_qn in pe.extract_implemented_interfaces( class_node, module_qn, resolve_to_qn ): diff --git a/codebase_rag/parsers/cpp/utils.py b/codebase_rag/parsers/cpp/utils.py index de9669a33..c5b813d45 100644 --- a/codebase_rag/parsers/cpp/utils.py +++ b/codebase_rag/parsers/cpp/utils.py @@ -57,35 +57,43 @@ def build_qualified_name(node: Node, module_qn: str, name: str) -> str: return cs.SEPARATOR_DOT.join([module_qn, name]) +_EXPORT_CANDIDATE_TYPES = frozenset( + { + cs.CppNodeType.EXPORT, + cs.CppNodeType.EXPORT_KEYWORD, + cs.CppNodeType.IDENTIFIER, + cs.CppNodeType.PRIMITIVE_TYPE, + } +) + +_EXPORT_STOP_TYPES = frozenset( + { + cs.CppNodeType.DECLARATION, + cs.CppNodeType.FUNCTION_DEFINITION, + cs.CppNodeType.TEMPLATE_DECLARATION, + cs.CppNodeType.CLASS_SPECIFIER, + cs.CppNodeType.TRANSLATION_UNIT, + } +) + + def is_exported(node: Node) -> bool: current = node + export_text = cs.CppNodeType.EXPORT while current and current.parent: parent = current.parent - found_export = False for child in parent.children: if child == current: break - if child.text: - child_text = safe_decode_text(child) - if child_text == cs.CppNodeType.EXPORT and child.type in ( - cs.CppNodeType.EXPORT, - cs.CppNodeType.EXPORT_KEYWORD, - cs.CppNodeType.IDENTIFIER, - cs.CppNodeType.PRIMITIVE_TYPE, - ): - found_export = True - - if found_export: - return True + if ( + child.type in _EXPORT_CANDIDATE_TYPES + and child.text + and safe_decode_text(child) == export_text + ): + return True - if current.type in ( - cs.CppNodeType.DECLARATION, - cs.CppNodeType.FUNCTION_DEFINITION, - cs.CppNodeType.TEMPLATE_DECLARATION, - cs.CppNodeType.CLASS_SPECIFIER, - cs.CppNodeType.TRANSLATION_UNIT, - ): + if current.type in _EXPORT_STOP_TYPES: break current = current.parent diff --git a/codebase_rag/parsers/cpp_frontend/__init__.py b/codebase_rag/parsers/cpp_frontend/__init__.py new file mode 
100644 index 000000000..eb67d2372 --- /dev/null +++ b/codebase_rag/parsers/cpp_frontend/__init__.py @@ -0,0 +1,14 @@ +from .frontend import ( + cpp_frontend_available, + find_compile_commands, + run_cpp_frontend, +) +from .qn import CppQnResolver, build_module_qn_map + +__all__ = [ + "CppQnResolver", + "build_module_qn_map", + "cpp_frontend_available", + "find_compile_commands", + "run_cpp_frontend", +] diff --git a/codebase_rag/parsers/cpp_frontend/constants.py b/codebase_rag/parsers/cpp_frontend/constants.py new file mode 100644 index 000000000..06c2f7735 --- /dev/null +++ b/codebase_rag/parsers/cpp_frontend/constants.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from ... import constants as cs + +# (H) libclang CursorKind members are registered dynamically (not static class +# (H) attributes), so they are matched by the stable NAME string that +# (H) `cursor.kind.name` yields at runtime, never via `ci.CursorKind.CLASS_DECL` +# (H) (which trips ty's unresolved-attribute). Same approach as the eval oracle +# (H) (evals/oracles/cpp_oracle.py). + +KIND_NAMESPACE = "NAMESPACE" +KIND_DESTRUCTOR = "DESTRUCTOR" +KIND_BASE_SPECIFIER = "CXX_BASE_SPECIFIER" +KIND_TRANSLATION_UNIT = "TRANSLATION_UNIT" +KIND_CALL_EXPR = "CALL_EXPR" + +# (H) class/struct/union and their templated forms -> a Class node (cgr collapses +# (H) struct/class to Class, matching parsers/cpp + the oracle). +CLASS_KIND_NAMES: frozenset[str] = frozenset( + {"CLASS_DECL", "STRUCT_DECL", "CLASS_TEMPLATE"} +) +# (H) free functions and function templates -> a Function node, UNLESS their +# (H) semantic parent is a class (a templated method is a FUNCTION_TEMPLATE whose +# (H) parent is the class), in which case they are Methods. +FUNCTION_KIND_NAMES: frozenset[str] = frozenset({"FUNCTION_DECL", "FUNCTION_TEMPLATE"}) +# (H) members -> a Method node. 
+METHOD_KIND_NAMES: frozenset[str] = frozenset( + {"CXX_METHOD", "CONSTRUCTOR", "DESTRUCTOR", "CONVERSION_FUNCTION"} +) +# (H) `using Alias = T;` (TYPE_ALIAS_DECL) and `typedef T Alias;` (TYPEDEF_DECL) +# (H) -> a Type node, matching how the tree-sitter path maps C++ alias/typedef +# (H) declarations (TS_TYPE_ALIAS_DECLARATION) and Go/Rust type decls. +TYPE_KIND_NAMES: frozenset[str] = frozenset({"TYPE_ALIAS_DECL", "TYPEDEF_DECL"}) + +LABEL_MODULE = cs.NodeLabel.MODULE.value +LABEL_CLASS = cs.NodeLabel.CLASS.value +LABEL_FUNCTION = cs.NodeLabel.FUNCTION.value +LABEL_METHOD = cs.NodeLabel.METHOD.value +LABEL_TYPE = cs.NodeLabel.TYPE.value diff --git a/codebase_rag/parsers/cpp_frontend/frontend.py b/codebase_rag/parsers/cpp_frontend/frontend.py new file mode 100644 index 000000000..4208b708b --- /dev/null +++ b/codebase_rag/parsers/cpp_frontend/frontend.py @@ -0,0 +1,364 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +from ... import constants as cs +from ...services import IngestorProtocol +from ...types_defs import ( + FunctionRegistryTrieProtocol, + NodeType, + PropertyDict, + SimpleNameLookup, +) +from . import constants as fc +from .qn import CppQnResolver + +if TYPE_CHECKING: + from clang.cindex import Cursor + +_NodeKey = tuple[str, str] +_EdgeKey = tuple[str, str, str, str, str] +_Scope = tuple[str, str] | None + +_COMPILE_COMMANDS = "compile_commands.json" +_BUILD_DIR = "build" + + +def cpp_frontend_available() -> bool: + try: + import clang.cindex as ci + + ci.Index.create() + except Exception: + return False + return True + + +def find_compile_commands(start: Path) -> Path | None: + # (H) Discover the directory holding a compile_commands.json: the indexed + # (H) target, a conventional build/ subdir, then walking up to the repo root. 
+ start = start.resolve() + seen: set[Path] = set() + for candidate in (start, start / _BUILD_DIR, *start.parents): + if candidate in seen: + continue + seen.add(candidate) + if (candidate / _COMPILE_COMMANDS).is_file(): + return candidate + return None + + +def _base_simple_name(spelling: str) -> str: + flat = spelling.replace(cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT) + return flat.rsplit(cs.SEPARATOR_DOT, 1)[-1] + + +def _classify(cursor: Cursor) -> str | None: + kind = cursor.kind.name + if kind in fc.CLASS_KIND_NAMES: + return fc.LABEL_CLASS + if kind in fc.TYPE_KIND_NAMES: + return fc.LABEL_TYPE + if kind in fc.METHOD_KIND_NAMES: + return fc.LABEL_METHOD + if kind in fc.FUNCTION_KIND_NAMES: + parent = cursor.semantic_parent + if parent is not None and parent.kind.name in fc.CLASS_KIND_NAMES: + return fc.LABEL_METHOD + return fc.LABEL_FUNCTION + return None + + +class _Collector: + def __init__( + self, + resolver: CppQnResolver, + function_registry: FunctionRegistryTrieProtocol | None = None, + simple_name_lookup: SimpleNameLookup | None = None, + structural_elements: dict[Path, str | None] | None = None, + ) -> None: + self.resolver = resolver + self.function_registry = function_registry + self.simple_name_lookup = simple_name_lookup + self.structural_elements = structural_elements + self.nodes: dict[_NodeKey, tuple[str, PropertyDict, bool]] = {} + self.modules: dict[str, PropertyDict] = {} + self.edges: set[_EdgeKey] = set() + self.covered: set[str] = set() + + def _node_props(self, cursor: Cursor, qn: str, name: str, rel: str) -> PropertyDict: + return { + cs.KEY_QUALIFIED_NAME: qn, + cs.KEY_NAME: name, + cs.KEY_DECORATORS: [], + cs.KEY_START_LINE: cursor.location.line, + cs.KEY_END_LINE: cursor.extent.end.line, + cs.KEY_DOCSTRING: None, + cs.KEY_IS_EXPORTED: False, + cs.KEY_PATH: rel, + cs.KEY_ABSOLUTE_PATH: Path(cursor.location.file.name).resolve().as_posix(), + } + + def _add_node(self, label: str, qn: str, props: PropertyDict, is_def: bool) -> None: 
+ key: _NodeKey = (label, qn) + existing = self.nodes.get(key) + # (H) Prefer the definition cursor's properties (its span is the accurate + # (H) one) over a mere declaration's, matching cgr where the deferred + # (H) out-of-line definition is ingested last and wins the MERGE. + if existing is None or (is_def and not existing[2]): + self.nodes[key] = (label, props, is_def) + + def _add_module(self, module_qn: str, rel: str, absolute_file: str) -> None: + if module_qn in self.modules: + return + self.modules[module_qn] = { + cs.KEY_QUALIFIED_NAME: module_qn, + cs.KEY_NAME: Path(rel).name, + cs.KEY_PATH: rel, + cs.KEY_ABSOLUTE_PATH: Path(absolute_file).resolve().as_posix(), + } + + def _add_edge( + self, rel_type: str, from_label: str, from_qn: str, to_label: str, to_qn: str + ) -> None: + self.edges.add((rel_type, from_label, from_qn, to_label, to_qn)) + + def process(self, cursor: Cursor, enclosing: _Scope) -> _Scope: + # (H) Returns the scope its subtree should attribute calls to: the node's + # (H) own (label, qn) when it is a function/method, else the unchanged + # (H) enclosing scope. + if cursor.kind.name == fc.KIND_CALL_EXPR: + self._process_call(cursor, enclosing) + return None + label = _classify(cursor) + if label is None or cursor.location.file is None: + return None + if label == fc.LABEL_CLASS and not cursor.is_definition(): + return None # (H) forward declarations are not nodes + rel = self.resolver.rel_path(cursor.location.file.name) + module_qn = self.resolver.module_qn(cursor.location.file.name) + if rel is None or module_qn is None: + return None # (H) outside the indexed repo (system headers, etc.) 
+ + if label == fc.LABEL_METHOD: + return self._process_method(cursor, rel) + if label == fc.LABEL_TYPE: + self._process_type(cursor, rel, module_qn) + return None + + qn = ( + self.resolver.class_qn(cursor) + if label == fc.LABEL_CLASS + else self.resolver.function_qn(cursor) + ) + if qn is None: + return None + self.covered.add(rel) + self._add_module(module_qn, rel, cursor.location.file.name) + self._add_node( + label, + qn, + self._node_props(cursor, qn, cursor.spelling, rel), + cursor.is_definition(), + ) + self._add_edge( + cs.RelationshipType.DEFINES, fc.LABEL_MODULE, module_qn, label, qn + ) + if label == fc.LABEL_CLASS: + self._emit_inheritance(cursor, qn) + return None + return (label, qn) + + def _process_method(self, cursor: Cursor, rel: str) -> _Scope: + qn = self.resolver.method_qn(cursor) + parent = cursor.semantic_parent + if qn is None or parent is None: + return None + class_qn = self.resolver.class_qn(parent) + if class_qn is None: + return None + self.covered.add(rel) + name = self.resolver.member_name(cursor) + self._add_node( + fc.LABEL_METHOD, + qn, + self._node_props(cursor, qn, name, rel), + cursor.is_definition(), + ) + self._add_edge( + cs.RelationshipType.DEFINES_METHOD, + fc.LABEL_CLASS, + class_qn, + fc.LABEL_METHOD, + qn, + ) + return (fc.LABEL_METHOD, qn) + + def _process_type(self, cursor: Cursor, rel: str, module_qn: str) -> None: + # (H) A `using`/`typedef` alias becomes a Type node, DEFINED by its + # (H) enclosing Class (member alias) or its Module (namespace/file scope), + # (H) matching the tree-sitter alias path and Go/Rust type decls. 
+ qn = self.resolver.type_qn(cursor) + if qn is None: + return + self.covered.add(rel) + self._add_module(module_qn, rel, cursor.location.file.name) + self._add_node( + fc.LABEL_TYPE, + qn, + self._node_props(cursor, qn, cursor.spelling, rel), + cursor.is_definition(), + ) + parent = cursor.semantic_parent + if parent is not None and parent.kind.name in fc.CLASS_KIND_NAMES: + class_qn = self.resolver.class_qn(parent) + if class_qn is not None: + self._add_edge( + cs.RelationshipType.DEFINES, + fc.LABEL_CLASS, + class_qn, + fc.LABEL_TYPE, + qn, + ) + return + self._add_edge( + cs.RelationshipType.DEFINES, fc.LABEL_MODULE, module_qn, fc.LABEL_TYPE, qn + ) + + def _process_call(self, cursor: Cursor, enclosing: _Scope) -> None: + # (H) Resolve the callee semantically via cursor.referenced (libclang did + # (H) the overload/name resolution already), preferring its definition so + # (H) the edge targets the node the frontend emitted for the body. + if enclosing is None: + return + referenced = cursor.referenced + if referenced is None: + return + callee = referenced.get_definition() or referenced + callee_label = _classify(callee) + if callee_label is None or callee_label == fc.LABEL_CLASS: + return + callee_qn = ( + self.resolver.method_qn(callee) + if callee_label == fc.LABEL_METHOD + else self.resolver.function_qn(callee) + ) + if callee_qn is None: + return # (H) callee outside the indexed repo (stdlib, etc.) 
+ caller_label, caller_qn = enclosing + self._add_edge( + cs.RelationshipType.CALLS, caller_label, caller_qn, callee_label, callee_qn + ) + + def _emit_inheritance(self, cursor: Cursor, derived_qn: str) -> None: + for child in cursor.get_children(): + if child.kind.name != fc.KIND_BASE_SPECIFIER: + continue + base_decl = child.type.get_declaration() + base_qn = self.resolver.class_qn(base_decl) if base_decl else None + if base_qn is None: + base_qn = _base_simple_name(child.type.spelling) + self._add_edge( + cs.RelationshipType.INHERITS, + fc.LABEL_CLASS, + derived_qn, + fc.LABEL_CLASS, + base_qn, + ) + + def _contains_module_parent(self, rel: str) -> tuple[str, str, str]: + # (H) Mirror DefinitionProcessor's module-parent choice: a Package if the + # (H) directory is one, else a Folder, else the Project at the root. + parent_rel = Path(rel).parent + package_qn = ( + self.structural_elements.get(parent_rel) + if self.structural_elements is not None + else None + ) + if package_qn: + return (cs.NodeLabel.PACKAGE, cs.KEY_QUALIFIED_NAME, package_qn) + if parent_rel != Path(cs.SEPARATOR_DOT): + return (cs.NodeLabel.FOLDER, cs.KEY_PATH, parent_rel.as_posix()) + return (cs.NodeLabel.PROJECT, cs.KEY_NAME, self.resolver.project_name) + + def _register(self, label: str, props: PropertyDict) -> None: + if self.function_registry is None: + return + qn = props[cs.KEY_QUALIFIED_NAME] + if not isinstance(qn, str): + return + self.function_registry[qn] = NodeType(label) + name = props[cs.KEY_NAME] + if self.simple_name_lookup is not None and isinstance(name, str): + self.simple_name_lookup[name].add(qn) + + def flush(self, ingestor: IngestorProtocol) -> None: + for module_qn, props in self.modules.items(): + ingestor.ensure_node_batch(fc.LABEL_MODULE, props) + path = props[cs.KEY_PATH] + if self.structural_elements is not None and isinstance(path, str): + ingestor.ensure_relationship_batch( + self._contains_module_parent(path), + cs.RelationshipType.CONTAINS_MODULE, + 
(fc.LABEL_MODULE, cs.KEY_QUALIFIED_NAME, module_qn), + ) + for label, props, _ in self.nodes.values(): + ingestor.ensure_node_batch(label, props) + self._register(label, props) + for rel_type, from_label, from_qn, to_label, to_qn in self.edges: + ingestor.ensure_relationship_batch( + (from_label, cs.KEY_QUALIFIED_NAME, from_qn), + rel_type, + (to_label, cs.KEY_QUALIFIED_NAME, to_qn), + ) + + +def _walk(cursor: Cursor, collector: _Collector, enclosing: _Scope = None) -> None: + for child in cursor.get_children(): + produced = collector.process(child, enclosing) + _walk(child, collector, produced or enclosing) + + +def run_cpp_frontend( + ingestor: IngestorProtocol, + repo_path: Path, + project_name: str, + compdb_dir: Path, + function_registry: FunctionRegistryTrieProtocol | None = None, + simple_name_lookup: SimpleNameLookup | None = None, + structural_elements: dict[Path, str | None] | None = None, +) -> frozenset[str]: + """Index C/C++ via libclang + a compile_commands.json (macro-accurate). + + Parses every translation unit in the compilation database, walks the cursor + tree, and emits Module/Class/Function/Method nodes plus DEFINES / + DEFINES_METHOD / INHERITS edges and exact spans straight to the ingestor, + synthesizing the same qualified names the tree-sitter path would. Returns the + set of repo-relative files it covered (so callers can skip them in the + tree-sitter pass). + + When ``function_registry`` / ``simple_name_lookup`` are supplied, emitted + definitions are registered for cross-file resolution; when + ``structural_elements`` is supplied, each Module is linked to its parent via + CONTAINS_MODULE (the full-replace path used by GraphUpdater). 
+ """ + import clang.cindex as ci + + resolver = CppQnResolver(repo_path, project_name) + collector = _Collector( + resolver, function_registry, simple_name_lookup, structural_elements + ) + + db = ci.CompilationDatabase.fromDirectory(str(Path(compdb_dir).resolve())) + index = ci.Index.create() + for command in db.getAllCompileCommands(): + args = list(command.arguments)[1:] + try: + tu = index.parse(None, args=args) + except ci.TranslationUnitLoadError: + continue + _walk(tu.cursor, collector) + + collector.flush(ingestor) + return frozenset(collector.covered) diff --git a/codebase_rag/parsers/cpp_frontend/qn.py b/codebase_rag/parsers/cpp_frontend/qn.py new file mode 100644 index 000000000..b427aa232 --- /dev/null +++ b/codebase_rag/parsers/cpp_frontend/qn.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import TYPE_CHECKING + +from ... import constants as cs +from ...utils.path_utils import should_skip_rel_file +from ..cpp.utils import convert_operator_symbol_to_name +from . import constants as fc + +if TYPE_CHECKING: + from clang.cindex import Cursor + + +def _eligible_rel_files(repo_path: Path) -> list[str]: + # (H) Reproduce GraphUpdater._collect_eligible_files' ordering exactly: an + # (H) os.walk with dirnames AND filenames sorted, top-down. The module-qn + # (H) disambiguation below depends on this order (the file processed LATER in + # (H) a basename collision is the one that gets its extension appended), so it + # (H) must match cgr's tree-sitter pass to produce identical qualified names. 
+ repo_str = str(repo_path) + repo_prefix_len = len(repo_str) + 1 + rels: list[str] = [] + for dirpath, dirnames, filenames in os.walk(repo_str): + rel_dir = "" if len(dirpath) < repo_prefix_len else dirpath[repo_prefix_len:] + rel_dir = rel_dir.replace(os.sep, "/") + dir_parts = tuple(rel_dir.split("/")) if rel_dir else () + dir_prefix = f"{rel_dir}/" if rel_dir else "" + dirnames[:] = sorted(dirnames) + for fname in sorted(filenames): + dot = fname.rfind(".") + suffix = fname[dot:] if dot != -1 else "" + rel_path_str = f"{dir_prefix}{fname}" + if not should_skip_rel_file(rel_path_str, dir_parts, suffix): + rels.append(rel_path_str) + return rels + + +def _base_module_qn(rel: str, project_name: str) -> str: + rel_path = Path(rel) + if rel_path.name in (cs.INIT_PY, cs.MOD_RS): + parts = rel_path.parent.parts + else: + parts = rel_path.with_suffix("").parts + return cs.SEPARATOR_DOT.join([project_name, *parts]) + + +def build_module_qn_map(repo_path: Path, project_name: str) -> dict[str, str]: + # (H) Mirror DefinitionProcessor._disambiguate_module_qn: a base qn is claimed + # (H) by the first file (in walk order); a later file colliding on that base qn + # (H) gets its extension appended (foo.cpp -> proj.foo, foo.h -> proj.foo.h). + claimed: dict[str, str] = {} + result: dict[str, str] = {} + for rel in _eligible_rel_files(repo_path): + base = _base_module_qn(rel, project_name) + existing = claimed.get(base) + if existing is None or existing == rel: + final = base + else: + suffix = Path(rel).suffix.lstrip(cs.SEPARATOR_DOT) + final = f"{base}{cs.SEPARATOR_DOT}{suffix}" + claimed.setdefault(final, rel) + result[rel] = final + return result + + +class CppQnResolver: + """Synthesizes cgr-correct qualified names for libclang cursors. + + The qns must be byte-identical to what the tree-sitter C++ path produces + (parsers/cpp/utils.build_qualified_name + the deferred out-of-class method + resolver), because the whole graph keys on them. 
+ """ + + def __init__(self, repo_path: Path, project_name: str) -> None: + self.repo_path = repo_path.resolve() + self.project_name = project_name + self._module_qn = build_module_qn_map(self.repo_path, project_name) + + def rel_path(self, absolute_file: str) -> str | None: + try: + return Path(absolute_file).resolve().relative_to(self.repo_path).as_posix() + except ValueError: + return None + + def module_qn(self, absolute_file: str) -> str | None: + rel = self.rel_path(absolute_file) + if rel is None: + return None + return self._module_qn.get(rel) + + def _namespace_chain(self, cursor: Cursor) -> list[str]: + parts: list[str] = [] + parent = cursor.semantic_parent + while parent is not None and parent.kind.name == fc.KIND_NAMESPACE: + if parent.spelling: # (H) skip anonymous namespaces (no name segment) + parts.append(parent.spelling) + parent = parent.semantic_parent + parts.reverse() + return parts + + def member_name(self, cursor: Cursor) -> str: + # (H) Mirror cpp.utils.extract_operator_name / extract_destructor_name: + # (H) destructors keep their `~Name` spelling, operators map their symbol + # (H) through CPP_OPERATOR_SYMBOL_MAP; everything else is its plain name. + spelling = cursor.spelling + if cursor.kind.name == fc.KIND_DESTRUCTOR: + return spelling + if self._is_operator_spelling(spelling): + symbol = spelling[len(cs.CPP_OPERATOR_TEXT_PREFIX) :].strip() + return convert_operator_symbol_to_name(symbol) + return spelling + + @staticmethod + def _is_operator_spelling(spelling: str) -> bool: + prefix = cs.CPP_OPERATOR_TEXT_PREFIX + if not spelling.startswith(prefix): + return False + rest = spelling[len(prefix) :] + # (H) `operator+`, `operator[]`, `operator int` are operators/conversions; + # (H) an identifier like `operatorState` is not (next char is alnum/_). 
+ return not rest or not (rest[0].isalnum() or rest[0] == cs.CHAR_UNDERSCORE) + + def class_qn(self, cursor: Cursor) -> str | None: + if cursor.location.file is None: + return None + module_qn = self.module_qn(cursor.location.file.name) + if module_qn is None: + return None + parts = [module_qn, *self._namespace_chain(cursor), cursor.spelling] + return cs.SEPARATOR_DOT.join(parts) + + def function_qn(self, cursor: Cursor) -> str | None: + if cursor.location.file is None: + return None + module_qn = self.module_qn(cursor.location.file.name) + if module_qn is None: + return None + parts = [module_qn, *self._namespace_chain(cursor), self.member_name(cursor)] + return cs.SEPARATOR_DOT.join(parts) + + def type_qn(self, cursor: Cursor) -> str | None: + # (H) A class-scoped `using`/`typedef` is anchored to its enclosing class + # (H) (e.g. proj.Box.Handle); a namespace/file-scoped one mirrors a free + # (H) function's qn (module + namespace chain + name). + parent = cursor.semantic_parent + if parent is not None and parent.kind.name in fc.CLASS_KIND_NAMES: + class_qn = self.class_qn(parent) + if class_qn is None: + return None + return cs.SEPARATOR_DOT.join([class_qn, cursor.spelling]) + if cursor.location.file is None: + return None + module_qn = self.module_qn(cursor.location.file.name) + if module_qn is None: + return None + parts = [module_qn, *self._namespace_chain(cursor), cursor.spelling] + return cs.SEPARATOR_DOT.join(parts) + + def method_qn(self, cursor: Cursor) -> str | None: + # (H) A method's qn is anchored to its CLASS's declaring file (the header), + # (H) via semantic_parent, NOT the out-of-line definition file. This mirrors + # (H) cgr's deferred out-of-class method resolver. 
+ parent = cursor.semantic_parent + if parent is None: + return None + class_qn = self.class_qn(parent) + if class_qn is None: + return None + return cs.SEPARATOR_DOT.join([class_qn, self.member_name(cursor)]) diff --git a/codebase_rag/parsers/definition_processor.py b/codebase_rag/parsers/definition_processor.py index 8110140f8..a8331698e 100644 --- a/codebase_rag/parsers/definition_processor.py +++ b/codebase_rag/parsers/definition_processor.py @@ -4,16 +4,19 @@ from typing import TYPE_CHECKING from loguru import logger +from tree_sitter import QueryCursor from .. import constants as cs from .. import logs as ls +from ..parser_loader import COMBINED_FUNC_CLASS_IMPORT_QUERIES from ..types_defs import ASTNode, FunctionRegistryTrieProtocol, SimpleNameLookup +from ..utils.path_utils import cached_relative_path, cached_resolve_posix from .class_ingest import ClassIngestMixin from .dependency_parser import parse_dependencies from .function_ingest import FunctionIngestMixin from .handlers import get_handler from .js_ts.ingest import JsTsIngestMixin -from .utils import safe_decode_with_fallback +from .utils import safe_decode_with_fallback, sorted_captures if TYPE_CHECKING: from ..services import IngestorProtocol @@ -38,6 +41,7 @@ def __init__( simple_name_lookup: SimpleNameLookup, import_processor: ImportProcessor, module_qn_to_file_path: dict[str, Path], + func_class_captures_cache: dict[Path, dict] | None = None, ): super().__init__() self.ingestor = ingestor @@ -48,7 +52,22 @@ def __init__( self.import_processor = import_processor self.module_qn_to_file_path = module_qn_to_file_path self.class_inheritance: dict[str, list[str]] = {} + self._deferred_cpp_methods: list = [] + self._deferred_go_methods: list = [] self._handler = get_handler(cs.SupportedLanguage.PYTHON) + self._func_class_captures_cache = func_class_captures_cache + + def _disambiguate_module_qn(self, module_qn: str, file_path: Path) -> str: + # (H) Two files that share a basename but differ by extension 
(foo.py / + # (H) foo.cpp) strip to the same module qn. Append the extension to the + # (H) later one so their module nodes and all derived class/method qns stay + # (H) distinct instead of colliding under the qualified_name constraint. + existing = self.module_qn_to_file_path.get(module_qn) + if existing is None or existing == file_path: + return module_qn + return ( + f"{module_qn}{cs.SEPARATOR_DOT}{file_path.suffix.lstrip(cs.SEPARATOR_DOT)}" + ) def process_file( self, @@ -56,10 +75,12 @@ def process_file( language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], structural_elements: dict[Path, str | None], + source_bytes: bytes | None = None, + pre_parsed: tuple[ASTNode, dict[str, list] | None] | None = None, ) -> tuple[ASTNode, cs.SupportedLanguage] | None: if isinstance(file_path, str): file_path = Path(file_path) - relative_path = file_path.relative_to(self.repo_path) + relative_path = cached_relative_path(file_path, self.repo_path) relative_path_str = str(relative_path) logger.info( ls.DEF_PARSING_AST.format(language=language, path=relative_path_str) @@ -75,15 +96,19 @@ def process_file( return None self._handler = get_handler(language) - source_bytes = file_path.read_bytes() - lang_queries = queries[language] - parser = lang_queries.get(cs.KEY_PARSER) - if not parser: - logger.warning(ls.DEF_NO_PARSER.format(language=language)) - return None - - tree = parser.parse(source_bytes) - root_node = tree.root_node + if pre_parsed is not None: + root_node, pre_combined_captures = pre_parsed + else: + if source_bytes is None: + source_bytes = file_path.read_bytes() + lang_queries = queries[language] + parser = lang_queries.get(cs.KEY_PARSER) + if not parser: + logger.warning(ls.DEF_NO_PARSER.format(language=language)) + return None + tree = parser.parse(source_bytes) + root_node = tree.root_node + pre_combined_captures = None module_qn = cs.SEPARATOR_DOT.join( [self.project_name] + list(relative_path.with_suffix("").parts) @@ -92,6 
+117,7 @@ def process_file( module_qn = cs.SEPARATOR_DOT.join( [self.project_name] + list(relative_path.parent.parts) ) + module_qn = self._disambiguate_module_qn(module_qn, file_path) self.module_qn_to_file_path[module_qn] = file_path self.ingestor.ensure_node_batch( @@ -100,6 +126,7 @@ def process_file( cs.KEY_QUALIFIED_NAME: module_qn, cs.KEY_NAME: file_path.name, cs.KEY_PATH: relative_path_str, + cs.KEY_ABSOLUTE_PATH: cached_resolve_posix(file_path), }, ) @@ -120,22 +147,61 @@ def process_file( (cs.NodeLabel.MODULE, cs.KEY_QUALIFIED_NAME, module_qn), ) - self.import_processor.parse_imports(root_node, module_qn, language, queries) - self._ingest_missing_import_patterns( - root_node, module_qn, language, queries + if pre_combined_captures is not None: + combined_captures = pre_combined_captures + else: + combined_captures = None + combined_query = COMBINED_FUNC_CLASS_IMPORT_QUERIES.get(language) + if combined_query: + cursor = QueryCursor(combined_query) + combined_captures = sorted_captures(cursor, root_node) + if self._func_class_captures_cache is not None and combined_captures: + cache_entry: dict[str, list] = {} + for key in (cs.CAPTURE_FUNCTION, cs.CAPTURE_CLASS, cs.CAPTURE_CALL): + if key in combined_captures: + cache_entry[key] = combined_captures[key] + if cache_entry: + self._func_class_captures_cache[file_path] = cache_entry + + self.import_processor.parse_imports( + root_node, + module_qn, + language, + queries, + pre_captures=combined_captures, ) + if language in (cs.SupportedLanguage.JS, cs.SupportedLanguage.TS): + self._ingest_missing_import_patterns( + root_node, module_qn, language, queries + ) if language == cs.SupportedLanguage.CPP: self._ingest_cpp_module_declarations(root_node, module_qn, file_path) - self._ingest_all_functions(root_node, module_qn, language, queries) - self._ingest_classes_and_methods(root_node, module_qn, language, queries) - self._ingest_object_literal_methods(root_node, module_qn, language, queries) - 
self._ingest_commonjs_exports(root_node, module_qn, language, queries) - if language in {cs.SupportedLanguage.JS, cs.SupportedLanguage.TS}: - self._ingest_es6_exports(root_node, module_qn, language, queries) - self._ingest_assignment_arrow_functions( - root_node, module_qn, language, queries + self._ingest_all_functions( + root_node, + module_qn, + language, + queries, + combined_captures=combined_captures, + ) + self._ingest_classes_and_methods( + root_node, + module_qn, + language, + queries, + combined_captures=combined_captures, ) - self._ingest_prototype_inheritance(root_node, module_qn, language, queries) + if language in (cs.SupportedLanguage.JS, cs.SupportedLanguage.TS): + self._ingest_object_literal_methods( + root_node, module_qn, language, queries + ) + self._ingest_commonjs_exports(root_node, module_qn, language, queries) + self._ingest_es6_exports(root_node, module_qn, language, queries) + self._ingest_assignment_arrow_functions( + root_node, module_qn, language, queries + ) + self._ingest_prototype_inheritance( + root_node, module_qn, language, queries + ) return (root_node, language) diff --git a/codebase_rag/parsers/dependency_parser.py b/codebase_rag/parsers/dependency_parser.py index 61f7d4b92..94a66ad87 100644 --- a/codebase_rag/parsers/dependency_parser.py +++ b/codebase_rag/parsers/dependency_parser.py @@ -26,11 +26,15 @@ def _extract_pep508_package_name(dep_string: str) -> tuple[str, str]: class DependencyParser: + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: raise NotImplementedError class PyProjectTomlParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -72,6 +76,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class RequirementsTxtParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -92,6 +98,8 @@ def parse(self, file_path: 
Path) -> list[Dependency]: class PackageJsonParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -120,6 +128,8 @@ def _load_and_collect_deps( class CargoTomlParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -148,6 +158,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class GoModParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -186,6 +198,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class GemfileParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -206,6 +220,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class ComposerJsonParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -229,6 +245,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class CsprojParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: diff --git a/codebase_rag/parsers/factory.py b/codebase_rag/parsers/factory.py index a6b8a244c..cdc5206f0 100644 --- a/codebase_rag/parsers/factory.py +++ b/codebase_rag/parsers/factory.py @@ -16,6 +16,25 @@ class ProcessorFactory: + __slots__ = ( + "ingestor", + "repo_path", + "project_name", + "queries", + "function_registry", + "simple_name_lookup", + "ast_cache", + "unignore_paths", + "exclude_paths", + "module_qn_to_file_path", + "_import_processor", + "_structure_processor", + "_definition_processor", + "_type_inference", + "_call_processor", + "_func_class_captures_cache", + ) + def __init__( self, ingestor: IngestorProtocol, @@ -39,6 +58,7 @@ def __init__( self.exclude_paths = 
exclude_paths self.module_qn_to_file_path: dict[str, Path] = {} + self._func_class_captures_cache: dict[Path, dict] = {} self._import_processor: ImportProcessor | None = None self._structure_processor: StructureProcessor | None = None @@ -81,6 +101,7 @@ def definition_processor(self) -> DefinitionProcessor: simple_name_lookup=self.simple_name_lookup, import_processor=self.import_processor, module_qn_to_file_path=self.module_qn_to_file_path, + func_class_captures_cache=self._func_class_captures_cache, ) return self._definition_processor diff --git a/codebase_rag/parsers/function_ingest.py b/codebase_rag/parsers/function_ingest.py index 1d32186e0..d87240c63 100644 --- a/codebase_rag/parsers/function_ingest.py +++ b/codebase_rag/parsers/function_ingest.py @@ -17,11 +17,13 @@ PropertyDict, SimpleNameLookup, ) -from ..utils.fqn_resolver import resolve_fqn_from_ast +from ..utils.path_utils import cached_relative_path, cached_resolve_posix from .cpp import utils as cpp_utils +from .go import utils as go_utils from .lua import utils as lua_utils from .rs import utils as rs_utils from .utils import ( + callable_parameter_indices, get_function_captures, ingest_method, is_method_node, @@ -40,7 +42,32 @@ class FunctionResolution(NamedTuple): is_exported: bool +class _DeferredMethod(NamedTuple): + """Out-of-class C++ method whose class hasn't been parsed yet.""" + + method_name: str + class_name: str + fallback_class_qn: str + method_props: PropertyDict + + +class _DeferredGoMethod(NamedTuple): + """Go receiver method, linked to its receiver type once all types are known.""" + + method_node: Node + module_qn: str + receiver_type: str + file_path: Path | None + + +# (H) Go node labels a receiver type can resolve to (struct -> Class, defined +# (H) type/alias -> Type, interface -> Interface); used to pick the declaring +# (H) type out of same-named candidates when binding a cross-file method. 
+_GO_TYPE_NODE_TYPES = frozenset({NodeType.CLASS, NodeType.TYPE, NodeType.INTERFACE}) + + class FunctionIngestMixin: + __slots__ = () ingestor: IngestorProtocol repo_path: Path project_name: str @@ -48,6 +75,8 @@ class FunctionIngestMixin: simple_name_lookup: SimpleNameLookup module_qn_to_file_path: dict[str, Path] _handler: LanguageHandler + _deferred_cpp_methods: list[_DeferredMethod] + _deferred_go_methods: list[_DeferredGoMethod] @abstractmethod def _get_docstring(self, node: ASTNode) -> str | None: ... @@ -61,29 +90,33 @@ def _ingest_all_functions( module_qn: str, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + combined_captures: dict[str, list] | None = None, ) -> None: - result = get_function_captures(root_node, language, queries) - if not result: - return - - lang_config, captures = result + if combined_captures is not None: + lang_queries = queries[language] + lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] + captures = combined_captures + else: + result = get_function_captures(root_node, language, queries) + if not result: + return + lang_config, captures = result file_path = self.module_qn_to_file_path.get(module_qn) + has_classes = bool(captures.get(cs.CAPTURE_CLASS)) for func_node in captures.get(cs.CAPTURE_FUNCTION, []): - if not isinstance(func_node, Node): - logger.warning( - ls.FUNC_EXPECTED_NODE.format( - actual_type=type(func_node), value=func_node - ) - ) - continue - if self._is_method(func_node, lang_config): + if has_classes and self._is_method(func_node, lang_config): continue if language == cs.SupportedLanguage.CPP: if self._handle_cpp_out_of_class_method(func_node, module_qn): continue + if language == cs.SupportedLanguage.GO and self._defer_go_receiver_method( + func_node, module_qn + ): + continue + resolution = self._resolve_function_identity( func_node, module_qn, language, lang_config, file_path ) @@ -102,7 +135,9 @@ def _resolve_function_identity( lang_config: LanguageSpec, file_path: 
Path | None, ) -> FunctionResolution | None: - resolution = self._try_unified_fqn_resolution(func_node, language, file_path) + resolution = self._try_unified_fqn_resolution( + func_node, module_qn, language, file_path + ) if resolution: return resolution @@ -113,6 +148,7 @@ def _resolve_function_identity( def _try_unified_fqn_resolution( self, func_node: Node, + module_qn: str, language: cs.SupportedLanguage, file_path: Path | None, ) -> FunctionResolution | None: @@ -120,19 +156,31 @@ def _try_unified_fqn_resolution( if not fqn_config or not file_path: return None - func_qn = resolve_fqn_from_ast( - func_node, file_path, self.repo_path, self.project_name, fqn_config - ) - if not func_qn: + func_name = fqn_config.get_name(func_node) + if not func_name: return None - func_name = func_qn.split(cs.SEPARATOR_DOT)[-1] + parts = [func_name] + current = func_node.parent + while current: + if current.type in fqn_config.scope_node_types: + if scope_name := fqn_config.get_name(current): + parts.append(scope_name) + current = current.parent + parts.reverse() + + # (H) Prefix with the module's resolved (collision-disambiguated) qn rather + # (H) than recomputing from the path, so same-stem cross-language siblings + # (H) stay distinct. + func_qn = module_qn + cs.SEPARATOR_DOT + cs.SEPARATOR_DOT.join(parts) + simple_name = func_qn.rsplit(cs.SEPARATOR_DOT, 1)[-1] + is_exported = ( cpp_utils.is_exported(func_node) if language == cs.SupportedLanguage.CPP else False ) - return FunctionResolution(func_qn, func_name, is_exported) + return FunctionResolution(func_qn, simple_name, is_exported) def _fallback_function_resolution( self, @@ -147,6 +195,45 @@ def _fallback_function_resolution( func_node, module_qn, language, lang_config ) + def _resolve_cpp_class_qn( + self, class_name: str, module_qn: str + ) -> tuple[str, bool]: + """Look up an existing Class node for *class_name* across all parsed files. 
+ + Returns ``(class_qn, resolved)`` where *resolved* is True when the + qualified name was obtained from the function registry (i.e. the + class has already been parsed, typically from a header file). + """ + class_name_normalized = class_name.replace( + cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT + ) + leaf_name = class_name_normalized.rsplit(cs.SEPARATOR_DOT, 1)[-1] + + if leaf_name in self.simple_name_lookup: + for candidate_qn in self.simple_name_lookup[leaf_name]: + node_type = self.function_registry.get(candidate_qn) + if node_type in {NodeType.CLASS, NodeType.TYPE}: + if candidate_qn.endswith( + f".{class_name_normalized}" + ) and self._is_cpp_defined(candidate_qn): + return candidate_qn, True + + return f"{module_qn}.{class_name_normalized}", False + + def _is_cpp_defined(self, qn: str) -> bool: + # (H) A C++ out-of-class method may only bind to a class defined in a + # (H) C/C++ source file; matching a same-named class in another language + # (H) would collide their qualified names. Resolve qn -> defining file by + # (H) the longest module-qn prefix and check its extension. 
+ parts = qn.split(cs.SEPARATOR_DOT) + while parts: + if path := self.module_qn_to_file_path.get(cs.SEPARATOR_DOT.join(parts)): + return ( + path.suffix in cs.CPP_EXTENSIONS or path.suffix in cs.C_EXTENSIONS + ) + parts = parts[:-1] + return False + def _handle_cpp_out_of_class_method(self, func_node: Node, module_qn: str) -> bool: if not cpp_utils.is_out_of_class_method_definition(func_node): return False @@ -155,25 +242,163 @@ def _handle_cpp_out_of_class_method(self, func_node: Node, module_qn: str) -> bo if not class_name: return False - class_name_normalized = class_name.replace( - cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT - ) - class_qn = f"{module_qn}.{class_name_normalized}" - - ingest_method( - method_node=func_node, - container_qn=class_qn, - container_type=cs.NodeLabel.CLASS, - ingestor=self.ingestor, - function_registry=self.function_registry, - simple_name_lookup=self.simple_name_lookup, - get_docstring_func=self._get_docstring, - language=cs.SupportedLanguage.CPP, - extract_decorators_func=self._extract_decorators, - ) + class_qn, resolved = self._resolve_cpp_class_qn(class_name, module_qn) + file_path = self.module_qn_to_file_path.get(module_qn) + + if resolved: + ingest_method( + method_node=func_node, + container_qn=class_qn, + container_type=cs.NodeLabel.CLASS, + ingestor=self.ingestor, + function_registry=self.function_registry, + simple_name_lookup=self.simple_name_lookup, + get_docstring_func=self._get_docstring, + language=cs.SupportedLanguage.CPP, + extract_decorators_func=self._extract_decorators, + file_path=file_path, + repo_path=self.repo_path, + ) + else: + method_name = cpp_utils.extract_function_name(func_node) + if not method_name: + return True + decorators = self._extract_decorators(func_node) + props: PropertyDict = { + cs.KEY_NAME: method_name, + cs.KEY_DECORATORS: decorators, + cs.KEY_START_LINE: func_node.start_point[0] + 1, + cs.KEY_END_LINE: func_node.end_point[0] + 1, + cs.KEY_DOCSTRING: self._get_docstring(func_node), + } 
+ if file_path is not None and self.repo_path is not None: + props[cs.KEY_PATH] = cached_relative_path( + file_path, self.repo_path + ).as_posix() + props[cs.KEY_ABSOLUTE_PATH] = cached_resolve_posix(file_path) + if not hasattr(self, "_deferred_cpp_methods"): + self._deferred_cpp_methods = [] + self._deferred_cpp_methods.append( + _DeferredMethod( + method_name=method_name, + class_name=class_name, + fallback_class_qn=class_qn, + method_props=props, + ) + ) return True + def resolve_deferred_cpp_methods(self) -> int: + """Ingest deferred out-of-class C++ methods now that all classes are known. + + Called after all files have been parsed so that every Class node + is guaranteed to be in the registry. Returns the number of + methods that were ingested. + """ + deferred = getattr(self, "_deferred_cpp_methods", None) + if not deferred: + return 0 + + ingested = 0 + for entry in deferred: + real_class_qn, resolved = self._resolve_cpp_class_qn(entry.class_name, "") + class_qn = real_class_qn if resolved else entry.fallback_class_qn + method_qn = f"{class_qn}.{entry.method_name}" + + props = dict(entry.method_props) + props[cs.KEY_QUALIFIED_NAME] = method_qn + + logger.info(ls.METHOD_FOUND.format(name=entry.method_name, qn=method_qn)) + self.ingestor.ensure_node_batch(cs.NodeLabel.METHOD, props) + self.function_registry[method_qn] = NodeType.METHOD + self.simple_name_lookup[entry.method_name].add(method_qn) + + self.ingestor.ensure_relationship_batch( + (cs.NodeLabel.CLASS, cs.KEY_QUALIFIED_NAME, class_qn), + cs.RelationshipType.DEFINES_METHOD, + (cs.NodeLabel.METHOD, cs.KEY_QUALIFIED_NAME, method_qn), + ) + ingested += 1 + + self._deferred_cpp_methods = [] + return ingested + + def _defer_go_receiver_method(self, func_node: Node, module_qn: str) -> bool: + if not go_utils.is_receiver_method(func_node): + return False + receiver_type = go_utils.extract_receiver_type_name(func_node) + if not receiver_type: + return False + if not hasattr(self, "_deferred_go_methods"): + 
self._deferred_go_methods = [] + self._deferred_go_methods.append( + _DeferredGoMethod( + method_node=func_node, + module_qn=module_qn, + receiver_type=receiver_type, + file_path=self.module_qn_to_file_path.get(module_qn), + ) + ) + return True + + def _resolve_go_container_qn(self, module_qn: str, receiver_type: str) -> str: + # (H) A method binds to its receiver type. Prefer the same-file type, but + # (H) a Go package spans every file in its directory, so fall back to a + # (H) sibling-file type with the same name in the same package. This keeps + # (H) the method's qn and DEFINES_METHOD parent anchored to the real type + # (H) node instead of a phantom under the method's own module. + same_file = f"{module_qn}{cs.SEPARATOR_DOT}{receiver_type}" + if self.function_registry.get(same_file) is not None: + return same_file + package = module_qn.rsplit(cs.SEPARATOR_DOT, 1)[0] + for qn in self.simple_name_lookup.get(receiver_type, set()): + if self.function_registry.get(qn) not in _GO_TYPE_NODE_TYPES: + continue + type_module = qn.rsplit(cs.SEPARATOR_DOT, 1)[0] + if type_module.rsplit(cs.SEPARATOR_DOT, 1)[0] == package: + return qn + return same_file + + def resolve_deferred_go_methods(self) -> int: + """Ingest Go receiver methods now that every receiver type is registered. + + A Go method (``func (p Point) Area()``) is declared at file scope, not + inside its receiver type, so the receiver's node may not exist yet when + the method is first seen. Deferring to after Pass 2 lets the method bind + to the actual node label (``Class`` for structs, ``Type`` for defined + types, ``Interface`` for interfaces). Returns the number ingested. 
+ """ + deferred = getattr(self, "_deferred_go_methods", None) + if not deferred: + return 0 + + for entry in deferred: + container_qn = self._resolve_go_container_qn( + entry.module_qn, entry.receiver_type + ) + container_type = self.function_registry.get(container_qn) + container_label = ( + cs.NodeLabel(container_type.value) + if container_type is not None + else cs.NodeLabel.CLASS + ) + ingest_method( + method_node=entry.method_node, + container_qn=container_qn, + container_type=container_label, + ingestor=self.ingestor, + function_registry=self.function_registry, + simple_name_lookup=self.simple_name_lookup, + get_docstring_func=self._get_docstring, + language=cs.SupportedLanguage.GO, + file_path=entry.file_path, + repo_path=self.repo_path, + ) + ingested = len(deferred) + self._deferred_go_methods = [] + return ingested + def _resolve_cpp_function( self, func_node: Node, module_qn: str ) -> FunctionResolution | None: @@ -238,13 +463,23 @@ def _register_function( language: cs.SupportedLanguage, lang_config: LanguageSpec, ) -> None: - func_props = self._build_function_props(func_node, resolution) + unique_qn = self.function_registry.register_unique_qn( + resolution.qualified_name, func_node.start_point[0] + 1 + ) + if unique_qn != resolution.qualified_name: + resolution = resolution._replace(qualified_name=unique_qn) + + func_props = self._build_function_props(func_node, resolution, module_qn) logger.info( ls.FUNC_FOUND.format(name=resolution.name, qn=resolution.qualified_name) ) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, func_props) self.function_registry[resolution.qualified_name] = NodeType.FUNCTION + self.function_registry.mark_callable_params( + resolution.qualified_name, + callable_parameter_indices(func_node, language), + ) if resolution.name: self.simple_name_lookup[resolution.name].add(resolution.qualified_name) @@ -253,9 +488,10 @@ def _register_function( ) def _build_function_props( - self, func_node: Node, resolution: FunctionResolution 
+ self, func_node: Node, resolution: FunctionResolution, module_qn: str ) -> PropertyDict: - return { + file_path = self.module_qn_to_file_path.get(module_qn) + props: PropertyDict = { cs.KEY_QUALIFIED_NAME: resolution.qualified_name, cs.KEY_NAME: resolution.name, cs.KEY_DECORATORS: self._extract_decorators(func_node), @@ -264,6 +500,12 @@ def _build_function_props( cs.KEY_DOCSTRING: self._get_docstring(func_node), cs.KEY_IS_EXPORTED: resolution.is_exported, } + if file_path is not None: + props[cs.KEY_PATH] = cached_relative_path( + file_path, self.repo_path + ).as_posix() + props[cs.KEY_ABSOLUTE_PATH] = cached_resolve_posix(file_path) + return props def _create_function_relationships( self, @@ -274,7 +516,7 @@ def _create_function_relationships( lang_config: LanguageSpec, ) -> None: parent_type, parent_qn = self._determine_function_parent( - func_node, module_qn, lang_config + func_node, resolution.qualified_name, module_qn, lang_config, language ) self.ingestor.ensure_relationship_batch( (parent_type, cs.KEY_QUALIFIED_NAME, parent_qn), @@ -444,25 +686,55 @@ def _is_method(self, func_node: Node, lang_config: LanguageSpec) -> bool: return is_method_node(func_node, lang_config) def _determine_function_parent( - self, func_node: Node, module_qn: str, lang_config: LanguageSpec + self, + func_node: Node, + func_qn: str, + module_qn: str, + lang_config: LanguageSpec, + language: cs.SupportedLanguage | None = None, ) -> tuple[str, str]: current = func_node.parent if not isinstance(current, Node): return cs.NodeLabel.MODULE, module_qn + file_path = self.module_qn_to_file_path.get(module_qn) while current and current.type not in lang_config.module_node_types: if current.type in lang_config.function_node_types: - if name_node := current.child_by_field_name(cs.FIELD_NAME): - parent_text = name_node.text - if parent_text is None: - continue - if parent_func_name := safe_decode_text(name_node): - if parent_func_qn := self._build_nested_qualified_name( - current, module_qn, 
parent_func_name, lang_config - ): - return cs.NodeLabel.FUNCTION, parent_func_qn - break + parent_label = ( + cs.NodeLabel.METHOD + if self._is_method(current, lang_config) + else cs.NodeLabel.FUNCTION + ) + # (H) Bind to the enclosing function's OWN qn, recomputed from its + # (H) node. A function nested in an anonymous callback otherwise + # (H) loses that callback: anonymous scopes contribute no segment to + # (H) the child qn, so trimming the child qn would skip the callback + # (H) and hoist the child to the nearest named ancestor. + resolution = ( + self._resolve_function_identity( + current, module_qn, language, lang_config, file_path + ) + if language is not None + else None + ) + parent_qn = ( + resolution.qualified_name + if resolution + else func_qn.rsplit(cs.SEPARATOR_DOT, 1)[0] + ) + if not parent_qn or parent_qn == func_qn: + break + return parent_label, parent_qn current = current.parent + # (H) A Rust item inside `mod inner` is contained by that inline module, + # (H) not the file module. Its enclosing module qn is the file module plus + # (H) the mod path; the inline Module node carries that exact qn. 
+ if language == cs.SupportedLanguage.RUST and ( + mod_parts := rs_utils.build_module_path(func_node) + ): + nested = module_qn + cs.SEPARATOR_DOT + cs.SEPARATOR_DOT.join(mod_parts) + return cs.NodeLabel.MODULE, nested + return cs.NodeLabel.MODULE, module_qn diff --git a/codebase_rag/parsers/go/__init__.py b/codebase_rag/parsers/go/__init__.py new file mode 100644 index 000000000..78ac91463 --- /dev/null +++ b/codebase_rag/parsers/go/__init__.py @@ -0,0 +1,6 @@ +from .utils import extract_receiver_type_name, is_receiver_method + +__all__ = [ + "extract_receiver_type_name", + "is_receiver_method", +] diff --git a/codebase_rag/parsers/go/utils.py b/codebase_rag/parsers/go/utils.py new file mode 100644 index 000000000..c4f0813c0 --- /dev/null +++ b/codebase_rag/parsers/go/utils.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from tree_sitter import Node + +from ... import constants as cs +from ..utils import safe_decode_text + + +def is_receiver_method(node: Node) -> bool: + return ( + node.type == cs.TS_GO_METHOD_DECLARATION + and node.child_by_field_name(cs.FIELD_RECEIVER) is not None + ) + + +def extract_receiver_type_name(node: Node) -> str | None: + receiver = node.child_by_field_name(cs.FIELD_RECEIVER) + if receiver is None: + return None + for param in receiver.children: + if param.type != cs.TS_GO_PARAMETER_DECLARATION: + continue + type_node = param.child_by_field_name(cs.FIELD_TYPE) + if type_node is not None: + return _type_identifier_text(type_node) + return None + + +def _type_identifier_text(type_node: Node) -> str | None: + if type_node.type == cs.TS_TYPE_IDENTIFIER and type_node.text: + return safe_decode_text(type_node) + # (H) Unwrap pointer (*T) and generic (T[P]) receivers down to the base name. 
+ for child in type_node.children: + if name := _type_identifier_text(child): + return name + return None diff --git a/codebase_rag/parsers/handlers/base.py b/codebase_rag/parsers/handlers/base.py index 14fa8cec9..7f264c1e1 100644 --- a/codebase_rag/parsers/handlers/base.py +++ b/codebase_rag/parsers/handlers/base.py @@ -13,6 +13,8 @@ class BaseLanguageHandler: + __slots__ = () + def is_inside_method_with_object_literals(self, node: ASTNode) -> bool: return False diff --git a/codebase_rag/parsers/handlers/cpp.py b/codebase_rag/parsers/handlers/cpp.py index d7c9dea04..854bcc4ac 100644 --- a/codebase_rag/parsers/handlers/cpp.py +++ b/codebase_rag/parsers/handlers/cpp.py @@ -17,6 +17,8 @@ class CppHandler(BaseLanguageHandler): + __slots__ = () + def extract_function_name(self, node: ASTNode) -> str | None: if func_name := cpp_utils.extract_function_name(node): return func_name diff --git a/codebase_rag/parsers/handlers/java.py b/codebase_rag/parsers/handlers/java.py index 4bd576beb..882fae0da 100644 --- a/codebase_rag/parsers/handlers/java.py +++ b/codebase_rag/parsers/handlers/java.py @@ -11,6 +11,8 @@ class JavaHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, node: ASTNode) -> list[str]: return java_utils.extract_from_modifiers_node(node, frozenset()).annotations diff --git a/codebase_rag/parsers/handlers/js_ts.py b/codebase_rag/parsers/handlers/js_ts.py index 7a2ed6684..75c561209 100644 --- a/codebase_rag/parsers/handlers/js_ts.py +++ b/codebase_rag/parsers/handlers/js_ts.py @@ -12,6 +12,8 @@ class JsTsHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, node: ASTNode) -> list[str]: return [ decorator_text diff --git a/codebase_rag/parsers/handlers/lua.py b/codebase_rag/parsers/handlers/lua.py index 9db185904..6b2d6177f 100644 --- a/codebase_rag/parsers/handlers/lua.py +++ b/codebase_rag/parsers/handlers/lua.py @@ -11,6 +11,8 @@ class LuaHandler(BaseLanguageHandler): + __slots__ = () + def 
extract_function_name(self, node: ASTNode) -> str | None: if (name_node := node.child_by_field_name(cs.TS_FIELD_NAME)) and name_node.text: from ..utils import safe_decode_text diff --git a/codebase_rag/parsers/handlers/php.py b/codebase_rag/parsers/handlers/php.py new file mode 100644 index 000000000..e529ab7dd --- /dev/null +++ b/codebase_rag/parsers/handlers/php.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ... import constants as cs +from ..utils import safe_decode_text +from .base import BaseLanguageHandler + +if TYPE_CHECKING: + from ...types_defs import ASTNode + + +class PhpHandler(BaseLanguageHandler): + __slots__ = () + + _CLASS_LIKE_TYPES = frozenset( + { + cs.TS_CLASS_DECLARATION, + cs.TS_INTERFACE_DECLARATION, + cs.TS_PHP_TRAIT_DECLARATION, + cs.TS_ENUM_DECLARATION, + } + ) + + def is_class_method(self, node: ASTNode) -> bool: + parent = node.parent + while parent: + if parent.type in self._CLASS_LIKE_TYPES: + return True + parent = parent.parent + return False + + def extract_function_name(self, node: ASTNode) -> str | None: + if node.type == cs.TS_PHP_ANONYMOUS_FUNCTION: + return f"anonymous_{node.start_point[0]}_{node.start_point[1]}" + if node.type == cs.TS_PHP_ARROW_FUNCTION: + return f"arrow_{node.start_point[0]}_{node.start_point[1]}" + name_node = node.child_by_field_name(cs.TS_FIELD_NAME) + if name_node and name_node.text: + return safe_decode_text(name_node) + return None + + def is_function_exported(self, node: ASTNode) -> bool: + if node.type != cs.TS_PHP_METHOD_DECLARATION: + return True + for child in node.children: + if child.type == cs.TS_PHP_VISIBILITY_MODIFIER: + text = safe_decode_text(child) + return text == "public" + return True + + def extract_decorators(self, node: ASTNode) -> list[str]: + decorators: list[str] = [] + for child in node.children: + if child.type == cs.TS_PHP_ATTRIBUTE_LIST: + for group in child.children: + if group.type == cs.TS_PHP_ATTRIBUTE_GROUP: + for attr 
in group.children: + if attr.type == cs.TS_PHP_ATTRIBUTE: + if text := safe_decode_text(attr): + decorators.append(text) + return decorators diff --git a/codebase_rag/parsers/handlers/protocol.py b/codebase_rag/parsers/handlers/protocol.py index 9bdbe72b6..893888d78 100644 --- a/codebase_rag/parsers/handlers/protocol.py +++ b/codebase_rag/parsers/handlers/protocol.py @@ -10,6 +10,8 @@ class LanguageHandler(Protocol): + __slots__ = () + def is_inside_method_with_object_literals(self, node: ASTNode) -> bool: ... def is_class_method(self, node: ASTNode) -> bool: ... diff --git a/codebase_rag/parsers/handlers/python.py b/codebase_rag/parsers/handlers/python.py index ae96501a5..1c424fdd8 100644 --- a/codebase_rag/parsers/handlers/python.py +++ b/codebase_rag/parsers/handlers/python.py @@ -11,6 +11,8 @@ class PythonHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, node: ASTNode) -> list[str]: if not node.parent or node.parent.type != cs.TS_PY_DECORATED_DEFINITION: return [] diff --git a/codebase_rag/parsers/handlers/registry.py b/codebase_rag/parsers/handlers/registry.py index a886d7f9e..6f490700e 100644 --- a/codebase_rag/parsers/handlers/registry.py +++ b/codebase_rag/parsers/handlers/registry.py @@ -8,6 +8,7 @@ from .java import JavaHandler from .js_ts import JsTsHandler from .lua import LuaHandler +from .php import PhpHandler from .protocol import LanguageHandler from .python import PythonHandler from .rust import RustHandler @@ -20,6 +21,7 @@ SupportedLanguage.RUST: RustHandler, SupportedLanguage.JAVA: JavaHandler, SupportedLanguage.LUA: LuaHandler, + SupportedLanguage.PHP: PhpHandler, } _DEFAULT_HANDLER = BaseLanguageHandler diff --git a/codebase_rag/parsers/handlers/rust.py b/codebase_rag/parsers/handlers/rust.py index 650bec974..186704ab2 100644 --- a/codebase_rag/parsers/handlers/rust.py +++ b/codebase_rag/parsers/handlers/rust.py @@ -17,6 +17,8 @@ class RustHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, 
node: ASTNode) -> list[str]: outer_decorators: list[str] = [] sibling = node.prev_named_sibling @@ -31,13 +33,12 @@ def extract_decorators(self, node: ASTNode) -> list[str]: if body_node := node.child_by_field_name(cs.FIELD_BODY): nodes_to_search.append(body_node) + inner_attr_type = cs.TS_RS_INNER_ATTRIBUTE_ITEM for search_node in nodes_to_search: - decorators.extend( - attr_text - for child in search_node.children - if child.type == cs.TS_RS_INNER_ATTRIBUTE_ITEM - if (attr_text := safe_decode_text(child)) - ) + for child in search_node.children: + if child.type == inner_attr_type: + if attr_text := safe_decode_text(child): + decorators.append(attr_text) return decorators diff --git a/codebase_rag/parsers/import_processor.py b/codebase_rag/parsers/import_processor.py index 99c3a8526..28f04cac3 100644 --- a/codebase_rag/parsers/import_processor.py +++ b/codebase_rag/parsers/import_processor.py @@ -1,3 +1,4 @@ +from functools import lru_cache from pathlib import Path from loguru import logger @@ -19,10 +20,26 @@ load_persistent_cache, save_persistent_cache, ) -from .utils import get_query_cursor, safe_decode_text, safe_decode_with_fallback +from .utils import ( + get_query_cursor, + safe_decode_text, + safe_decode_with_fallback, + sorted_captures, +) class ImportProcessor: + __slots__ = ( + "repo_path", + "project_name", + "ingestor", + "function_registry", + "import_mapping", + "stdlib_extractor", + "_is_local_module_cached", + "_is_local_java_import_cached", + ) + def __init__( self, repo_path: Path, @@ -39,6 +56,29 @@ def __init__( function_registry, repo_path, project_name ) + repo_is_package = (repo_path / cs.INIT_PY).is_file() + + @lru_cache(maxsize=4096) + def _is_local_module_cached(module_name: str) -> bool: + # (H) When the repo root is itself a package, its children are importable + # (H) only under the package name (project_name.child), never as bare + # (H) top-level names, so a bare top-level import resolves externally. 
+ if repo_is_package: + return module_name == project_name + return ( + (repo_path / module_name).is_dir() + or (repo_path / f"{module_name}{cs.EXT_PY}").is_file() + or (repo_path / module_name / cs.INIT_PY).is_file() + ) + + @lru_cache(maxsize=4096) + def _is_local_java_import_cached(import_path: str) -> bool: + top_level = import_path.split(cs.SEPARATOR_DOT)[0] + return (repo_path / top_level).is_dir() + + self._is_local_module_cached = _is_local_module_cached + self._is_local_java_import_cached = _is_local_java_import_cached + load_persistent_cache() def __del__(self) -> None: @@ -65,6 +105,7 @@ def parse_imports( module_qn: str, language: cs.SupportedLanguage, queries: dict[cs.SupportedLanguage, LanguageQueries], + pre_captures: dict | None = None, ) -> None: if language not in queries: return @@ -77,8 +118,11 @@ def parse_imports( self.import_mapping[module_qn] = {} try: - cursor = get_query_cursor(imports_query) - captures = cursor.captures(root_node) + if pre_captures is not None: + captures = pre_captures + else: + cursor = get_query_cursor(imports_query) + captures = sorted_captures(cursor, root_node) match language: case cs.SupportedLanguage.PYTHON: @@ -95,13 +139,15 @@ def parse_imports( self._parse_cpp_imports(captures, module_qn) case cs.SupportedLanguage.LUA: self._parse_lua_imports(captures, module_qn) + case cs.SupportedLanguage.PHP: + self._parse_php_imports(captures, module_qn) case _: self._parse_generic_imports(captures, module_qn, lang_config) logger.debug( - ls.IMP_PARSED_COUNT.format( - count=len(self.import_mapping[module_qn]), module=module_qn - ) + ls.IMP_PARSED_COUNT, + count=len(self.import_mapping[module_qn]), + module=module_qn, ) if self.ingestor: @@ -124,15 +170,14 @@ def parse_imports( ), ) logger.debug( - ls.IMP_CREATED_RELATIONSHIP.format( - from_module=module_qn, - to_module=module_path, - full_name=full_name, - ) + ls.IMP_CREATED_RELATIONSHIP, + from_module=module_qn, + to_module=module_path, + full_name=full_name, ) except 
Exception as e: - logger.warning(ls.IMP_PARSE_FAILED.format(module=module_qn, error=e)) + logger.warning(ls.IMP_PARSE_FAILED, module=module_qn, error=e) def _parse_python_imports(self, captures: dict, module_qn: str) -> None: all_imports = captures.get(cs.CAPTURE_IMPORT, []) + captures.get( @@ -159,7 +204,7 @@ def _handle_dotted_name_import(self, child: Node, module_qn: str) -> None: local_name = module_name.split(cs.SEPARATOR_DOT)[0] full_name = self._resolve_import_full_name(module_name, local_name) self.import_mapping[module_qn][local_name] = full_name - logger.debug(ls.IMP_IMPORT.format(local=local_name, full=full_name)) + logger.debug(ls.IMP_IMPORT, local=local_name, full=full_name) def _handle_aliased_import(self, child: Node, module_qn: str) -> None: module_name_node = child.child_by_field_name(cs.FIELD_NAME) @@ -175,23 +220,22 @@ def _handle_aliased_import(self, child: Node, module_qn: str) -> None: top_level = module_name.split(cs.SEPARATOR_DOT)[0] full_name = self._resolve_import_full_name(module_name, top_level) self.import_mapping[module_qn][alias] = full_name - logger.debug(ls.IMP_ALIASED_IMPORT.format(alias=alias, full=full_name)) + logger.debug(ls.IMP_ALIASED_IMPORT, alias=alias, full=full_name) def _resolve_import_full_name(self, module_name: str, top_level: str) -> str: + if module_name == self.project_name or module_name.startswith( + self.project_name + cs.SEPARATOR_DOT + ): + return module_name if self._is_local_module(top_level): return f"{self.project_name}{cs.SEPARATOR_DOT}{module_name}" return module_name def _is_local_module(self, module_name: str) -> bool: - return ( - (self.repo_path / module_name).is_dir() - or (self.repo_path / f"{module_name}{cs.EXT_PY}").is_file() - or (self.repo_path / module_name / cs.INIT_PY).is_file() - ) + return self._is_local_module_cached(module_name) def _is_local_java_import(self, import_path: str) -> bool: - top_level = import_path.split(cs.SEPARATOR_DOT)[0] - return (self.repo_path / top_level).is_dir() + 
return self._is_local_java_import_cached(import_path) def _resolve_java_import_path(self, import_path: str) -> str: if self._is_local_java_import(import_path): @@ -364,16 +408,26 @@ def _register_python_from_imports( if is_wildcard: wildcard_key = f"*{base_module}" self.import_mapping[module_qn][wildcard_key] = base_module - logger.debug(ls.IMP_WILDCARD_IMPORT.format(module=base_module)) + logger.debug(ls.IMP_WILDCARD_IMPORT, module=base_module) return for local_name, original_name in imported_items: full_name = f"{base_module}{cs.SEPARATOR_DOT}{original_name}" self.import_mapping[module_qn][local_name] = full_name - logger.debug(ls.IMP_FROM_IMPORT.format(local=local_name, full=full_name)) + logger.debug(ls.IMP_FROM_IMPORT, local=local_name, full=full_name) + + def _is_package_qn(self, module_qn: str) -> bool: + prefix = self.project_name + cs.SEPARATOR_DOT + if not module_qn.startswith(prefix): + return False + rel = module_qn[len(prefix) :].replace(cs.SEPARATOR_DOT, cs.SEPARATOR_SLASH) + return (self.repo_path / rel / cs.INIT_PY).is_file() def _resolve_relative_import(self, relative_node: Node, module_qn: str) -> str: - module_parts = module_qn.split(cs.SEPARATOR_DOT)[1:] + # (H) Relative imports are always internal; resolve to the full project- + # (H) prefixed qualified name so resolution does not depend on bare-name + # (H) locality checks (which treat package children as external). + module_parts = module_qn.split(cs.SEPARATOR_DOT) dots = 0 module_name = "" @@ -386,11 +440,21 @@ def _resolve_relative_import(self, relative_node: Node, module_qn: str) -> str: if decoded_name := safe_decode_text(child): module_name = decoded_name - target_parts = module_parts[:-dots] if dots > 0 else module_parts + # (H) A package's qualified name already IS the package, so `from .` inside + # (H) an __init__.py drops one fewer level than inside a regular module. 
+ drop = dots - 1 if self._is_package_qn(module_qn) else dots + keep = max(len(module_parts) - drop, 0) + target_parts = module_parts[:keep] if module_name: target_parts.extend(module_name.split(cs.SEPARATOR_DOT)) + # (H) A relative climb that lands at the project root (e.g. `from . import x` + # (H) in a top-level module) leaves no parts; resolve it to the project root + # (H) so the import is not silently dropped. + if not target_parts: + return self.project_name + return cs.SEPARATOR_DOT.join(target_parts) def _parse_js_ts_imports(self, captures: dict, module_qn: str) -> None: @@ -446,7 +510,7 @@ def _parse_js_import_clause( f"{source_module}{cs.IMPORT_DEFAULT_SUFFIX}" ) logger.debug( - ls.IMP_JS_DEFAULT.format(name=imported_name, module=source_module) + ls.IMP_JS_DEFAULT, name=imported_name, module=source_module ) elif child.type == cs.TS_NAMED_IMPORTS: @@ -465,11 +529,10 @@ def _parse_js_import_clause( f"{source_module}{cs.SEPARATOR_DOT}{imported_name}" ) logger.debug( - ls.IMP_JS_NAMED.format( - local=local_name, - module=source_module, - name=imported_name, - ) + ls.IMP_JS_NAMED, + local=local_name, + module=source_module, + name=imported_name, ) elif child.type == cs.TS_NAMESPACE_IMPORT: @@ -480,9 +543,9 @@ def _parse_js_import_clause( source_module ) logger.debug( - ls.IMP_JS_NAMESPACE.format( - name=namespace_name, module=source_module - ) + ls.IMP_JS_NAMESPACE, + name=namespace_name, + module=source_module, ) break @@ -521,9 +584,9 @@ def _parse_js_require(self, decl_node: Node, current_module: str) -> None: resolved_module ) logger.debug( - ls.IMP_JS_REQUIRE.format( - var=var_name, module=resolved_module - ) + ls.IMP_JS_REQUIRE, + var=var_name, + module=resolved_module, ) break @@ -544,7 +607,7 @@ def _parse_js_reexport(self, export_node: Node, current_module: str) -> None: if child.type == cs.TS_ASTERISK: wildcard_key = f"*{source_module}" self.import_mapping[current_module][wildcard_key] = source_module - 
logger.debug(ls.IMP_JS_NAMESPACE_REEXPORT.format(module=source_module)) + logger.debug(ls.IMP_JS_NAMESPACE_REEXPORT, module=source_module) elif child.type == cs.TS_EXPORT_CLAUSE: for grandchild in child.children: if grandchild.type == cs.TS_EXPORT_SPECIFIER: @@ -561,11 +624,10 @@ def _parse_js_reexport(self, export_node: Node, current_module: str) -> None: f"{source_module}{cs.SEPARATOR_DOT}{original_name}" ) logger.debug( - ls.IMP_JS_REEXPORT.format( - exported=exported_name, - module=source_module, - original=original_name, - ) + ls.IMP_JS_REEXPORT, + exported=exported_name, + module=source_module, + original=original_name, ) def _parse_java_imports(self, captures: dict, module_qn: str) -> None: @@ -589,22 +651,22 @@ def _parse_java_imports(self, captures: dict, module_qn: str) -> None: resolved_path = self._resolve_java_import_path(imported_path) if is_wildcard: - logger.debug(ls.IMP_JAVA_WILDCARD.format(path=resolved_path)) + logger.debug(ls.IMP_JAVA_WILDCARD, path=resolved_path) self.import_mapping[module_qn][f"*{resolved_path}"] = resolved_path elif parts := resolved_path.split(cs.SEPARATOR_DOT): imported_name = parts[-1] self.import_mapping[module_qn][imported_name] = resolved_path if is_static: logger.debug( - ls.IMP_JAVA_STATIC.format( - name=imported_name, path=resolved_path - ) + ls.IMP_JAVA_STATIC, + name=imported_name, + path=resolved_path, ) else: logger.debug( - ls.IMP_JAVA_IMPORT.format( - name=imported_name, path=resolved_path - ) + ls.IMP_JAVA_IMPORT, + name=imported_name, + path=resolved_path, ) def _parse_rust_imports(self, captures: dict, module_qn: str) -> None: @@ -617,7 +679,7 @@ def _parse_rust_use_declaration(self, use_node: Node, module_qn: str) -> None: for imported_name, full_path in imports.items(): self.import_mapping[module_qn][imported_name] = full_path - logger.debug(ls.IMP_RUST.format(name=imported_name, path=full_path)) + logger.debug(ls.IMP_RUST, name=imported_name, path=full_path) def _parse_go_imports(self, captures: dict, 
module_qn: str) -> None: for import_node in captures.get(cs.CAPTURE_IMPORT, []): @@ -646,7 +708,7 @@ def _parse_go_import_spec(self, spec_node: Node, module_qn: str) -> None: if import_path: package_name = alias_name or import_path.split(cs.SEPARATOR_SLASH)[-1] self.import_mapping[module_qn][package_name] = import_path - logger.debug(ls.IMP_GO.format(package=package_name, path=import_path)) + logger.debug(ls.IMP_GO, package=package_name, path=import_path) def _parse_cpp_imports(self, captures: dict, module_qn: str) -> None: for import_node in captures.get(cs.CAPTURE_IMPORT, []): @@ -692,9 +754,10 @@ def _parse_cpp_include(self, include_node: Node, module_qn: str) -> None: self.import_mapping[module_qn][local_name] = full_name logger.debug( - ls.IMP_CPP_INCLUDE.format( - local=local_name, full=full_name, system=is_system_include - ) + ls.IMP_CPP_INCLUDE, + local=local_name, + full=full_name, + system=is_system_include, ) def _parse_cpp_module_import(self, import_node: Node, module_qn: str) -> None: @@ -727,7 +790,7 @@ def _parse_cpp_module_import(self, import_node: Node, module_qn: str) -> None: full_name = f"{cs.IMPORT_STD_PREFIX}{module_name}" self.import_mapping[module_qn][local_name] = full_name - logger.debug(ls.IMP_CPP_MODULE.format(local=local_name, full=full_name)) + logger.debug(ls.IMP_CPP_MODULE, local=local_name, full=full_name) def _parse_cpp_module_declaration(self, decl_node: Node, module_qn: str) -> None: decoded_text = safe_decode_text(decl_node) @@ -757,9 +820,9 @@ def _parse_cpp_module_declaration(self, decl_node: Node, module_qn: str) -> None full_name = f"{self.project_name}{cs.SEPARATOR_DOT}{partition_part}" self.import_mapping[module_qn][partition_name] = full_name logger.debug( - ls.IMP_CPP_PARTITION.format( - partition=partition_name, full=full_name - ) + ls.IMP_CPP_PARTITION, + partition=partition_name, + full=full_name, ) def _register_cpp_module_mapping( @@ -769,16 +832,74 @@ def _register_cpp_module_mapping( 
self.import_mapping[module_qn][module_name] = ( f"{self.project_name}{cs.SEPARATOR_DOT}{module_name}" ) - logger.debug(log_template.format(name=module_name)) + logger.debug(log_template, name=module_name) + + _PHP_INCLUDE_REQUIRE_TYPES = frozenset( + { + cs.TS_PHP_INCLUDE_EXPRESSION, + cs.TS_PHP_INCLUDE_ONCE_EXPRESSION, + cs.TS_PHP_REQUIRE_EXPRESSION, + cs.TS_PHP_REQUIRE_ONCE_EXPRESSION, + } + ) + + def _parse_php_imports(self, captures: dict, module_qn: str) -> None: + all_imports = captures.get(cs.CAPTURE_IMPORT, []) + captures.get( + cs.CAPTURE_IMPORT_FROM, [] + ) + for import_node in all_imports: + if import_node.type == cs.TS_PHP_NAMESPACE_USE_DECLARATION: + self._handle_php_use_declaration(import_node, module_qn) + elif import_node.type in self._PHP_INCLUDE_REQUIRE_TYPES: + self._handle_php_include_require(import_node, module_qn) + + def _handle_php_use_declaration(self, use_node: Node, module_qn: str) -> None: + for child in use_node.named_children: + if child.type != cs.TS_PHP_NAMESPACE_USE_CLAUSE: + continue + qn_node = next( + (c for c in child.named_children if c.type == cs.TS_PHP_QUALIFIED_NAME), + None, + ) + if not qn_node: + continue + imported_path = safe_decode_with_fallback(qn_node) + if not imported_path: + continue + imported_path = imported_path.replace("\\", cs.SEPARATOR_DOT) + alias_node = child.child_by_field_name("alias") + if alias_node and alias_node.text: + local_name = safe_decode_with_fallback(alias_node) + else: + parts = imported_path.split(cs.SEPARATOR_DOT) + local_name = parts[-1] if parts else imported_path + self.import_mapping[module_qn][local_name] = imported_path + + def _handle_php_include_require(self, node: Node, module_qn: str) -> None: + for child in node.children: + if child.type in {"string", "encapsed_string"}: + raw = safe_decode_with_fallback(child) + if not raw: + continue + path_str = raw.strip("'\"") + path_str = path_str.replace("/", cs.SEPARATOR_DOT).replace( + "\\", cs.SEPARATOR_DOT + ) + if 
path_str.endswith(".php"): + path_str = path_str[:-4] + parts = path_str.split(cs.SEPARATOR_DOT) + local_name = parts[-1] if parts else path_str + self.import_mapping[module_qn][local_name] = path_str + return def _parse_generic_imports( self, captures: dict, module_qn: str, lang_config: LanguageSpec ) -> None: for import_node in captures.get(cs.CAPTURE_IMPORT, []): logger.debug( - ls.IMP_GENERIC.format( - language=lang_config.language, node_type=import_node.type - ) + ls.IMP_GENERIC, + language=lang_config.language, + node_type=import_node.type, ) def _parse_lua_imports(self, captures: dict, module_qn: str) -> None: diff --git a/codebase_rag/parsers/java/method_resolver.py b/codebase_rag/parsers/java/method_resolver.py index 01bd25cae..08b9514fb 100644 --- a/codebase_rag/parsers/java/method_resolver.py +++ b/codebase_rag/parsers/java/method_resolver.py @@ -8,9 +8,14 @@ from ... import constants as cs from ... import logs as ls +from ...decorators import recursion_guard from ...types_defs import ASTNode, NodeType from ..utils import safe_decode_text -from .utils import extract_method_call_info, get_class_context_from_qn +from .utils import ( + extract_class_info, + extract_method_call_info, + get_class_context_from_qn, +) if TYPE_CHECKING: from pathlib import Path @@ -20,6 +25,7 @@ class JavaMethodResolverMixin: + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol project_name: str @@ -53,14 +59,29 @@ def _get_current_class_name(self, module_qn: str) -> str | None: ... @abstractmethod def _lookup_variable_type(self, var_name: str, module_qn: str) -> str | None: ... + @abstractmethod + def _lookup_java_field_type( + self, class_type: str, field_name: str, module_qn: str + ) -> str | None: ... + + @abstractmethod + def _find_containing_java_class(self, node: ASTNode) -> ASTNode | None: ... 
+ def _resolve_java_object_type( - self, object_ref: str, local_var_types: dict[str, str], module_qn: str + self, + object_ref: str, + local_var_types: dict[str, str], + module_qn: str, + context_node: ASTNode | None = None, ) -> str | None: if object_ref in local_var_types: return local_var_types[object_ref] - # (H) Check for 'this' reference - find the containing class (using trie for O(k) lookup) + # (H) Check for 'this' reference - prefer the lexical containing class (precise in + # (H) multi-class files); fall back to the first class under the module otherwise. if object_ref == cs.JAVA_KEYWORD_THIS: + if lexical := self._lexical_class_qn(context_node, module_qn): + return lexical return next( ( str(qn) @@ -72,8 +93,13 @@ def _resolve_java_object_type( None, ) - # (H) Check for 'super' reference - for super calls, look at parent classes (using trie for O(k) lookup) + # (H) Check for 'super' reference - resolve the lexical class then its parent when + # (H) available; otherwise fall back to the first class under the module with a parent. if object_ref == cs.JAVA_KEYWORD_SUPER: + if (lexical := self._lexical_class_qn(context_node, module_qn)) and ( + parent_qn := self._find_parent_class(lexical) + ): + return parent_qn for qn, entity_type in self.function_registry.find_with_prefix(module_qn): if entity_type == NodeType.CLASS: if parent_qn := self._find_parent_class(qn): @@ -92,8 +118,54 @@ def _resolve_java_object_type( ): return simple_class_qn + # (H) A receiver like `obj.engine` (field access on a typed variable) is not a + # (H) single name: resolve the base, then walk each field's declared type across + # (H) classes so `obj.engine.start()` and deeper chains resolve to a method. 
+ if cs.SEPARATOR_DOT in object_ref: + return self._resolve_field_access_chain_type( + object_ref, local_var_types, module_qn, context_node + ) + return None + def _lexical_class_qn( + self, context_node: ASTNode | None, module_qn: str + ) -> str | None: + if context_node is None: + return None + if not (class_node := self._find_containing_java_class(context_node)): + return None + if not (class_name := extract_class_info(class_node).get(cs.FIELD_NAME)): + return None + return self._resolve_java_type_name(class_name, module_qn) + + def _resolve_field_access_chain_type( + self, + object_ref: str, + local_var_types: dict[str, str], + module_qn: str, + context_node: ASTNode | None = None, + ) -> str | None: + parts = object_ref.split(cs.SEPARATOR_DOT) + if len(parts) < 2: + return None + + current_type = self._resolve_java_object_type( + parts[0], local_var_types, module_qn, context_node + ) + if not current_type: + return None + + for field_name in parts[1:]: + next_type = self._lookup_java_field_type( + current_type, field_name, module_qn + ) + if not next_type: + return None + current_type = next_type + + return current_type + def _find_parent_class(self, class_qn: str) -> str | None: parent_classes = self.class_inheritance.get(class_qn, []) return parent_classes[0] if parent_classes else None @@ -202,6 +274,10 @@ def _is_matching_method(self, member: str, method_name: str) -> bool: or member == f"{method_name}{cs.EMPTY_PARENS}" ) + @recursion_guard( + key_func=lambda self, class_qn, *_, **__: class_qn, + guard_name=cs.GUARD_INHERITED_METHOD, + ) def _find_inherited_method( self, class_qn: str, method_name: str, module_qn: str ) -> tuple[str, str] | None: @@ -235,8 +311,10 @@ def _resolve_java_method_return_type( parts = method_call.split(cs.SEPARATOR_DOT) if len(parts) < 2: method_name = method_call - if current_class_qn := self._get_current_class_name(module_qn): - return self._find_method_return_type(current_class_qn, method_name) + if (current_class_qn := 
self._get_current_class_name(module_qn)) and ( + result := self._find_method_return_type(current_class_qn, method_name) + ): + return result else: object_part = cs.SEPARATOR_DOT.join(parts[:-1]) method_name = parts[-1] @@ -348,34 +426,32 @@ def _do_resolve_java_method_call( logger.debug(ls.JAVA_NO_METHOD_NAME) return None - logger.debug( - ls.JAVA_RESOLVING_CALL.format(method=method_name, object=object_ref) - ) + logger.debug(ls.JAVA_RESOLVING_CALL, method=method_name, object=object_ref) if not object_ref: - logger.debug(ls.JAVA_RESOLVING_STATIC.format(method=method_name)) + logger.debug(ls.JAVA_RESOLVING_STATIC, method=method_name) result = self._resolve_static_or_local_method(str(method_name), module_qn) if result: - logger.debug(ls.JAVA_FOUND_STATIC.format(result=result)) + logger.debug(ls.JAVA_FOUND_STATIC, result=result) else: - logger.debug(ls.JAVA_STATIC_NOT_FOUND.format(method=method_name)) + logger.debug(ls.JAVA_STATIC_NOT_FOUND, method=method_name) return result - logger.debug(ls.JAVA_RESOLVING_OBJ_TYPE.format(object=object_ref)) + logger.debug(ls.JAVA_RESOLVING_OBJ_TYPE, object=object_ref) if not ( object_type := self._resolve_java_object_type( - str(object_ref), local_var_types, module_qn + str(object_ref), local_var_types, module_qn, call_node ) ): - logger.debug(ls.JAVA_OBJ_TYPE_UNKNOWN.format(object=object_ref)) + logger.debug(ls.JAVA_OBJ_TYPE_UNKNOWN, object=object_ref) return None - logger.debug(ls.JAVA_OBJ_TYPE_RESOLVED.format(type=object_type)) + logger.debug(ls.JAVA_OBJ_TYPE_RESOLVED, type=object_type) result = self._resolve_instance_method(object_type, str(method_name), module_qn) if result: - logger.debug(ls.JAVA_FOUND_INSTANCE.format(result=result)) + logger.debug(ls.JAVA_FOUND_INSTANCE, result=result) else: logger.debug( - ls.JAVA_INSTANCE_NOT_FOUND.format(type=object_type, method=method_name) + ls.JAVA_INSTANCE_NOT_FOUND, type=object_type, method=method_name ) return result diff --git a/codebase_rag/parsers/java/type_inference.py 
b/codebase_rag/parsers/java/type_inference.py index 8fd86a7d2..2e949e5f5 100644 --- a/codebase_rag/parsers/java/type_inference.py +++ b/codebase_rag/parsers/java/type_inference.py @@ -26,6 +26,21 @@ class JavaTypeInferenceEngine( JavaVariableAnalyzerMixin, JavaMethodResolverMixin, ): + __slots__ = ( + "import_processor", + "function_registry", + "repo_path", + "project_name", + "ast_cache", + "queries", + "module_qn_to_file_path", + "class_inheritance", + "simple_name_lookup", + "_lookup_cache", + "_lookup_in_progress", + "_fqn_to_module_qn", + ) + def __init__( self, import_processor: ImportProcessor, @@ -83,17 +98,19 @@ def build_variable_type_map( try: self._collect_all_variable_types(scope_node, local_var_types, module_qn) - logger.debug(ls.JAVA_VAR_TYPE_MAP_BUILT.format(count=len(local_var_types))) + logger.debug(ls.JAVA_VAR_TYPE_MAP_BUILT, count=len(local_var_types)) except Exception as e: - logger.error(ls.JAVA_VAR_TYPE_MAP_FAILED.format(error=e)) + logger.error(ls.JAVA_VAR_TYPE_MAP_FAILED, error=e) return local_var_types def resolve_java_method_call( - self, call_node: ASTNode, local_var_types: dict[str, str], module_qn: str + self, call_node: ASTNode, local_var_types: dict[str, str] | None, module_qn: str ) -> tuple[str, str] | None: - return self._do_resolve_java_method_call(call_node, local_var_types, module_qn) + return self._do_resolve_java_method_call( + call_node, local_var_types or {}, module_qn + ) def _find_containing_java_class(self, node: ASTNode) -> ASTNode | None: current = node.parent diff --git a/codebase_rag/parsers/java/type_resolver.py b/codebase_rag/parsers/java/type_resolver.py index cbb69fcf7..f1827e6e5 100644 --- a/codebase_rag/parsers/java/type_resolver.py +++ b/codebase_rag/parsers/java/type_resolver.py @@ -20,6 +20,7 @@ class JavaTypeResolverMixin: + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol module_qn_to_file_path: dict[str, Path] diff --git a/codebase_rag/parsers/java/utils.py 
b/codebase_rag/parsers/java/utils.py index f267afe47..77784a746 100644 --- a/codebase_rag/parsers/java/utils.py +++ b/codebase_rag/parsers/java/utils.py @@ -114,15 +114,36 @@ def _extract_superclass(class_node: ASTNode) -> str | None: superclass_node = class_node.child_by_field_name(cs.TS_FIELD_SUPERCLASS) if not superclass_node: return None + return _extract_type_identifier_name(superclass_node) - match superclass_node.type: + +def _extract_type_identifier_name(node: ASTNode) -> str | None: + match node.type: case cs.TS_TYPE_IDENTIFIER: - return safe_decode_text(superclass_node) + return safe_decode_text(node) + case cs.TS_SCOPED_TYPE_IDENTIFIER: + # (H) `Outer.Base`/`pkg.Base`: keep the full scoped name rather than + # (H) descending to the first segment (the outer/package), which would + # (H) point resolution at the wrong class. + return safe_decode_text(node) case cs.TS_GENERIC_TYPE: - for child in superclass_node.children: - if child.type == cs.TS_TYPE_IDENTIFIER: + # (H) The base of a generic type is its first type_identifier/scoped child + # (H) (e.g. `Box` -> Box, `Outer.Base` -> Outer.Base); ignore the + # (H) type_arguments that follow. + for child in node.children: + if child.type in ( + cs.TS_TYPE_IDENTIFIER, + cs.TS_SCOPED_TYPE_IDENTIFIER, + ): return safe_decode_text(child) - return None + return None + case _: + # (H) `extends X` exposes a `superclass` wrapper node, not the type itself; + # (H) descend into it to reach the type_identifier/generic_type. 
+ for child in node.children: + if name := _extract_type_identifier_name(child): + return name + return None def _extract_interface_name(type_child: ASTNode) -> str | None: diff --git a/codebase_rag/parsers/java/variable_analyzer.py b/codebase_rag/parsers/java/variable_analyzer.py index 65003d9bb..022ddf18d 100644 --- a/codebase_rag/parsers/java/variable_analyzer.py +++ b/codebase_rag/parsers/java/variable_analyzer.py @@ -23,8 +23,10 @@ class JavaVariableAnalyzerMixin: + __slots__ = () ast_cache: ASTCacheProtocol module_qn_to_file_path: dict[str, Path] + class_inheritance: dict[str, list[str]] _lookup_cache: dict[str, str | None] _lookup_in_progress: set[str] @@ -84,7 +86,7 @@ def _process_formal_parameter( if param_name and param_type: resolved_type = self._resolve_java_type_name(param_type, module_qn) local_var_types[param_name] = resolved_type - logger.debug(ls.JAVA_PARAM.format(name=param_name, type=resolved_type)) + logger.debug(ls.JAVA_PARAM, name=param_name, type=resolved_type) def _process_spread_parameter( self, param_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -103,9 +105,7 @@ def _process_spread_parameter( if param_name and param_type: resolved_type = self._resolve_java_type_name(param_type, module_qn) local_var_types[param_name] = resolved_type - logger.debug( - ls.JAVA_VARARGS_PARAM.format(name=param_name, type=resolved_type) - ) + logger.debug(ls.JAVA_VARARGS_PARAM, name=param_name, type=resolved_type) def _analyze_java_local_variables( self, scope_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -164,15 +164,13 @@ def _process_variable_declarator( resolved_type = self._resolve_java_type_name(inferred_type, module_qn) local_var_types[var_name] = resolved_type logger.debug( - ls.JAVA_LOCAL_VAR_INFERRED.format(name=var_name, type=resolved_type) + ls.JAVA_LOCAL_VAR_INFERRED, name=var_name, type=resolved_type ) return resolved_type = self._resolve_java_type_name(declared_type, module_qn) local_var_types[var_name] = 
resolved_type - logger.debug( - ls.JAVA_LOCAL_VAR_DECLARED.format(name=var_name, type=resolved_type) - ) + logger.debug(ls.JAVA_LOCAL_VAR_DECLARED, name=var_name, type=resolved_type) def _analyze_java_class_fields( self, scope_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -201,7 +199,7 @@ def _analyze_java_class_fields( if str(field_name) not in local_var_types: local_var_types[str(field_name)] = resolved_type logger.debug( - ls.JAVA_CLASS_FIELD.format(name=field_name, type=resolved_type) + ls.JAVA_CLASS_FIELD, name=field_name, type=resolved_type ) def _analyze_java_constructor_assignments( @@ -235,7 +233,7 @@ def _process_java_assignment( ): resolved_type = self._resolve_java_type_name(inferred_type, module_qn) local_var_types[var_name] = resolved_type - logger.debug(ls.JAVA_ASSIGNMENT.format(name=var_name, type=resolved_type)) + logger.debug(ls.JAVA_ASSIGNMENT, name=var_name, type=resolved_type) def _extract_java_variable_reference(self, node: ASTNode) -> str | None: match node.type: @@ -297,9 +295,7 @@ def _register_for_loop_variable( ): resolved_type = self._resolve_java_type_name(var_type, module_qn) local_var_types[var_name] = resolved_type - logger.debug( - ls.JAVA_ENHANCED_FOR_VAR.format(name=var_name, type=resolved_type) - ) + logger.debug(ls.JAVA_ENHANCED_FOR_VAR, name=var_name, type=resolved_type) def _extract_for_loop_variable_from_children( self, for_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -325,9 +321,9 @@ def _extract_for_loop_variable_from_children( ) local_var_types[var_name] = resolved_type logger.debug( - ls.JAVA_ENHANCED_FOR_VAR_ALT.format( - name=var_name, type=resolved_type - ) + ls.JAVA_ENHANCED_FOR_VAR_ALT, + name=var_name, + type=resolved_type, ) break @@ -399,16 +395,51 @@ def _infer_java_field_access_type( if not object_node or not field_node: return None - object_name = safe_decode_text(object_node) field_name = safe_decode_text(field_node) - - if not object_name or not field_name: + if not 
field_name: return None - if object_type := self._lookup_variable_type(object_name, module_qn): + # (H) A nested receiver (`obj.address.zipCode`) has a field_access as its object; + # (H) recurse to infer that inner type before looking up the outer field, so + # (H) multi-level field access resolves rather than failing on a non-variable name. + if object_node.type == cs.TS_FIELD_ACCESS: + object_type = self._infer_java_field_access_type(object_node, module_qn) + elif object_name := safe_decode_text(object_node): + object_type = self._resolve_field_access_base_type( + object_name, field_access_node, module_qn + ) + else: + object_type = None + + if object_type: return self._lookup_java_field_type(object_type, field_name, module_qn) return None + def _resolve_field_access_base_type( + self, object_name: str, field_access_node: ASTNode, module_qn: str + ) -> str | None: + # (H) `this`/`super` are receiver keywords, not variables: resolve them to the + # (H) containing class (or its superclass) so nested chains rooted at them + # (H) (e.g. `var c = this.address.city`) infer a type instead of failing. + if object_name in (cs.JAVA_KEYWORD_THIS, cs.JAVA_KEYWORD_SUPER): + if not (class_node := self._find_containing_java_class(field_access_node)): + return None + class_info = extract_class_info(class_node) + class_name = class_info.get(cs.FIELD_NAME) + if object_name == cs.JAVA_KEYWORD_THIS: + return class_name + # (H) `super`: return the fully-qualified parent from class_inheritance so a + # (H) nested superclass (`Outer.Base`) resolves; the relative name from the + # (H) AST would be treated as an absolute class key by the field lookup. 
+ if class_name: + own_qn = self._resolve_java_type_name(class_name, module_qn) + if cs.SEPARATOR_DOT not in own_qn: + own_qn = f"{module_qn}{cs.SEPARATOR_DOT}{own_qn}" + if parents := self.class_inheritance.get(own_qn): + return parents[0] + return class_info.get(cs.FIELD_SUPERCLASS) + return self._lookup_variable_type(object_name, module_qn) + def _lookup_variable_type(self, var_name: str, module_qn: str) -> str | None: if not var_name or not module_qn: return None @@ -448,45 +479,82 @@ def _lookup_java_field_type( if not class_type or not field_name: return None - resolved_class_type = self._resolve_java_type_name(class_type, module_qn) - - class_qn = ( - resolved_class_type - if cs.SEPARATOR_DOT in resolved_class_type - else f"{module_qn}{cs.SEPARATOR_DOT}{resolved_class_type}" + resolved = self._resolve_java_type_name(class_type, module_qn) + class_qn: str | None = ( + resolved + if cs.SEPARATOR_DOT in resolved + else f"{module_qn}{cs.SEPARATOR_DOT}{resolved}" ) - parts = class_qn.split(cs.SEPARATOR_DOT) - if len(parts) < 2: - return None - - target_module_qn = cs.SEPARATOR_DOT.join(parts[:-1]) - target_class_name = parts[-1] - - file_path = self.module_qn_to_file_path.get(target_module_qn) - if file_path is None or file_path not in self.ast_cache: - return None + # (H) Walk the inheritance chain using authoritative qualified parents from + # (H) class_inheritance: a field accessed on a subclass may be declared on a + # (H) superclass, including a nested one like `Outer.Base`. Seen-guarded. 
+ seen: set[str] = set() + while class_qn and class_qn not in seen: + seen.add(class_qn) + if located := self._locate_class(class_qn): + root_node, class_path, target_module_qn = located + if field_type := self._find_field_type_in_nested_class( + root_node, class_path, field_name, target_module_qn + ): + return field_type + parents = self.class_inheritance.get(class_qn) + class_qn = parents[0] if parents else None - root_node, _ = self.ast_cache[file_path] + return None - return self._find_field_type_in_class( - root_node, target_class_name, field_name, target_module_qn - ) + def _locate_class(self, class_qn: str) -> tuple[ASTNode, list[str], str] | None: + # (H) The file module is the longest registered prefix of the class qn; the + # (H) remaining segments are the (possibly nested) class path within that file, + # (H) so `proj.pkg.Outer.Base` resolves to file `proj.pkg` + path [Outer, Base]. + parts = class_qn.split(cs.SEPARATOR_DOT) + for split in range(len(parts) - 1, 0, -1): + module_candidate = cs.SEPARATOR_DOT.join(parts[:split]) + file_path = self.module_qn_to_file_path.get(module_candidate) + if file_path is not None and file_path in self.ast_cache: + root_node, _ = self.ast_cache[file_path] + return root_node, parts[split:], module_candidate + return None def _find_field_type_in_class( self, root_node: ASTNode, class_name: str, field_name: str, module_qn: str ) -> str | None: - for child in root_node.children: - if child.type == cs.TS_CLASS_DECLARATION: - class_info = extract_class_info(child) - if class_info.get(cs.FIELD_NAME) == class_name: - if class_body := child.child_by_field_name(cs.FIELD_BODY): - for field_child in class_body.children: - if field_child.type == cs.TS_FIELD_DECLARATION: - field_info = extract_field_info(field_child) - if field_info.get(cs.FIELD_NAME) == field_name: - if field_type := field_info.get(cs.FIELD_TYPE): - return self._resolve_java_type_name( - str(field_type), module_qn - ) + return self._find_field_type_in_nested_class( 
+ root_node, [class_name], field_name, module_qn + ) + + def _find_field_type_in_nested_class( + self, + root_node: ASTNode, + class_path: list[str], + field_name: str, + module_qn: str, + ) -> str | None: + children = root_node.children + body: ASTNode | None = None + for class_name in class_path: + class_node = next( + ( + child + for child in children + if child.type == cs.TS_CLASS_DECLARATION + and extract_class_info(child).get(cs.FIELD_NAME) == class_name + ), + None, + ) + if class_node is None or not ( + body := class_node.child_by_field_name(cs.FIELD_BODY) + ): + return None + children = body.children + + if body is None: + return None + + for field_child in body.children: + if field_child.type == cs.TS_FIELD_DECLARATION: + field_info = extract_field_info(field_child) + if field_info.get(cs.FIELD_NAME) == field_name: + if field_type := field_info.get(cs.FIELD_TYPE): + return self._resolve_java_type_name(str(field_type), module_qn) return None diff --git a/codebase_rag/parsers/js_ts/ingest.py b/codebase_rag/parsers/js_ts/ingest.py index 30580e184..2641ae367 100644 --- a/codebase_rag/parsers/js_ts/ingest.py +++ b/codebase_rag/parsers/js_ts/ingest.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING from loguru import logger -from tree_sitter import Query, QueryCursor +from tree_sitter import QueryCursor from ... import constants as cs from ... 
import logs as lg @@ -16,7 +16,12 @@ PropertyDict, SimpleNameLookup, ) -from ..utils import safe_decode_text, safe_decode_with_fallback +from ..utils import ( + get_cached_query, + safe_decode_text, + safe_decode_with_fallback, + sorted_captures, +) from .module_system import JsTsModuleSystemMixin from .utils import get_js_ts_language_obj @@ -29,6 +34,7 @@ class JsTsIngestMixin(JsTsModuleSystemMixin): + __slots__ = () ingestor: IngestorProtocol repo_path: Path project_name: str @@ -88,14 +94,14 @@ def _ingest_prototype_inheritance_links( language_obj, root_node, module_qn ) except Exception as e: - logger.debug(lg.JS_PROTOTYPE_INHERITANCE_FAILED.format(error=e)) + logger.debug(lg.JS_PROTOTYPE_INHERITANCE_FAILED, error=e) def _process_prototype_inheritance_captures( self, language_obj, root_node, module_qn ): - query = Query(language_obj, cs.JS_PROTOTYPE_INHERITANCE_QUERY) + query = get_cached_query(language_obj, cs.JS_PROTOTYPE_INHERITANCE_QUERY) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) child_classes = captures.get(cs.CAPTURE_CHILD_CLASS, []) parent_classes = captures.get(cs.CAPTURE_PARENT_CLASS, []) @@ -122,9 +128,7 @@ def _process_prototype_inheritance_captures( ) logger.debug( - lg.JS_PROTOTYPE_INHERITANCE.format( - child_qn=child_qn, parent_qn=parent_qn - ) + lg.JS_PROTOTYPE_INHERITANCE, child_qn=child_qn, parent_qn=parent_qn ) def _ingest_prototype_method_assignments( @@ -143,12 +147,12 @@ def _ingest_prototype_method_assignments( try: self._process_prototype_method_captures(language_obj, root_node, module_qn) except Exception as e: - logger.debug(lg.JS_PROTOTYPE_METHODS_FAILED.format(error=e)) + logger.debug(lg.JS_PROTOTYPE_METHODS_FAILED, error=e) def _process_prototype_method_captures(self, language_obj, root_node, module_qn): - method_query = Query(language_obj, cs.JS_PROTOTYPE_METHOD_QUERY) + method_query = get_cached_query(language_obj, cs.JS_PROTOTYPE_METHOD_QUERY) method_cursor 
= QueryCursor(method_query) - method_captures = method_cursor.captures(root_node) + method_captures = sorted_captures(method_cursor, root_node) constructor_names = method_captures.get(cs.CAPTURE_CONSTRUCTOR_NAME, []) method_names = method_captures.get(cs.CAPTURE_METHOD_NAME, []) @@ -165,6 +169,9 @@ def _process_prototype_method_captures(self, language_obj, root_node, module_qn) if constructor_name and method_name: constructor_qn = f"{module_qn}{cs.SEPARATOR_DOT}{constructor_name}" method_qn = f"{constructor_qn}{cs.SEPARATOR_DOT}{method_name}" + method_qn = self.function_registry.register_unique_qn( + method_qn, func_node.start_point[0] + 1 + ) method_props: PropertyDict = { cs.KEY_QUALIFIED_NAME: method_qn, @@ -174,9 +181,9 @@ def _process_prototype_method_captures(self, language_obj, root_node, module_qn) cs.KEY_DOCSTRING: self._get_docstring(func_node), } logger.info( - lg.JS_PROTOTYPE_METHOD_FOUND.format( - method_name=method_name, method_qn=method_qn - ) + lg.JS_PROTOTYPE_METHOD_FOUND, + method_name=method_name, + method_qn=method_qn, ) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, method_props) @@ -190,9 +197,9 @@ def _process_prototype_method_captures(self, language_obj, root_node, module_qn) ) logger.debug( - lg.JS_PROTOTYPE_METHOD_DEFINES.format( - constructor_qn=constructor_qn, method_qn=method_qn - ) + lg.JS_PROTOTYPE_METHOD_DEFINES, + constructor_qn=constructor_qn, + method_qn=method_qn, ) def _ingest_object_literal_methods( @@ -213,7 +220,7 @@ def _ingest_object_literal_methods( language_obj, query_text, root_node, module_qn, lang_config ) except Exception as e: - logger.debug(lg.JS_OBJECT_METHODS_DETECT_FAILED.format(error=e)) + logger.debug(lg.JS_OBJECT_METHODS_DETECT_FAILED, error=e) def _process_object_method_query( self, @@ -224,9 +231,9 @@ def _process_object_method_query( lang_config, ) -> None: try: - query = Query(language_obj, query_text) + query = get_cached_query(language_obj, query_text) cursor = QueryCursor(query) - captures = 
cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) method_names = captures.get(cs.CAPTURE_METHOD_NAME, []) method_functions = captures.get(cs.CAPTURE_METHOD_FUNCTION, []) @@ -250,7 +257,7 @@ def _process_object_method_query( method_name_node, method_func_node, module_qn, lang_config ) except Exception as e: - logger.debug(lg.JS_OBJECT_METHODS_PROCESS_FAILED.format(error=e)) + logger.debug(lg.JS_OBJECT_METHODS_PROCESS_FAILED, error=e) def _process_single_object_method( self, @@ -306,6 +313,9 @@ def _register_object_method( method_func_node: ASTNode, module_qn: str, ) -> None: + method_qn = self.function_registry.register_unique_qn( + method_qn, method_func_node.start_point[0] + 1 + ) method_props: PropertyDict = { cs.KEY_QUALIFIED_NAME: method_qn, cs.KEY_NAME: method_name, @@ -314,9 +324,7 @@ def _register_object_method( cs.KEY_DOCSTRING: self._get_docstring(method_func_node), } logger.info( - lg.JS_OBJECT_METHOD_FOUND.format( - method_name=method_name, method_qn=method_qn - ) + lg.JS_OBJECT_METHOD_FOUND, method_name=method_name, method_qn=method_qn ) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, method_props) @@ -352,7 +360,7 @@ def _ingest_assignment_arrow_functions( lang_query, query_text, root_node, module_qn, lang_config ) except Exception as e: - logger.debug(lg.JS_ASSIGNMENT_ARROW_DETECT_FAILED.format(error=e)) + logger.debug(lg.JS_ASSIGNMENT_ARROW_DETECT_FAILED, error=e) def _process_arrow_query( self, @@ -363,9 +371,9 @@ def _process_arrow_query( lang_config, ) -> None: try: - query = Query(lang_query, query_text) + query = get_cached_query(lang_query, query_text) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) method_names = captures.get(cs.CAPTURE_METHOD_NAME, []) member_exprs = captures.get(cs.CAPTURE_MEMBER_EXPR, []) @@ -390,7 +398,7 @@ def _process_arrow_query( lg.JS_ASSIGNMENT_FUNC_EXPR_FOUND, ) except Exception as e: - 
logger.debug(lg.JS_ASSIGNMENT_ARROW_QUERY_FAILED.format(error=e)) + logger.debug(lg.JS_ASSIGNMENT_ARROW_QUERY_FAILED, error=e) def _process_direct_arrow_functions( self, @@ -498,6 +506,9 @@ def _register_arrow_function( function_node: ASTNode, log_message: str, ) -> None: + function_qn = self.function_registry.register_unique_qn( + function_qn, function_node.start_point[0] + 1 + ) function_props: PropertyDict = { cs.KEY_QUALIFIED_NAME: function_qn, cs.KEY_NAME: function_name, @@ -506,9 +517,7 @@ def _register_arrow_function( cs.KEY_DOCSTRING: self._get_docstring(function_node), } - logger.debug( - log_message.format(function_name=function_name, function_qn=function_qn) - ) + logger.debug(log_message, function_name=function_name, function_qn=function_qn) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, function_props) self.function_registry[function_qn] = NodeType.FUNCTION self.simple_name_lookup[function_name].add(function_qn) diff --git a/codebase_rag/parsers/js_ts/module_system.py b/codebase_rag/parsers/js_ts/module_system.py index 436603575..c41296502 100644 --- a/codebase_rag/parsers/js_ts/module_system.py +++ b/codebase_rag/parsers/js_ts/module_system.py @@ -6,15 +6,17 @@ from typing import TYPE_CHECKING from loguru import logger -from tree_sitter import Query, QueryCursor +from tree_sitter import QueryCursor from ... import constants as cs from ... 
import logs as ls from ...types_defs import ASTNode from ..utils import ( + get_cached_query, ingest_exported_function, safe_decode_text, safe_decode_with_fallback, + sorted_captures, ) from .utils import get_js_ts_language_obj @@ -29,6 +31,7 @@ class JsTsModuleSystemMixin: + __slots__ = ("_processed_imports",) ingestor: IngestorProtocol repo_path: Path project_name: str @@ -59,9 +62,9 @@ def _ingest_missing_import_patterns( try: try: - query = Query(language_obj, cs.JS_COMMONJS_DESTRUCTURE_QUERY) + query = get_cached_query(language_obj, cs.JS_COMMONJS_DESTRUCTURE_QUERY) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) variable_declarators = captures.get(cs.CAPTURE_VARIABLE_DECLARATOR, []) @@ -71,10 +74,10 @@ def _ingest_missing_import_patterns( ) except Exception as e: - logger.debug(ls.JS_COMMONJS_DESTRUCTURE_FAILED.format(error=e)) + logger.debug(ls.JS_COMMONJS_DESTRUCTURE_FAILED, error=e) except Exception as e: - logger.debug(ls.JS_MISSING_IMPORT_PATTERNS_FAILED.format(error=e)) + logger.debug(ls.JS_MISSING_IMPORT_PATTERNS_FAILED, error=e) def _extract_require_module_name(self, declarator: ASTNode) -> str | None: name_node = declarator.child_by_field_name(cs.FIELD_NAME) @@ -148,7 +151,7 @@ def _process_variable_declarator_for_commonjs( self._process_destructured_child(child, module_name, module_qn) except Exception as e: - logger.debug(ls.JS_COMMONJS_VAR_DECLARATOR_FAILED.format(error=e)) + logger.debug(ls.JS_COMMONJS_VAR_DECLARATOR_FAILED, error=e) def _process_commonjs_import( self, imported_name: str, module_name: str, module_qn: str @@ -179,20 +182,17 @@ def _process_commonjs_import( ) logger.debug( - ls.JS_MISSING_IMPORT_PATTERN.format( - module_qn=module_qn, - imported_name=imported_name, - resolved_source_module=resolved_source_module, - ) + ls.JS_MISSING_IMPORT_PATTERN, + module_qn=module_qn, + imported_name=imported_name, + resolved_source_module=resolved_source_module, ) 
self._processed_imports.add(import_key) except Exception as e: logger.debug( - ls.JS_COMMONJS_IMPORT_FAILED.format( - imported_name=imported_name, error=e - ) + ls.JS_COMMONJS_IMPORT_FAILED, imported_name=imported_name, error=e ) def _ingest_export_function( @@ -282,9 +282,8 @@ def _ingest_commonjs_exports( for query_text in query_texts: try: - captures = QueryCursor(Query(language_obj, query_text)).captures( - root_node - ) + cursor = QueryCursor(get_cached_query(language_obj, query_text)) + captures = sorted_captures(cursor, root_node) self._process_exports_pattern( captures.get(cs.CAPTURE_EXPORTS_OBJ, []), @@ -302,7 +301,7 @@ def _ingest_commonjs_exports( ) except Exception as e: - logger.debug(ls.JS_COMMONJS_EXPORTS_QUERY_FAILED.format(error=e)) + logger.debug(ls.JS_COMMONJS_EXPORTS_QUERY_FAILED, error=e) def _ingest_es6_exports( self, @@ -320,9 +319,9 @@ def _ingest_es6_exports( ]: try: cleaned_query = textwrap.dedent(query_text).strip() - query = Query(lang_query, cleaned_query) + query = get_cached_query(lang_query, cleaned_query) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) export_names = captures.get(cs.CAPTURE_EXPORT_NAME, []) export_functions = captures.get(cs.CAPTURE_EXPORT_FUNCTION, []) @@ -365,7 +364,7 @@ def _ingest_es6_exports( ) except Exception as e: - logger.debug(ls.JS_ES6_EXPORTS_QUERY_FAILED.format(error=e)) + logger.debug(ls.JS_ES6_EXPORTS_QUERY_FAILED, error=e) except Exception as e: - logger.debug(ls.JS_ES6_EXPORTS_DETECT_FAILED.format(error=e)) + logger.debug(ls.JS_ES6_EXPORTS_DETECT_FAILED, error=e) diff --git a/codebase_rag/parsers/js_ts/type_inference.py b/codebase_rag/parsers/js_ts/type_inference.py index e4930e365..590beb44e 100644 --- a/codebase_rag/parsers/js_ts/type_inference.py +++ b/codebase_rag/parsers/js_ts/type_inference.py @@ -1,83 +1,149 @@ +from __future__ import annotations + from collections.abc import Callable +from typing import TYPE_CHECKING from 
loguru import logger +from tree_sitter import Node, QueryCursor from ... import constants as cs from ... import logs as ls from ...types_defs import ASTNode, FunctionRegistryTrieProtocol, NodeType from ..import_processor import ImportProcessor -from ..utils import safe_decode_text +from ..utils import get_cached_query, safe_decode_text from . import utils as ut +if TYPE_CHECKING: + from ...types_defs import LanguageQueries + +_JS_DECLARATOR_QUERY = "(variable_declarator) @declarator" + class JsTypeInferenceEngine: + __slots__ = ( + "import_processor", + "function_registry", + "project_name", + "_find_method_ast_node", + "_queries", + ) + def __init__( self, import_processor: ImportProcessor, function_registry: FunctionRegistryTrieProtocol, project_name: str, find_method_ast_node_func: Callable[[str], ASTNode | None], + queries: dict[cs.SupportedLanguage, LanguageQueries] | None = None, ): self.import_processor = import_processor self.function_registry = function_registry self.project_name = project_name self._find_method_ast_node = find_method_ast_node_func + self._queries = queries + + def _get_declarators_via_query( + self, caller_node: ASTNode, language: cs.SupportedLanguage | None = None + ) -> list[Node] | None: + if self._queries is None: + return None + langs = ( + [language] + if language is not None + else [cs.SupportedLanguage.JS, cs.SupportedLanguage.TS] + ) + for lang in langs: + lang_queries = self._queries.get(lang) + if lang_queries and "language" in lang_queries: + try: + q = get_cached_query(lang_queries["language"], _JS_DECLARATOR_QUERY) + cursor = QueryCursor(q) + captures = cursor.captures(caller_node) + return captures.get("declarator", []) + except Exception: + continue + return None def build_local_variable_type_map( - self, caller_node: ASTNode, module_qn: str + self, + caller_node: ASTNode, + module_qn: str, + language: cs.SupportedLanguage | None = None, ) -> dict[str, str]: local_var_types: dict[str, str] = {} - - stack: list[ASTNode] = 
[caller_node] - declarator_count = 0 - while stack: - current = stack.pop() - - if current.type == cs.TS_VARIABLE_DECLARATOR: + declarator_nodes = self._get_declarators_via_query(caller_node, language) + if declarator_nodes is not None: + for current in declarator_nodes: declarator_count += 1 name_node = current.child_by_field_name("name") value_node = current.child_by_field_name("value") - if name_node and value_node: var_name_text = name_node.text if var_name_text: var_name = safe_decode_text(name_node) if var_name is not None: logger.debug( - ls.JS_VAR_DECLARATOR_FOUND.format( - var_name=var_name, module_qn=module_qn - ) + ls.JS_VAR_DECLARATOR_FOUND, + var_name=var_name, + module_qn=module_qn, ) - if var_type := self._infer_js_variable_type_from_value( value_node, module_qn ): local_var_types[var_name] = var_type logger.debug( - ls.JS_VAR_INFERRED.format( - var_name=var_name, var_type=var_type - ) + ls.JS_VAR_INFERRED, + var_name=var_name, + var_type=var_type, ) else: + logger.debug(ls.JS_VAR_INFER_FAILED, var_name=var_name) + else: + stack: list[ASTNode] = [caller_node] + while stack: + current = stack.pop() + if current.type == cs.TS_VARIABLE_DECLARATOR: + declarator_count += 1 + name_node = current.child_by_field_name("name") + value_node = current.child_by_field_name("value") + if name_node and value_node: + var_name_text = name_node.text + if var_name_text: + var_name = safe_decode_text(name_node) + if var_name is not None: logger.debug( - ls.JS_VAR_INFER_FAILED.format(var_name=var_name) + ls.JS_VAR_DECLARATOR_FOUND, + var_name=var_name, + module_qn=module_qn, ) - - stack.extend(reversed(current.children)) + if var_type := self._infer_js_variable_type_from_value( + value_node, module_qn + ): + local_var_types[var_name] = var_type + logger.debug( + ls.JS_VAR_INFERRED, + var_name=var_name, + var_type=var_type, + ) + else: + logger.debug( + ls.JS_VAR_INFER_FAILED, var_name=var_name + ) + stack.extend(reversed(current.children)) logger.debug( - 
ls.JS_VAR_TYPE_MAP_BUILT.format( - count=len(local_var_types), declarator_count=declarator_count - ) + ls.JS_VAR_TYPE_MAP_BUILT, + count=len(local_var_types), + declarator_count=declarator_count, ) return local_var_types def _infer_js_variable_type_from_value( self, value_node: ASTNode, module_qn: str ) -> str | None: - logger.debug(ls.JS_INFER_VALUE_NODE.format(node_type=value_node.type)) + logger.debug(ls.JS_INFER_VALUE_NODE, node_type=value_node.type) if value_node.type == cs.TS_NEW_EXPRESSION: if class_name := ut.extract_constructor_name(value_node): @@ -87,28 +153,23 @@ def _infer_js_variable_type_from_value( elif value_node.type == cs.TS_CALL_EXPRESSION: func_node = value_node.child_by_field_name("function") func_type = func_node.type if func_node else cs.STR_NONE - logger.debug(ls.JS_CALL_EXPR_FUNC_NODE.format(func_type=func_type)) + logger.debug(ls.JS_CALL_EXPR_FUNC_NODE, func_type=func_type) if func_node and func_node.type == cs.TS_MEMBER_EXPRESSION: method_call_text = ut.extract_method_call(func_node) - logger.debug( - ls.JS_EXTRACTED_METHOD_CALL.format(method_call=method_call_text) - ) + logger.debug(ls.JS_EXTRACTED_METHOD_CALL, method_call=method_call_text) if method_call_text: if inferred_type := self._infer_js_method_return_type( method_call_text, module_qn ): logger.debug( - ls.JS_TYPE_INFERRED.format( - method_call=method_call_text, - inferred_type=inferred_type, - ) + ls.JS_TYPE_INFERRED, + method_call=method_call_text, + inferred_type=inferred_type, ) return inferred_type logger.debug( - ls.JS_RETURN_TYPE_INFER_FAILED.format( - method_call=method_call_text - ) + ls.JS_RETURN_TYPE_INFER_FAILED, method_call=method_call_text ) elif func_node and func_node.type == cs.TS_IDENTIFIER: @@ -116,7 +177,7 @@ def _infer_js_variable_type_from_value( if func_name: return safe_decode_text(func_node) - logger.debug(ls.JS_NO_PATTERN_MATCHED.format(node_type=value_node.type)) + logger.debug(ls.JS_NO_PATTERN_MATCHED, node_type=value_node.type) return None def 
_infer_js_method_return_type( @@ -124,7 +185,7 @@ def _infer_js_method_return_type( ) -> str | None: parts = method_call.split(cs.SEPARATOR_DOT) if len(parts) != 2: - logger.debug(ls.JS_METHOD_CALL_INVALID.format(method_call=method_call)) + logger.debug(ls.JS_METHOD_CALL_INVALID, method_call=method_call) return None class_name, method_name = parts @@ -132,27 +193,23 @@ def _infer_js_method_return_type( class_qn = self._resolve_js_class_name(class_name, module_qn) if not class_qn: logger.debug( - ls.JS_CLASS_RESOLVE_FAILED.format( - class_name=class_name, module_qn=module_qn - ) + ls.JS_CLASS_RESOLVE_FAILED, class_name=class_name, module_qn=module_qn ) return None - logger.debug( - ls.JS_CLASS_RESOLVED.format(class_name=class_name, class_qn=class_qn) - ) + logger.debug(ls.JS_CLASS_RESOLVED, class_name=class_name, class_qn=class_qn) method_qn = f"{class_qn}{cs.SEPARATOR_DOT}{method_name}" - logger.debug(ls.JS_LOOKING_FOR_METHOD.format(method_qn=method_qn)) + logger.debug(ls.JS_LOOKING_FOR_METHOD, method_qn=method_qn) method_node = self._find_method_ast_node(method_qn) if not method_node: - logger.debug(ls.JS_METHOD_AST_NOT_FOUND.format(method_qn=method_qn)) + logger.debug(ls.JS_METHOD_AST_NOT_FOUND, method_qn=method_qn) return None return_type = self._analyze_return_statements(method_node, method_qn) logger.debug( - ls.JS_RETURN_ANALYZED.format(method_qn=method_qn, return_type=return_type) + ls.JS_RETURN_ANALYZED, method_qn=method_qn, return_type=return_type ) return return_type @@ -180,11 +237,20 @@ def _resolve_js_class_name(self, class_name: str, module_qn: str) -> str | None: return None + def _get_language_obj(self) -> object | None: + if self._queries is None: + return None + for lang in (cs.SupportedLanguage.JS, cs.SupportedLanguage.TS): + lang_queries = self._queries.get(lang) + if lang_queries and "language" in lang_queries: + return lang_queries["language"] + return None + def _analyze_return_statements( self, method_node: ASTNode, method_qn: str ) -> str | 
None: return_nodes: list[ASTNode] = [] - ut.find_return_statements(method_node, return_nodes) + ut.find_return_statements(method_node, return_nodes, self._get_language_obj()) for return_node in return_nodes: for child in return_node.children: diff --git a/codebase_rag/parsers/js_ts/utils.py b/codebase_rag/parsers/js_ts/utils.py index 5049afb0c..752660db7 100644 --- a/codebase_rag/parsers/js_ts/utils.py +++ b/codebase_rag/parsers/js_ts/utils.py @@ -1,9 +1,9 @@ from typing import TYPE_CHECKING -from tree_sitter import Language, Node +from tree_sitter import Language, Node, QueryCursor from ... import constants as cs -from ..utils import safe_decode_text +from ..utils import get_cached_query, safe_decode_text if TYPE_CHECKING: from ...types_defs import LanguageQueries @@ -53,11 +53,26 @@ def find_method_in_class_body(class_body_node: Node, method_name: str) -> Node | return None +_CLASS_BODY_CACHE: dict[tuple[int, str], Node | None] = {} +_CLASS_BODY_CACHE_OWNER: int | None = None + + def find_method_in_ast( root_node: Node, class_name: str, method_name: str ) -> Node | None: - stack: list[Node] = [root_node] + global _CLASS_BODY_CACHE_OWNER + root_id = id(root_node) + if _CLASS_BODY_CACHE_OWNER != root_id: + _CLASS_BODY_CACHE.clear() + _CLASS_BODY_CACHE_OWNER = root_id + cache_key = (root_id, class_name) + if cache_key in _CLASS_BODY_CACHE: + body_node = _CLASS_BODY_CACHE[cache_key] + if body_node is not None: + return find_method_in_class_body(body_node, method_name) + return None + stack: list[Node] = [root_node] while stack: current = stack.pop() @@ -66,23 +81,38 @@ def find_method_in_ast( if name_node and name_node.text: found_class_name = safe_decode_text(name_node) if found_class_name == class_name: - if body_node := current.child_by_field_name(cs.FIELD_BODY): + body_node = current.child_by_field_name(cs.FIELD_BODY) + _CLASS_BODY_CACHE[cache_key] = body_node + if body_node: return find_method_in_class_body(body_node, method_name) + return None 
stack.extend(reversed(current.children)) + _CLASS_BODY_CACHE[cache_key] = None return None -def find_return_statements(node: Node, return_nodes: list[Node]) -> None: - stack: list[Node] = [node] +_JS_RETURN_QUERY = "(return_statement) @return_stmt" + +def find_return_statements( + node: Node, return_nodes: list[Node], language_obj=None +) -> None: + if language_obj is not None: + try: + q = get_cached_query(language_obj, _JS_RETURN_QUERY) + cursor = QueryCursor(q) + captures = cursor.captures(node) + return_nodes.extend(captures.get("return_stmt", [])) + return + except Exception: + pass + stack: list[Node] = [node] while stack: current = stack.pop() - if current.type == cs.TS_RETURN_STATEMENT: return_nodes.append(current) - stack.extend(reversed(current.children)) diff --git a/codebase_rag/parsers/lua/type_inference.py b/codebase_rag/parsers/lua/type_inference.py index 99a5515ba..92b910881 100644 --- a/codebase_rag/parsers/lua/type_inference.py +++ b/codebase_rag/parsers/lua/type_inference.py @@ -14,6 +14,12 @@ class LuaTypeInferenceEngine: + __slots__ = ( + "import_processor", + "function_registry", + "project_name", + ) + def __init__( self, import_processor: ImportProcessor, @@ -36,7 +42,7 @@ def build_local_variable_type_map( self._process_variable_declaration(current, module_qn, local_var_types) stack.extend(reversed(current.children)) - logger.debug(ls.LUA_VAR_TYPE_MAP_BUILT.format(count=len(local_var_types))) + logger.debug(ls.LUA_VAR_TYPE_MAP_BUILT, count=len(local_var_types)) return local_var_types def _process_variable_declaration( @@ -62,9 +68,7 @@ def _process_variable_declaration( func_calls[i], module_qn ): local_var_types[var_name] = var_type - logger.debug( - ls.LUA_VAR_INFERRED.format(var_name=var_name, var_type=var_type) - ) + logger.debug(ls.LUA_VAR_INFERRED, var_name=var_name, var_type=var_type) def _extract_var_names(self, assignment: TreeSitterNodeProtocol) -> list[str]: names: list[str] = [] @@ -110,11 +114,10 @@ def 
_infer_lua_variable_type_from_value( class_name, module_qn ): logger.debug( - ls.LUA_TYPE_INFERENCE_RETURN.format( - class_name=class_name, - method_name=method_name, - class_qn=class_qn, - ) + ls.LUA_TYPE_INFERENCE_RETURN, + class_name=class_name, + method_name=method_name, + class_qn=class_qn, ) return class_qn diff --git a/codebase_rag/parsers/py/ast_analyzer.py b/codebase_rag/parsers/py/ast_analyzer.py index ec663db4f..9aea42fc2 100644 --- a/codebase_rag/parsers/py/ast_analyzer.py +++ b/codebase_rag/parsers/py/ast_analyzer.py @@ -10,7 +10,14 @@ from ... import logs as lg from ...types_defs import LanguageQueries from ..js_ts.utils import find_method_in_ast as find_js_method_in_ast -from ..utils import safe_decode_text +from ..utils import get_cached_query, safe_decode_text, sorted_captures + +_PY_TRAVERSE_QUERY = ( + f"({cs.TS_PY_ASSIGNMENT}) @assignment " + f"({cs.TS_PY_LIST_COMPREHENSION}) @comprehension " + f"({cs.TS_PY_FOR_STATEMENT}) @for_stmt " + f"({cs.TS_PY_RETURN_STATEMENT}) @return_stmt" +) if TYPE_CHECKING: from collections.abc import Callable @@ -45,6 +52,7 @@ def _infer_instance_variable_types_from_assignments( class PythonAstAnalyzerMixin(_AstBase): + __slots__ = () queries: dict[cs.SupportedLanguage, LanguageQueries] module_qn_to_file_path: dict[str, Path] ast_cache: ASTCacheProtocol @@ -72,6 +80,8 @@ def _infer_method_call_return_type( @abstractmethod def _find_class_in_scope(self, class_name: str, module_qn: str) -> str | None: ... 
+ _return_stmt_cache: dict[int, list[Node]] + def _traverse_single_pass( self, node: Node, local_var_types: dict[str, str], module_qn: str ) -> None: @@ -79,19 +89,35 @@ def _traverse_single_pass( comprehensions: list[Node] = [] for_statements: list[Node] = [] - stack: list[Node] = [node] - while stack: - current = stack.pop() - node_type = current.type - - if node_type == cs.TS_PY_ASSIGNMENT: - assignments.append(current) - elif node_type == cs.TS_PY_LIST_COMPREHENSION: - comprehensions.append(current) - elif node_type == cs.TS_PY_FOR_STATEMENT: - for_statements.append(current) - - stack.extend(reversed(current.children)) + py_lang_queries = self.queries.get(cs.SupportedLanguage.PYTHON) + py_lang_obj = py_lang_queries["language"] if py_lang_queries else None + if py_lang_obj is not None: + try: + q = get_cached_query(py_lang_obj, _PY_TRAVERSE_QUERY) + cursor = QueryCursor(q) + captures = cursor.captures(node) + assignments = captures.get("assignment", []) + comprehensions = captures.get("comprehension", []) + for_statements = captures.get("for_stmt", []) + if return_stmts := captures.get("return_stmt"): + self._return_stmt_cache[id(node)] = return_stmts + except Exception: + py_lang_obj = None + + if py_lang_obj is None: + stack: list[Node] = [node] + while stack: + current = stack.pop() + node_type = current.type + + if node_type == cs.TS_PY_ASSIGNMENT: + assignments.append(current) + elif node_type == cs.TS_PY_LIST_COMPREHENSION: + comprehensions.append(current) + elif node_type == cs.TS_PY_FOR_STATEMENT: + for_statements.append(current) + + stack.extend(reversed(current.children)) for assignment in assignments: self._process_assignment_simple(assignment, local_var_types, module_qn) @@ -140,7 +166,7 @@ def _process_assignment_simple( right_node, module_qn ): local_var_types[var_name] = inferred_type - logger.debug(lg.PY_TYPE_SIMPLE.format(var=var_name, type=inferred_type)) + logger.debug(lg.PY_TYPE_SIMPLE, var=var_name, type=inferred_type) def 
_process_assignment_complex( self, assignment_node: Node, local_var_types: dict[str, str], module_qn: str @@ -162,7 +188,7 @@ def _process_assignment_complex( right_node, module_qn, local_var_types ): local_var_types[var_name] = inferred_type - logger.debug(lg.PY_TYPE_COMPLEX.format(var=var_name, type=inferred_type)) + logger.debug(lg.PY_TYPE_COMPLEX, var=var_name, type=inferred_type) def _extract_assignment_variable_name(self, node: Node) -> str | None: if node.type != cs.TS_PY_IDENTIFIER or node.text is None: @@ -202,6 +228,32 @@ def _find_method_in_ast( case _: return None + def _find_class_node(self, class_qn: str) -> Node | None: + # (H) Locate a class definition node from its qualified name so cross-class + # (H) attribute/property types can be read when resolving chained calls. + module_qn, _, class_name = class_qn.rpartition(cs.SEPARATOR_DOT) + if not module_qn: + return None + file_path = self.module_qn_to_file_path.get(module_qn) + if not file_path or file_path not in self.ast_cache: + return None + root_node, language = self.ast_cache[file_path] + if language != cs.SupportedLanguage.PYTHON: + return None + lang_queries = self.queries[cs.SupportedLanguage.PYTHON] + class_query = lang_queries[cs.QUERY_KEY_CLASSES] + if not class_query: + return None + cursor = QueryCursor(class_query) + captures = sorted_captures(cursor, root_node) + for class_node in captures.get(cs.QUERY_CAPTURE_CLASS, []): + if not isinstance(class_node, Node): + continue + name_node = class_node.child_by_field_name(cs.TS_FIELD_NAME) + if name_node and safe_decode_text(name_node) == class_name: + return class_node + return None + def _find_python_method_in_ast( self, root_node: Node, class_name: str, method_name: str ) -> Node | None: @@ -210,7 +262,7 @@ def _find_python_method_in_ast( if not class_query: return None cursor = QueryCursor(class_query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) method_query = 
lang_queries[cs.QUERY_KEY_FUNCTIONS] if not method_query: @@ -232,7 +284,7 @@ def _find_python_method_in_ast( continue method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) + method_captures = sorted_captures(method_cursor, body_node) for method_node in method_captures.get(cs.QUERY_CAPTURE_FUNCTION, []): if not isinstance(method_node, Node): @@ -272,13 +324,26 @@ def _analyze_method_return_statements( return None def _find_return_statements(self, node: Node, return_nodes: list[Node]) -> None: + cached = self._return_stmt_cache.get(id(node)) + if cached is not None: + return_nodes.extend(cached) + return + py_lang_queries = self.queries.get(cs.SupportedLanguage.PYTHON) + py_lang_obj = py_lang_queries["language"] if py_lang_queries else None + if py_lang_obj is not None: + try: + q = get_cached_query(py_lang_obj, cs.PY_RETURN_QUERY) + cursor = QueryCursor(q) + captures = cursor.captures(node) + return_nodes.extend(captures.get("return_stmt", [])) + return + except Exception: + pass stack: list[Node] = [node] - while stack: current = stack.pop() if current.type == cs.TS_PY_RETURN_STATEMENT: return_nodes.append(current) - stack.extend(reversed(current.children)) def _analyze_return_expression(self, expr_node: Node, method_qn: str) -> str | None: @@ -344,13 +409,11 @@ def _analyze_identifier_return(self, expr_node: Node, method_qn: str) -> str | N local_vars = self.build_local_variable_type_map(method_node, module_qn) if identifier in local_vars: logger.debug( - lg.PY_VAR_FROM_CONTEXT.format( - var=identifier, type=local_vars[identifier] - ) + lg.PY_VAR_FROM_CONTEXT, var=identifier, type=local_vars[identifier] ) return local_vars[identifier] - logger.debug(lg.PY_VAR_CANNOT_INFER.format(var=identifier)) + logger.debug(lg.PY_VAR_CANNOT_INFER, var=identifier) return None def _analyze_attribute_return(self, expr_node: Node, method_qn: str) -> str | None: diff --git a/codebase_rag/parsers/py/expression_analyzer.py 
b/codebase_rag/parsers/py/expression_analyzer.py index 81e0c28a2..73c159159 100644 --- a/codebase_rag/parsers/py/expression_analyzer.py +++ b/codebase_rag/parsers/py/expression_analyzer.py @@ -40,6 +40,7 @@ def _analyze_method_return_statements( class PythonExpressionAnalyzerMixin(_ExprBase): + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol simple_name_lookup: SimpleNameLookup @@ -47,6 +48,7 @@ class PythonExpressionAnalyzerMixin(_ExprBase): ast_cache: ASTCacheProtocol _method_return_type_cache: dict[str, str | None] + _self_assignment_cache: dict[tuple[int, str], dict[str, str] | None] def _infer_type_from_expression(self, node: Node, module_qn: str) -> str | None: if node.type == cs.TS_PY_CALL: @@ -243,7 +245,7 @@ def _infer_method_return_type( return self._analyze_method_return_statements(method_node, method_qn) return None except Exception as e: - logger.debug(lg.PY_INFER_RETURN_FAILED.format(method=method_call, error=e)) + logger.debug(lg.PY_INFER_RETURN_FAILED, method=method_call, error=e) return None def _resolve_method_qualified_name( @@ -305,11 +307,10 @@ def _resolve_class_method( for qn in self.simple_name_lookup.get(class_name, []): if result := self._try_resolve_method(qn, method_name): logger.debug( - lg.PY_RESOLVED_METHOD.format( - class_name=class_name, - method_name=method_name, - method_qn=result, - ) + lg.PY_RESOLVED_METHOD, + class_name=class_name, + method_name=method_name, + method_qn=result, ) return result @@ -348,14 +349,22 @@ def _try_infer_from_self_assignments( if language != cs.SupportedLanguage.PYTHON: return None - instance_vars: dict[str, str] = {} - self._analyze_self_assignments(root_node, instance_vars, module_qn) + cache_key = (id(root_node), module_qn) + if cache_key in self._self_assignment_cache: + instance_vars = self._self_assignment_cache[cache_key] + else: + instance_vars = {} + self._analyze_self_assignments(root_node, instance_vars, module_qn) + 
self._self_assignment_cache[cache_key] = instance_vars or None + + if not instance_vars: + return None full_attr_name = f"{cs.PY_SELF_PREFIX}{attribute_name}" return instance_vars.get(full_attr_name) except Exception as e: - logger.debug(lg.PY_INFER_ATTR_FAILED.format(attr=attribute_name, error=e)) + logger.debug(lg.PY_INFER_ATTR_FAILED, attr=attribute_name, error=e) return None def _find_class_in_scope(self, class_name: str, module_qn: str) -> str | None: diff --git a/codebase_rag/parsers/py/type_inference.py b/codebase_rag/parsers/py/type_inference.py index 5908ee76a..ca9b9601a 100644 --- a/codebase_rag/parsers/py/type_inference.py +++ b/codebase_rag/parsers/py/type_inference.py @@ -30,6 +30,25 @@ class PythonTypeInferenceEngine( PythonAstAnalyzerMixin, PythonVariableAnalyzerMixin, ): + __slots__ = ( + "import_processor", + "function_registry", + "repo_path", + "project_name", + "ast_cache", + "queries", + "module_qn_to_file_path", + "class_inheritance", + "simple_name_lookup", + "_js_type_inference_getter", + "_method_return_type_cache", + "_type_inference_in_progress", + "_available_classes_cache", + "_return_stmt_cache", + "_self_assignment_cache", + "_class_member_type_cache", + ) + def __init__( self, import_processor: ImportProcessor, @@ -56,6 +75,10 @@ def __init__( self._method_return_type_cache: dict[str, str | None] = {} self._type_inference_in_progress: set[str] = set() + self._available_classes_cache: dict[str, list[str]] = {} + self._return_stmt_cache: dict[int, list] = {} + self._self_assignment_cache: dict[tuple[int, str], dict[str, str] | None] = {} + self._class_member_type_cache: dict[str, dict[str, str]] = {} def build_local_variable_type_map( self, caller_node: Node, module_qn: str @@ -66,8 +89,15 @@ def build_local_variable_type_map( self._infer_parameter_types(caller_node, local_var_types, module_qn) # (H) Single-pass traversal avoids O(5*N) multiple traversals for type inference. 
self._traverse_single_pass(caller_node, local_var_types, module_qn) + self._infer_instance_attributes_from_init( + caller_node, local_var_types, module_qn + ) + self._infer_property_return_types(caller_node, local_var_types, module_qn) + self._infer_class_annotation_types(caller_node, local_var_types, module_qn) + aliases = self._collect_local_aliases(caller_node) + self._expand_chained_attribute_types(local_var_types, module_qn, aliases) except Exception as e: - logger.debug(lg.PY_BUILD_VAR_MAP_FAILED.format(error=e)) + logger.debug(lg.PY_BUILD_VAR_MAP_FAILED, error=e) return local_var_types diff --git a/codebase_rag/parsers/py/variable_analyzer.py b/codebase_rag/parsers/py/variable_analyzer.py index 9a49f9a27..d0fe47220 100644 --- a/codebase_rag/parsers/py/variable_analyzer.py +++ b/codebase_rag/parsers/py/variable_analyzer.py @@ -3,12 +3,14 @@ from typing import TYPE_CHECKING, Protocol from loguru import logger +from tree_sitter import QueryCursor from ... import constants as cs from ... import logs as lg from ...types_defs import ASTNode, FunctionRegistryTrieProtocol, NodeType from ..import_processor import ImportProcessor -from ..utils import safe_decode_text +from ..utils import get_cached_query, safe_decode_text +from .utils import resolve_class_name if TYPE_CHECKING: @@ -17,14 +19,20 @@ def _infer_type_from_expression( self, node: ASTNode, module_qn: str ) -> str | None: ... + def _find_class_node(self, class_qn: str) -> ASTNode | None: ... 
+ _VarBase: type = _VariableAnalyzerDeps else: _VarBase = object class PythonVariableAnalyzerMixin(_VarBase): + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol + queries: dict[cs.SupportedLanguage, object] + _available_classes_cache: dict[str, list[str]] + _class_member_type_cache: dict[str, dict[str, str]] def _infer_parameter_types( self, caller_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -61,9 +69,7 @@ def _process_untyped_parameter( ): return local_var_types[param_name] = inferred_type - logger.debug( - lg.PY_PARAM_TYPE_INFERRED.format(param=param_name, type=inferred_type) - ) + logger.debug(lg.PY_PARAM_TYPE_INFERRED, param=param_name, type=inferred_type) def _process_typed_parameter( self, param: ASTNode, local_var_types: dict[str, str] @@ -102,14 +108,14 @@ def _process_typed_default_parameter( def _infer_type_from_parameter_name( self, param_name: str, module_qn: str ) -> str | None: - logger.debug( - lg.PY_TYPE_INFER_ATTEMPT.format(param=param_name, module=module_qn) - ) + logger.debug(lg.PY_TYPE_INFER_ATTEMPT, param=param_name, module=module_qn) available_class_names = self._collect_available_classes(module_qn) - logger.debug(lg.PY_AVAILABLE_CLASSES.format(classes=available_class_names)) + logger.debug(lg.PY_AVAILABLE_CLASSES, classes=available_class_names) return self._find_best_class_match(param_name, available_class_names) def _collect_available_classes(self, module_qn: str) -> list[str]: + if module_qn in self._available_classes_cache: + return self._available_classes_cache[module_qn] available_class_names: list[str] = [] for qn, node_type in self.function_registry.find_with_prefix(module_qn): if node_type != NodeType.CLASS: @@ -118,6 +124,7 @@ def _collect_available_classes(self, module_qn: str) -> list[str]: available_class_names.append(qn.split(cs.SEPARATOR_DOT)[-1]) if module_qn not in self.import_processor.import_mapping: + self._available_classes_cache[module_qn] = 
available_class_names return available_class_names for local_name, imported_qn in self.import_processor.import_mapping[ @@ -126,6 +133,7 @@ def _collect_available_classes(self, module_qn: str) -> list[str]: if self.function_registry.get(imported_qn) == NodeType.CLASS: available_class_names.append(local_name) + self._available_classes_cache[module_qn] = available_class_names return available_class_names def _find_best_class_match( @@ -142,9 +150,7 @@ def _find_best_class_match( best_match = class_name logger.debug( - lg.PY_BEST_MATCH.format( - param=param_name, match=best_match, score=highest_score - ) + lg.PY_BEST_MATCH, param=param_name, match=best_match, score=highest_score ) return best_match @@ -195,9 +201,7 @@ def _infer_loop_var_from_iterable( right_node, local_var_types, module_qn ): local_var_types[loop_var] = element_type - logger.debug( - lg.PY_LOOP_VAR_INFERRED.format(var=loop_var, type=element_type) - ) + logger.debug(lg.PY_LOOP_VAR_INFERRED, var=loop_var, type=element_type) def _infer_iterable_element_type( self, iterable_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -250,27 +254,286 @@ def _process_self_assignment( and (attr_name := left_text.decode(cs.ENCODING_UTF8)).startswith( cs.PY_SELF_PREFIX ) - and ( - assigned_type := self._infer_type_from_expression(right_node, module_qn) - ) ): return + assigned_type = self._infer_type_from_expression(right_node, module_qn) + if not assigned_type and right_node.type == cs.TS_PY_IDENTIFIER: + # (H) self.x = param: a bare identifier carries the type of the matching + # (H) (already-seeded) parameter or local, so flow it onto the attribute. 
+ ident = safe_decode_text(right_node) + assigned_type = local_var_types.get(ident) if ident else None + if not assigned_type: + return local_var_types[attr_name] = assigned_type - logger.debug( - lg.PY_INSTANCE_VAR_INFERRED.format(attr=attr_name, type=assigned_type) - ) + logger.debug(lg.PY_INSTANCE_VAR_INFERRED, attr=attr_name, type=assigned_type) def _analyze_self_assignments( self, node: ASTNode, local_var_types: dict[str, str], module_qn: str ) -> None: + py_lang_queries = self.queries.get(cs.SupportedLanguage.PYTHON) + py_lang_obj = py_lang_queries["language"] if py_lang_queries else None + if py_lang_obj is not None: + try: + q = get_cached_query(py_lang_obj, cs.PY_ASSIGNMENT_QUERY) + cursor = QueryCursor(q) + captures = cursor.captures(node) + for assign_node in captures.get("assignment", []): + self._process_self_assignment( + assign_node, local_var_types, module_qn + ) + return + except Exception: + pass stack: list[ASTNode] = [node] - while stack: current = stack.pop() if current.type == cs.TS_PY_ASSIGNMENT: self._process_self_assignment(current, local_var_types, module_qn) stack.extend(reversed(current.children)) + def _enclosing_class_node(self, node: ASTNode) -> ASTNode | None: + current = node.parent + while current is not None: + if current.type == cs.TS_PY_CLASS_DEFINITION: + return current + current = current.parent + return None + + def _find_init_method_node(self, class_node: ASTNode) -> ASTNode | None: + body = class_node.child_by_field_name(cs.FIELD_BODY) + if body is None: + return None + for child in body.children: + if child.type == cs.TS_PY_DECORATED_DEFINITION: + func = next( + ( + c + for c in child.children + if c.type == cs.TS_PY_FUNCTION_DEFINITION + ), + None, + ) + elif child.type == cs.TS_PY_FUNCTION_DEFINITION: + func = child + else: + continue + if func is None: + continue + name_node = func.child_by_field_name(cs.FIELD_NAME) + if ( + name_node + and (text := name_node.text) + and text.decode(cs.ENCODING_UTF8) == 
cs.PY_METHOD_INIT + ): + return func + return None + + def _infer_instance_attributes_from_init( + self, caller_node: ASTNode, local_var_types: dict[str, str], module_qn: str + ) -> None: + # (H) Instance attributes are assigned in __init__ (self.x = T()), so a method + # (H) that only reads self.x has no local assignment to infer from. Scan the + # (H) enclosing class's __init__ and seed the attribute types, letting any + # (H) reassignment in the calling method itself take precedence (setdefault). + if (class_node := self._enclosing_class_node(caller_node)) is None: + return + init_node = self._find_init_method_node(class_node) + if init_node is None or init_node is caller_node: + return + init_types: dict[str, str] = {} + # (H) Seed __init__ parameter types first so self.x = param flows the + # (H) parameter annotation onto the attribute. + self._infer_parameter_types(init_node, init_types, module_qn) + self._analyze_self_assignments(init_node, init_types, module_qn) + for attr, attr_type in init_types.items(): + if attr.startswith(cs.PY_SELF_PREFIX): + local_var_types.setdefault(attr, attr_type) + + def _has_property_decorator(self, decorated_node: ASTNode) -> bool: + for child in decorated_node.children: + if child.type == cs.TS_PY_DECORATOR and (text := child.text): + tail = ( + text.decode(cs.ENCODING_UTF8) + .lstrip(cs.DECORATOR_AT) + .split(cs.SEPARATOR_DOT)[-1] + ) + if tail in cs.PROPERTY_DECORATORS: + return True + return False + + def _infer_property_return_types( + self, caller_node: ASTNode, local_var_types: dict[str, str], module_qn: str + ) -> None: + # (H) self.prop where prop is an @property has the property's declared return + # (H) type, so a chained call self.prop.method() can resolve against the + # (H) returned class rather than an ambiguous same-named method elsewhere. 
+ if (class_node := self._enclosing_class_node(caller_node)) is None: + return + self._collect_property_return_types(class_node, local_var_types) + + def _collect_property_return_types( + self, class_node: ASTNode, out: dict[str, str] + ) -> None: + body = class_node.child_by_field_name(cs.FIELD_BODY) + if body is None: + return + for child in body.children: + if child.type != cs.TS_PY_DECORATED_DEFINITION: + continue + if not self._has_property_decorator(child): + continue + func = next( + (c for c in child.children if c.type == cs.TS_PY_FUNCTION_DEFINITION), + None, + ) + if func is None: + continue + name_node = func.child_by_field_name(cs.FIELD_NAME) + return_node = func.child_by_field_name(cs.FIELD_RETURN_TYPE) + if not ( + name_node + and (name_text := name_node.text) + and return_node + and (return_text := return_node.text) + ): + continue + # (H) The return_type field wraps a type node; only a bare class name (not + # (H) a union, subscripted generic, or string forward ref) seeds a type. + return_type = return_text.decode(cs.ENCODING_UTF8) + if return_type.isidentifier(): + out.setdefault( + f"{cs.PY_SELF_PREFIX}{name_text.decode(cs.ENCODING_UTF8)}", + return_type, + ) + + def _infer_class_annotation_types( + self, caller_node: ASTNode, local_var_types: dict[str, str], module_qn: str + ) -> None: + # (H) A class-level annotation (_handler: LanguageHandler) declares the type of + # (H) an instance attribute even when it is assigned from a factory call whose + # (H) return type cannot be inferred, so seed self. from the annotation. 
+ if (class_node := self._enclosing_class_node(caller_node)) is None: + return + self._collect_class_annotation_types(class_node, local_var_types) + + def _collect_class_annotation_types( + self, class_node: ASTNode, out: dict[str, str] + ) -> None: + body = class_node.child_by_field_name(cs.FIELD_BODY) + if body is None: + return + for child in body.children: + if child.type != cs.TS_PY_EXPRESSION_STATEMENT: + continue + assignment = child.children[0] if child.children else None + if assignment is None or assignment.type != cs.TS_PY_ASSIGNMENT: + continue + left_node = assignment.child_by_field_name(cs.TS_FIELD_LEFT) + type_node = assignment.child_by_field_name(cs.TS_FIELD_TYPE) + if not ( + left_node + and left_node.type == cs.TS_PY_IDENTIFIER + and type_node + and (name := safe_decode_text(left_node)) + and (type_text := safe_decode_text(type_node)) + and type_text.isidentifier() + ): + continue + out.setdefault(f"{cs.PY_SELF_PREFIX}{name}", type_text) + + def _expand_chained_attribute_types( + self, + local_var_types: dict[str, str], + module_qn: str, + aliases: dict[str, str] | None = None, + max_depth: int = 4, + ) -> None: + # (H) A chained reference a.b.c needs the type of a.b (member b on a's class). + # (H) Each pass: (1) propagate local aliases (x = ref) from the referent's type, + # (H) then (2) for every typed ref, seed ref.member -> member type (full QN), so + # (H) deeper chains and aliases resolve on the next pass until a fixpoint. 
+ aliases = aliases or {} + for _ in range(max_depth): + added = False + for local, referent in aliases.items(): + if local not in local_var_types and ( + referent_type := local_var_types.get(referent) + ): + local_var_types[local] = referent_type + added = True + for ref, type_name in list(local_var_types.items()): + class_qn = self._class_qn_of_type(type_name, module_qn) + if not class_qn: + continue + for member, member_type in self._class_member_types_by_qn( + class_qn + ).items(): + key = f"{ref}{cs.SEPARATOR_DOT}{member}" + if key not in local_var_types: + local_var_types[key] = member_type + added = True + if not added: + break + + def _collect_local_aliases(self, caller_node: ASTNode) -> dict[str, str]: + # (H) Record local-variable aliases (resolver = self._resolver) where the rhs is + # (H) a plain name/attribute reference, so its type can be propagated. Skip + # (H) nested scopes and any rhs that is a call/subscript/other expression. + aliases: dict[str, str] = {} + boundary = (cs.TS_PY_FUNCTION_DEFINITION, cs.TS_PY_CLASS_DEFINITION) + stack: list[ASTNode] = list(caller_node.children) + while stack: + node = stack.pop() + if node.type in boundary: + continue + if node.type == cs.TS_PY_ASSIGNMENT: + left = node.child_by_field_name(cs.TS_FIELD_LEFT) + right = node.child_by_field_name(cs.TS_FIELD_RIGHT) + if ( + left is not None + and left.type == cs.TS_PY_IDENTIFIER + and right is not None + and right.type in (cs.TS_PY_IDENTIFIER, cs.TS_PY_ATTRIBUTE) + and (local := safe_decode_text(left)) + and (referent := safe_decode_text(right)) + and local not in aliases + ): + aliases[local] = referent + stack.extend(node.children) + return aliases + + def _class_qn_of_type(self, type_name: str, module_qn: str) -> str | None: + if cs.SEPARATOR_DOT in type_name: + return type_name + return resolve_class_name( + type_name, module_qn, self.import_processor, self.function_registry + ) + + def _class_member_types_by_qn(self, class_qn: str) -> dict[str, str]: + if class_qn 
in self._class_member_type_cache: + return self._class_member_type_cache[class_qn] + members: dict[str, str] = {} + class_node = self._find_class_node(class_qn) + if class_node is not None: + class_module_qn = class_qn.rpartition(cs.SEPARATOR_DOT)[0] + raw: dict[str, str] = {} + self._collect_property_return_types(class_node, raw) + self._collect_class_annotation_types(class_node, raw) + if (init_node := self._find_init_method_node(class_node)) is not None: + init_types: dict[str, str] = {} + self._infer_parameter_types(init_node, init_types, class_module_qn) + self._analyze_self_assignments(init_node, init_types, class_module_qn) + for attr, attr_type in init_types.items(): + raw.setdefault(attr, attr_type) + for attr, attr_type in raw.items(): + if not attr.startswith(cs.PY_SELF_PREFIX): + continue + member = attr[len(cs.PY_SELF_PREFIX) :] + resolved = self._class_qn_of_type(attr_type, class_module_qn) + members[member] = resolved or attr_type + self._class_member_type_cache[class_qn] = members + return members + def _infer_variable_element_type( self, var_name: str, local_var_types: dict[str, str], module_qn: str ) -> str | None: diff --git a/codebase_rag/parsers/rs/utils.py b/codebase_rag/parsers/rs/utils.py index 64cc84cf6..99743e758 100644 --- a/codebase_rag/parsers/rs/utils.py +++ b/codebase_rag/parsers/rs/utils.py @@ -137,12 +137,9 @@ def _process_scoped_use_list( _process_use_tree(child, final_base, imports) -def extract_impl_target(impl_node: Node) -> str | None: - if impl_node.type != cs.TS_IMPL_ITEM: - return None - +def _impl_field_type_name(impl_node: Node, field: str) -> str | None: for i in range(impl_node.child_count): - if impl_node.field_name_for_child(i) == cs.FIELD_TYPE: + if impl_node.field_name_for_child(i) == field: type_node = impl_node.child(i) if type_node is None: continue @@ -151,7 +148,7 @@ def extract_impl_target(impl_node: Node) -> str | None: for child in type_node.children: if child.type == cs.TS_TYPE_IDENTIFIER: return 
safe_decode_text(child) - case cs.TS_TYPE_IDENTIFIER: + case cs.TS_TYPE_IDENTIFIER | cs.TS_RS_PRIMITIVE_TYPE: return safe_decode_text(type_node) case cs.TS_RS_SCOPED_TYPE_IDENTIFIER: for child in type_node.children: @@ -162,6 +159,20 @@ def extract_impl_target(impl_node: Node) -> str | None: return None +def extract_impl_target(impl_node: Node) -> str | None: + if impl_node.type != cs.TS_IMPL_ITEM: + return None + return _impl_field_type_name(impl_node, cs.FIELD_TYPE) + + +def extract_impl_trait(impl_node: Node) -> str | None: + # (H) The `trait` field of `impl Trait for Type` -> the implemented trait's + # (H) simple name (a trait impl means Type IMPLEMENTS Trait). + if impl_node.type != cs.TS_IMPL_ITEM: + return None + return _impl_field_type_name(impl_node, cs.FIELD_TRAIT) + + def extract_use_imports(use_node: Node) -> dict[str, str]: if use_node.type != cs.TS_USE_DECLARATION: return {} diff --git a/codebase_rag/parsers/stdlib_extractor.py b/codebase_rag/parsers/stdlib_extractor.py index fbcbddd4c..52fc5d219 100644 --- a/codebase_rag/parsers/stdlib_extractor.py +++ b/codebase_rag/parsers/stdlib_extractor.py @@ -42,7 +42,7 @@ def _is_tool_available(tool_name: str) -> bool: subprocess.CalledProcessError, ): _EXTERNAL_TOOLS[tool_name] = False - logger.debug(ls.IMP_TOOL_NOT_AVAILABLE.format(tool=tool_name)) + logger.debug(ls.IMP_TOOL_NOT_AVAILABLE, tool=tool_name) return False @@ -77,9 +77,9 @@ def load_persistent_cache() -> None: data = json.load(f) _STDLIB_CACHE.update(data.get(cs.IMPORT_CACHE_KEY, {})) _CACHE_TIMESTAMPS.update(data.get(cs.IMPORT_TIMESTAMPS_KEY, {})) - logger.debug(ls.IMP_CACHE_LOADED.format(path=cache_file)) + logger.debug(ls.IMP_CACHE_LOADED, path=cache_file) except (json.JSONDecodeError, OSError) as e: - logger.debug(ls.IMP_CACHE_LOAD_ERROR.format(error=e)) + logger.debug(ls.IMP_CACHE_LOAD_ERROR, error=e) def save_persistent_cache() -> None: @@ -97,9 +97,9 @@ def save_persistent_cache() -> None: f, indent=2, ) - 
logger.debug(ls.IMP_CACHE_SAVED.format(path=cache_file)) + logger.debug(ls.IMP_CACHE_SAVED, path=cache_file) except OSError as e: - logger.debug(ls.IMP_CACHE_SAVE_ERROR.format(error=e)) + logger.debug(ls.IMP_CACHE_SAVE_ERROR, error=e) def flush_stdlib_cache() -> None: @@ -115,7 +115,7 @@ def clear_stdlib_cache() -> None: cache_file.unlink() logger.debug(ls.IMP_CACHE_CLEARED) except OSError as e: - logger.debug(ls.IMP_CACHE_CLEAR_ERROR.format(error=e)) + logger.debug(ls.IMP_CACHE_CLEAR_ERROR, error=e) def get_stdlib_cache_stats() -> StdlibCacheStats: @@ -130,6 +130,8 @@ def get_stdlib_cache_stats() -> StdlibCacheStats: class StdlibExtractor: + __slots__ = ("function_registry", "repo_path", "project_name") + def __init__( self, function_registry: FunctionRegistryTrieProtocol | None = None, @@ -248,7 +250,7 @@ def _resolve_python_entity_module_path( result = ( cs.SEPARATOR_DOT.join(parts[:-1]) - if entity_name[0].isupper() + if entity_name[:1].isupper() else full_qualified_name ) _cache_stdlib_result(cs.SupportedLanguage.PYTHON, full_qualified_name, result) @@ -330,15 +332,16 @@ def _resolve_js_entity_module_path( ): pass - result = ( - cs.SEPARATOR_DOT.join(parts[:-1]) - if entity_name[0].isupper() - else full_qualified_name - ) + result = cs.SEPARATOR_DOT.join(parts[:-1]) _cache_stdlib_result(cs.SupportedLanguage.JS, full_qualified_name, result) return result def _extract_go_stdlib_path(self, full_qualified_name: str) -> str: + if cached := _get_cached_stdlib_result( + cs.SupportedLanguage.GO, full_qualified_name + ): + return cached + parts = full_qualified_name.split(cs.SEPARATOR_SLASH) if len(parts) >= 2: try: @@ -453,6 +456,11 @@ def _extract_go_stdlib_path(self, full_qualified_name: str) -> str: if proc.returncode == 0: data = json.loads(stdout.strip()) if data[cs.JSON_KEY_HAS_ENTITY]: + _cache_stdlib_result( + cs.SupportedLanguage.GO, + full_qualified_name, + package_path, + ) return package_path except ( @@ -464,219 +472,112 @@ def 
_extract_go_stdlib_path(self, full_qualified_name: str) -> str: pass entity_name = parts[-1] - if entity_name[0].isupper(): - return cs.SEPARATOR_SLASH.join(parts[:-1]) + if entity_name[:1].isupper(): + result = cs.SEPARATOR_SLASH.join(parts[:-1]) + _cache_stdlib_result( + cs.SupportedLanguage.GO, full_qualified_name, result + ) + return result + _cache_stdlib_result( + cs.SupportedLanguage.GO, full_qualified_name, full_qualified_name + ) return full_qualified_name def _extract_rust_stdlib_path(self, full_qualified_name: str) -> str: + if cached := _get_cached_stdlib_result( + cs.SupportedLanguage.RUST, full_qualified_name + ): + return cached + parts = full_qualified_name.split(cs.SEPARATOR_DOUBLE_COLON) if len(parts) >= 2: entity_name = parts[-1] if ( - entity_name[0].isupper() + entity_name[:1].isupper() or entity_name.isupper() or (cs.CHAR_UNDERSCORE not in entity_name and entity_name.islower()) ): - return cs.SEPARATOR_DOUBLE_COLON.join(parts[:-1]) + result = cs.SEPARATOR_DOUBLE_COLON.join(parts[:-1]) + _cache_stdlib_result( + cs.SupportedLanguage.RUST, full_qualified_name, result + ) + return result + _cache_stdlib_result( + cs.SupportedLanguage.RUST, full_qualified_name, full_qualified_name + ) return full_qualified_name def _extract_cpp_stdlib_path(self, full_qualified_name: str) -> str: + if cached := _get_cached_stdlib_result( + cs.SupportedLanguage.CPP, full_qualified_name + ): + return cached + parts = full_qualified_name.split(cs.SEPARATOR_DOUBLE_COLON) if len(parts) >= 2: namespace = parts[0] if namespace == cs.CPP_STD_NAMESPACE: entity_name = parts[-1] - - try: - import os - import subprocess - import tempfile - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".txt", delete=False - ) as f: - f.write(entity_name) - entity_file = f.name - - try: - cpp_template_program = f""" -#include -#include -#include - -int main() {{ - std::ifstream file("{entity_file}"); - std::string entity_name; - std::getline(file, entity_name); - file.close(); - - // 
This is a compile-time check strategy - we can't dynamically construct templates - // Fall back to heuristic approach for safety - std::cout << "heuristic_check" << std::endl; - return 0; -}} - """ - - subprocess.run( - ["g++", "-std=c++17", "-x", "c++", "-", "-o", "/dev/null"], - check=False, - input=cpp_template_program, - capture_output=True, - text=True, - timeout=5, - ) - - finally: - os.unlink(entity_file) - - except ( - subprocess.TimeoutExpired, - subprocess.CalledProcessError, - OSError, - ): - pass - - entity_name = parts[-1] if ( - entity_name[0].isupper() + entity_name[:1].isupper() or entity_name.startswith(cs.CPP_PREFIX_IS) or entity_name.startswith(cs.CPP_PREFIX_HAS) or entity_name in cs.CPP_STDLIB_ENTITIES ): - return cs.SEPARATOR_DOUBLE_COLON.join(parts[:-1]) + result = cs.SEPARATOR_DOUBLE_COLON.join(parts[:-1]) + _cache_stdlib_result( + cs.SupportedLanguage.CPP, full_qualified_name, result + ) + return result + _cache_stdlib_result( + cs.SupportedLanguage.CPP, full_qualified_name, full_qualified_name + ) return full_qualified_name def _extract_java_stdlib_path(self, full_qualified_name: str) -> str: + cached_result = _get_cached_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name + ) + if cached_result is not None: + return cached_result + parts = full_qualified_name.split(cs.SEPARATOR_DOT) if len(parts) >= 2: - try: - import os - import subprocess - import tempfile - - package_name = cs.SEPARATOR_DOT.join(parts[:-1]) - entity_name = parts[-1] - - java_program = """ -import java.lang.reflect.*; - -public class StdlibCheck { - public static void main(String[] args) { - if (args.length < 2) { - System.out.println("{\\"hasEntity\\": false}"); - return; - } - - String packageName = args[0]; - String entityName = args[1]; - - try { - Class clazz = Class.forName(packageName + "." 
+ entityName); - System.out.println("{\\"hasEntity\\": true, \\"entityType\\": \\"class\\"}"); - } catch (ClassNotFoundException e) { - // Try as method or field in parent package - try { - Class packageClass = Class.forName(packageName); - Method[] methods = packageClass.getMethods(); - Field[] fields = packageClass.getFields(); - - boolean foundMethod = false; - for (Method method : methods) { - if (method.getName().equals(entityName)) { - foundMethod = true; - break; - } - } - - boolean foundField = false; - for (Field field : fields) { - if (field.getName().equals(entityName)) { - foundField = true; - break; - } - } - - if (foundMethod || foundField) { - System.out.println("{\\"hasEntity\\": true, \\"entityType\\": \\"member\\"}"); - } else { - System.out.println("{\\"hasEntity\\": false}"); - } - } catch (Exception ex) { - System.out.println("{\\"hasEntity\\": false}"); - } - } - } -} - """ - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".java", delete=False - ) as f: - f.write(java_program) - java_file = f.name - - try: - compile_result = subprocess.run( - ["javac", java_file], - check=False, - capture_output=True, - text=True, - timeout=10, - ) - - if compile_result.returncode == 0: - class_name = os.path.splitext(os.path.basename(java_file))[0] - run_result = subprocess.run( - [ - "java", - "-cp", - os.path.dirname(java_file), - class_name, - package_name, - entity_name, - ], - check=False, - capture_output=True, - text=True, - timeout=10, - ) - - if run_result.returncode == 0: - data = json.loads(run_result.stdout.strip()) - if data.get(cs.JSON_KEY_HAS_ENTITY): - return cs.SEPARATOR_DOT.join(parts[:-1]) - - finally: - for ext in (cs.EXT_JAVA, cs.EXT_CLASS): - temp_file = os.path.splitext(java_file)[0] + ext - try: - os.unlink(temp_file) - except OSError: - pass - - except ( - subprocess.TimeoutExpired, - subprocess.CalledProcessError, - json.JSONDecodeError, - OSError, - ): - pass - entity_name = parts[-1] - if ( - entity_name[0].isupper() + 
is_class_entity = ( + entity_name[:1].isupper() or entity_name.endswith(cs.JAVA_SUFFIX_EXCEPTION) or entity_name.endswith(cs.JAVA_SUFFIX_ERROR) or entity_name.endswith(cs.JAVA_SUFFIX_INTERFACE) or entity_name.endswith(cs.JAVA_SUFFIX_BUILDER) or entity_name in cs.JAVA_STDLIB_CLASSES - ): - return cs.SEPARATOR_DOT.join(parts[:-1]) + ) + + if full_qualified_name.startswith(cs.JAVA_STDLIB_PREFIXES): + result = ( + cs.SEPARATOR_DOT.join(parts[:-1]) + if is_class_entity + else full_qualified_name + ) + _cache_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name, result + ) + return result + if is_class_entity: + result = cs.SEPARATOR_DOT.join(parts[:-1]) + _cache_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name, result + ) + return result + + _cache_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name, full_qualified_name + ) return full_qualified_name def _extract_lua_stdlib_path(self, full_qualified_name: str) -> str: @@ -750,7 +651,7 @@ def _extract_lua_stdlib_path(self, full_qualified_name: str) -> str: pass entity_name = parts[-1] - if entity_name[0].isupper() or entity_name in cs.LUA_STDLIB_MODULES: + if entity_name[:1].isupper() or entity_name in cs.LUA_STDLIB_MODULES: return cs.SEPARATOR_DOT.join(parts[:-1]) return full_qualified_name @@ -759,7 +660,7 @@ def _extract_generic_stdlib_path(self, full_qualified_name: str) -> str: parts = full_qualified_name.split(cs.SEPARATOR_DOT) if len(parts) >= 2: entity_name = parts[-1] - if entity_name[0].isupper(): + if entity_name[:1].isupper(): return cs.SEPARATOR_DOT.join(parts[:-1]) return full_qualified_name diff --git a/codebase_rag/parsers/structure_processor.py b/codebase_rag/parsers/structure_processor.py index 9b4065bd3..78b853773 100644 --- a/codebase_rag/parsers/structure_processor.py +++ b/codebase_rag/parsers/structure_processor.py @@ -6,10 +6,24 @@ from .. 
import logs from ..services import IngestorProtocol from ..types_defs import LanguageQueries, NodeIdentifier -from ..utils.path_utils import should_skip_path +from ..utils.path_utils import ( + cached_relative_path, + cached_resolve_posix, + should_skip_path, +) class StructureProcessor: + __slots__ = ( + "ingestor", + "repo_path", + "project_name", + "queries", + "structural_elements", + "unignore_paths", + "exclude_paths", + ) + def __init__( self, ingestor: IngestorProtocol, @@ -47,19 +61,18 @@ def identify_structure(self) -> None: ): directories.add(path) + package_indicators: set[str] = set() + for lang_queries in self.queries.values(): + lang_config = lang_queries[cs.QUERY_CONFIG] + package_indicators.update(lang_config.package_indicators) + for root in sorted(directories): - relative_root = root.relative_to(self.repo_path) + relative_root = cached_relative_path(root, self.repo_path) parent_rel_path = relative_root.parent parent_container_qn = self.structural_elements.get(parent_rel_path) is_package = False - package_indicators: set[str] = set() - - for lang_queries in self.queries.values(): - lang_config = lang_queries[cs.QUERY_CONFIG] - package_indicators.update(lang_config.package_indicators) - for indicator in package_indicators: if (root / indicator).exists(): is_package = True @@ -79,6 +92,7 @@ def identify_structure(self) -> None: cs.KEY_QUALIFIED_NAME: package_qn, cs.KEY_NAME: root.name, cs.KEY_PATH: relative_root.as_posix(), + cs.KEY_ABSOLUTE_PATH: cached_resolve_posix(root), }, ) parent_identifier = self._get_parent_identifier( @@ -96,7 +110,11 @@ def identify_structure(self) -> None: ) self.ingestor.ensure_node_batch( cs.NodeLabel.FOLDER, - {cs.KEY_PATH: relative_root.as_posix(), cs.KEY_NAME: root.name}, + { + cs.KEY_PATH: relative_root.as_posix(), + cs.KEY_NAME: root.name, + cs.KEY_ABSOLUTE_PATH: cached_resolve_posix(root), + }, ) parent_identifier = self._get_parent_identifier( parent_rel_path, parent_container_qn @@ -108,8 +126,8 @@ def 
identify_structure(self) -> None: ) def process_generic_file(self, file_path: Path, file_name: str) -> None: - relative_filepath = file_path.relative_to(self.repo_path).as_posix() - relative_root = file_path.parent.relative_to(self.repo_path) + relative_filepath = cached_relative_path(file_path, self.repo_path).as_posix() + relative_root = cached_relative_path(file_path.parent, self.repo_path) parent_container_qn = self.structural_elements.get(relative_root) parent_identifier = self._get_parent_identifier( @@ -122,6 +140,7 @@ def process_generic_file(self, file_path: Path, file_name: str) -> None: cs.KEY_PATH: relative_filepath, cs.KEY_NAME: file_name, cs.KEY_EXTENSION: file_path.suffix, + cs.KEY_ABSOLUTE_PATH: cached_resolve_posix(file_path), }, ) diff --git a/codebase_rag/parsers/type_inference.py b/codebase_rag/parsers/type_inference.py index 815e4af81..f4834c255 100644 --- a/codebase_rag/parsers/type_inference.py +++ b/codebase_rag/parsers/type_inference.py @@ -19,6 +19,22 @@ class TypeInferenceEngine: + __slots__ = ( + "import_processor", + "function_registry", + "repo_path", + "project_name", + "ast_cache", + "queries", + "module_qn_to_file_path", + "class_inheritance", + "simple_name_lookup", + "_java_type_inference", + "_lua_type_inference", + "_js_type_inference", + "_python_type_inference", + ) + def __init__( self, import_processor: ImportProcessor, @@ -80,6 +96,7 @@ def js_type_inference(self) -> JsTypeInferenceEngine: function_registry=self.function_registry, project_name=self.project_name, find_method_ast_node_func=self.python_type_inference._find_method_ast_node, + queries=self.queries, ) return self._js_type_inference @@ -110,7 +127,7 @@ def build_local_variable_type_map( ) case cs.SupportedLanguage.JS | cs.SupportedLanguage.TS: return self.js_type_inference.build_local_variable_type_map( - caller_node, module_qn + caller_node, module_qn, language ) case cs.SupportedLanguage.JAVA: return self.java_type_inference.build_variable_type_map( diff --git 
a/codebase_rag/parsers/utils.py b/codebase_rag/parsers/utils.py index b164a5022..ce5e27288 100644 --- a/codebase_rag/parsers/utils.py +++ b/codebase_rag/parsers/utils.py @@ -2,6 +2,7 @@ from collections.abc import Callable from functools import lru_cache +from pathlib import Path from typing import TYPE_CHECKING, NamedTuple from loguru import logger @@ -17,18 +18,59 @@ SimpleNameLookup, TreeSitterNodeProtocol, ) +from ..utils.path_utils import cached_relative_path, cached_resolve_posix if TYPE_CHECKING: from ..language_spec import LanguageSpec from ..services import IngestorProtocol from ..types_defs import FunctionRegistryTrieProtocol +_QUERY_CACHE: dict[tuple[int, str], Query] = {} +_QUERY_LAST: tuple[tuple[int, str], Query] | None = None + + +def get_cached_query(language_obj, query_text: str) -> Query: + global _QUERY_LAST + key = (id(language_obj), query_text) + if _QUERY_LAST is not None and _QUERY_LAST[0] == key: + return _QUERY_LAST[1] + if key not in _QUERY_CACHE: + _QUERY_CACHE[key] = Query(language_obj, query_text) + result = _QUERY_CACHE[key] + _QUERY_LAST = (key, result) + return result + class FunctionCapturesResult(NamedTuple): lang_config: LanguageSpec captures: dict[str, list[ASTNode]] +def sorted_captures(cursor: QueryCursor, node: ASTNode) -> dict[str, list[ASTNode]]: + # (H) tree-sitter v0.25 captures() returns nodes in non-deterministic order + # (H) across process invocations; sort by start_byte for reproducibility + raw = cursor.captures(node) + result: dict[str, list[ASTNode]] = {} + for name, nodes in raw.items(): + if len(nodes) <= 1: + result[name] = nodes + else: + is_sorted = True + prev_byte = nodes[0].start_byte + for i in range(1, len(nodes)): + cur_byte = nodes[i].start_byte + if cur_byte < prev_byte: + is_sorted = False + break + prev_byte = cur_byte + result[name] = nodes if is_sorted else sorted(nodes, key=_start_byte_key) + return result + + +def _start_byte_key(n: ASTNode) -> int: + return n.start_byte + + def 
get_function_captures( root_node: ASTNode, language: cs.SupportedLanguage, @@ -41,11 +83,11 @@ def get_function_captures( return None cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) return FunctionCapturesResult(lang_config, captures) -@lru_cache(maxsize=10000) +@lru_cache(maxsize=50000) def _cached_decode_bytes(text_bytes: bytes) -> str: return text_bytes.decode(cs.ENCODING_UTF8) @@ -72,6 +114,102 @@ def contains_node(parent: ASTNode, target: ASTNode) -> bool: ) +def _decorator_tail_names(decorators: list[str]) -> set[str]: + return { + decorator.lstrip(cs.DECORATOR_AT).split(cs.SEPARATOR_DOT)[-1] + for decorator in decorators + } + + +def _is_property_decorator(decorators: list[str]) -> bool: + return bool(_decorator_tail_names(decorators) & cs.PROPERTY_DECORATORS) + + +def _is_abstract_decorator(decorators: list[str]) -> bool: + return bool(_decorator_tail_names(decorators) & cs.ABSTRACT_DECORATORS) + + +_PY_NAMED_PARAMETERS = frozenset( + {cs.TS_PY_DEFAULT_PARAMETER, cs.TS_PY_TYPED_DEFAULT_PARAMETER} +) +_PY_SCOPE_BOUNDARIES = frozenset( + { + cs.TS_PY_FUNCTION_DEFINITION, + cs.TS_PY_CLASS_DEFINITION, + cs.TS_PY_DECORATED_DEFINITION, + } +) + + +def _python_parameter_name(param_node: Node) -> str | None: + if param_node.type == cs.TS_PY_IDENTIFIER: + return safe_decode_text(param_node) + if param_node.type in _PY_NAMED_PARAMETERS: + name_node = param_node.child_by_field_name(cs.FIELD_NAME) + if name_node is not None and name_node.type == cs.TS_PY_IDENTIFIER: + return safe_decode_text(name_node) + return None + if param_node.type == cs.TS_PY_TYPED_PARAMETER: + for child in param_node.children: + if child.type == cs.TS_PY_IDENTIFIER: + return safe_decode_text(child) + return None + + +def _python_invoked_parameter_names(body_node: Node, candidates: set[str]) -> set[str]: + invoked: set[str] = set() + stack = [body_node] + while stack: + node = stack.pop() + if node.type == cs.TS_PY_CALL: + fn = 
node.child_by_field_name(cs.FIELD_FUNCTION) + if ( + fn is not None + and fn.type == cs.TS_PY_IDENTIFIER + and (name := safe_decode_text(fn)) in candidates + ): + invoked.add(name) + for child in node.children: + # (H) Nested def/class bodies rebind the param name, so do not let an + # (H) inner call to a same-named local masquerade as the outer param. + if child.type not in _PY_SCOPE_BOUNDARIES: + stack.append(child) + return invoked + + +def python_parameter_names(func_node: Node) -> list[str]: + # (H) Ordered parameter names with a leading self/cls dropped, so positions line + # (H) up with how call-site arguments map to parameters for bound methods. + params_node = func_node.child_by_field_name(cs.FIELD_PARAMETERS) + if params_node is None: + return [] + names: list[str] = [] + for child in params_node.named_children: + if (name := _python_parameter_name(child)) is not None: + names.append(name) + if names and names[0] in (cs.PY_KEYWORD_SELF, cs.PY_KEYWORD_CLS): + names = names[1:] + return names + + +def callable_parameter_indices( + func_node: Node, language: cs.SupportedLanguage | None +) -> dict[str, int]: + # (H) Maps each parameter that is invoked as a call inside the function body + # (H) to its positional index in the call-site argument list (self/cls + # (H) dropped so the index lines up with how bound methods are invoked). 
+ if language != cs.SupportedLanguage.PYTHON: + return {} + body_node = func_node.child_by_field_name(cs.FIELD_BODY) + if body_node is None or not (names := python_parameter_names(func_node)): + return {} + + invoked = _python_invoked_parameter_names(body_node, set(names)) + if not invoked: + return {} + return {name: index for index, name in enumerate(names) if name in invoked} + + def ingest_method( method_node: ASTNode, container_qn: str, @@ -83,6 +221,8 @@ def ingest_method( language: cs.SupportedLanguage | None = None, extract_decorators_func: Callable[[ASTNode], list[str]] | None = None, method_qualified_name: str | None = None, + file_path: Path | None = None, + repo_path: Path | None = None, ) -> None: if language == cs.SupportedLanguage.CPP: from .cpp import utils as cpp_utils @@ -98,6 +238,10 @@ def ingest_method( method_name = text.decode(cs.ENCODING_UTF8) method_qn = method_qualified_name or f"{container_qn}.{method_name}" + if language != cs.SupportedLanguage.CPP: + method_qn = function_registry.register_unique_qn( + method_qn, method_node.start_point[0] + 1 + ) decorators = extract_decorators_func(method_node) if extract_decorators_func else [] @@ -109,14 +253,36 @@ def ingest_method( cs.KEY_END_LINE: method_node.end_point[0] + 1, cs.KEY_DOCSTRING: get_docstring_func(method_node), } + if file_path is not None and repo_path is not None: + method_props[cs.KEY_PATH] = cached_relative_path( + file_path, repo_path + ).as_posix() + method_props[cs.KEY_ABSOLUTE_PATH] = cached_resolve_posix(file_path) logger.info(logs.METHOD_FOUND.format(name=method_name, qn=method_qn)) ingestor.ensure_node_batch(cs.NodeLabel.METHOD, method_props) function_registry[method_qn] = NodeType.METHOD + if _is_property_decorator(decorators): + function_registry.mark_property(method_qn) + if _is_abstract_decorator(decorators): + function_registry.mark_abstract(method_qn) + function_registry.mark_callable_params( + method_qn, callable_parameter_indices(method_node, language) + ) 
simple_name_lookup[method_name].add(method_qn) + # (H) The DEFINES_METHOD parent is matched in the graph by LABEL + + # (H) qualified_name, so it must carry the container's real node label. Callers + # (H) pass Class by default, but a trait/interface (Interface) or enum (Enum) + # (H) container would then never match, dropping the containment edge. Prefer + # (H) the label the container was actually registered with. + container_label = container_type + registered = function_registry.get(container_qn) + if registered is not None and registered != NodeType.METHOD: + container_label = cs.NodeLabel(registered.value) + ingestor.ensure_relationship_batch( - (container_type, cs.KEY_QUALIFIED_NAME, container_qn), + (container_label, cs.KEY_QUALIFIED_NAME, container_qn), cs.RelationshipType.DEFINES_METHOD, (cs.NodeLabel.METHOD, cs.KEY_QUALIFIED_NAME, method_qn), ) @@ -137,6 +303,9 @@ def ingest_exported_function( return function_qn = f"{module_qn}.{function_name}" + function_qn = function_registry.register_unique_qn( + function_qn, function_node.start_point[0] + 1 + ) function_props = { cs.KEY_QUALIFIED_NAME: function_qn, @@ -161,8 +330,21 @@ def is_method_node(func_node: ASTNode, lang_config: LanguageSpec) -> bool: if not isinstance(current, Node): return False - while current and current.type not in lang_config.module_node_types: - if current.type in lang_config.class_node_types: + class_types = lang_config.class_node_types + func_types = lang_config.function_node_types + module_types = lang_config.module_node_types + body_field = cs.FIELD_BODY + + while current is not None: + current_type = current.type + if current_type in module_types: + return False + if current_type in class_types: return True + if ( + current_type in func_types + and current.child_by_field_name(body_field) is not None + ): + return False current = current.parent return False diff --git a/codebase_rag/prompts.py b/codebase_rag/prompts.py index de5cce132..9eaae75b1 100644 --- a/codebase_rag/prompts.py 
+++ b/codebase_rag/prompts.py @@ -26,7 +26,6 @@ def extract_tool_names(tools: list["Tool"]) -> ToolNames: "query_codebase_knowledge_graph", "query_codebase_knowledge_graph" ), read_file=tool_map.get("read_file_content", "read_file_content"), - analyze_document=tool_map.get("analyze_document", "analyze_document"), semantic_search=tool_map.get("semantic_code_search", "semantic_code_search"), create_file=tool_map.get("create_new_file", "create_new_file"), edit_file=tool_map.get("replace_code_surgically", "replace_code_surgically"), @@ -40,7 +39,37 @@ def extract_tool_names(tools: list["Tool"]) -> ToolNames: - **Use `STARTS WITH` for Paths**: When matching paths, always use `STARTS WITH` for robustness (e.g., `WHERE n.path STARTS WITH 'workflows/src'`). Do not use `=`. - **Use `ENDS WITH` for qualified_name**: The `qualified_name` property contains full paths like `'Project.folder.subfolder.ClassName'`. When users mention a class, function, or method by its short name (e.g., "VatManager"), use `ENDS WITH` to match: `WHERE c.qualified_name ENDS WITH '.VatManager'`. Do NOT use `{name: 'VatManager'}` equality matching. - **Use `toLower()` for Searches**: For case-insensitive searching on string properties, use `toLower()`. -- **Querying Lists**: To check if a list property (like `decorators`) contains an item, use the `ANY` or `IN` clause (e.g., `WHERE 'flow' IN n.decorators`).""" +- **Querying Lists**: To check if a list property (like `decorators`) contains an item, use the `ANY` or `IN` clause (e.g., `WHERE 'flow' IN n.decorators`). +- **NEVER use unbounded variable-length paths**: Patterns like `[:CALLS*]`, `[*]`, `[:CALLS*1..]` enumerate every path in the graph and exhaust memory. Always cap with an upper bound, e.g. `[:CALLS*1..6]`. If you genuinely need unbounded reachability, use a MAGE procedure (see Section 2b) instead of variable-length Cypher. + +**2b. 
Graph Algorithm Procedures (MAGE)** + +For algorithmic questions (longest/shortest paths, cycles, recursion clusters, centrality, communities, reachability), prefer calling a MAGE procedure over writing variable-length Cypher. Cypher path patterns enumerate all matches with no memoization, so they OOM on cyclic graphs; MAGE procedures run real graph algorithms in bounded memory. + +Use these read-only procedures (call them with `CALL (...) YIELD ... RETURN ...`): + +- **Strongly connected components / recursion clusters**: `CALL nxalg.strongly_connected_components() YIELD components` +- **Weakly connected components**: `CALL weakly_connected_components.get() YIELD node, component_id` or `CALL wcc.get_components(nodes, edges)` +- **Cycles**: `CALL nxalg.simple_cycles() YIELD cycles` (all cycles), `CALL nxalg.find_cycle() YIELD cycle` (one cycle) +- **All simple paths between two nodes (bounded)**: `CALL nxalg.all_simple_paths(source, target, cutoff)` or `CALL algo.all_simple_paths(source, target, [:CALLS], maxHops)` +- **Shortest path**: `CALL nxalg.shortest_path(source, target)` or `CALL algo.astar(source, target, config)` +- **Reachability**: `CALL graph_util.ancestors(node)`, `CALL graph_util.descendants(node)` +- **Topological order (DAGs only)**: `CALL nxalg.topological_sort() YIELD nodes` or `CALL graph_util.topological_sort()` +- **PageRank**: `CALL pagerank.get() YIELD node, rank` or `CALL nxalg.pagerank() YIELD node, rank` +- **Betweenness centrality**: `CALL betweenness_centrality.get() YIELD node, betweenness_centrality` +- **Degree centrality**: `CALL degree_centrality.get() YIELD node, degree` +- **Communities**: `CALL community_detection.get() YIELD node, community_id`, `CALL leiden_community_detection.get() YIELD node, community_id` +- **Articulation / bridges**: `CALL bridges.get() YIELD ...`, `CALL nxalg.biconnected_components() YIELD nodes` +- **Dominators**: `CALL nxalg.immediate_dominators(start) YIELD node, dominator` +- **Path expansion 
(bounded BFS over filtered edges)**: `CALL path.expand(start, relationships, labels, minHops, maxHops) YIELD path` + +Important: MAGE procedures named `nxalg.*` and several others operate on the **entire graph**, ignoring edge-type filters. To restrict to a specific edge type (e.g., only `CALLS`), follow the procedure call with a `WHERE` clause that checks `EXISTS((a)-[:CALLS]->(b))` or use `path.expand` which accepts a relationship-type filter. + +**2c. When Cypher Can't Answer** + +If a question cannot be expressed as a bounded Cypher pattern or as a single MAGE procedure call (e.g., "longest call chain in a graph with cycles"), return your best bounded approximation rather than an unbounded path query. Examples: +- "longest call chain" → `CALL nxalg.strongly_connected_components() YIELD components RETURN components` (let the orchestrator post-process), or use `CALL path.expand` with a generous but finite `maxHops`. +- "find a deeply-nested call site" → use a bounded depth such as `[:CALLS*1..10]` with `ORDER BY ... LIMIT 1`.""" def build_graph_schema_and_rules() -> str: @@ -58,9 +87,41 @@ def build_graph_schema_and_rules() -> str: GRAPH_SCHEMA_AND_RULES = build_graph_schema_and_rules() -def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: +def _format_active_projects_block(active_projects: list[str] | None) -> str: + if not active_projects: + return ( + "\n**Project Scope**: This Memgraph database may contain multiple " + "indexed projects. Call `list_projects` early to enumerate them, then " + "scope graph queries by filtering on the `qualified_name` prefix " + "(e.g., `WHERE n.qualified_name STARTS WITH 'projectName.'`).\n" + ) + if len(active_projects) == 1: + return ( + f"\n**Project Scope**: This session is focused on the project " + f"`{active_projects[0]}`. 
Scope Cypher queries by filtering on " + f"`WHERE n.qualified_name STARTS WITH '{active_projects[0]}.'` " + "unless the user explicitly asks about other projects.\n" + ) + project_list = ", ".join(f"`{p}`" for p in active_projects) + starts_with_examples = " OR ".join( + f"n.qualified_name STARTS WITH '{p}.'" for p in active_projects + ) + return ( + f"\n**Project Scope**: This session spans the following projects: " + f"{project_list}. When users ask cross-project questions, query across " + "all of them. To restrict to one project, filter " + f"`n.qualified_name STARTS WITH '.'`. To restrict to the " + f"active set, filter with `{starts_with_examples}`.\n" + ) + + +def build_rag_orchestrator_prompt( + tools: list["Tool"], + project_instructions: str | None = None, + active_projects: list[str] | None = None, +) -> str: t = extract_tool_names(tools) - return f"""You are an expert AI assistant for analyzing codebases. Your answers are based **EXCLUSIVELY** on information retrieved using your tools. + base = f"""You are an expert AI assistant for analyzing codebases. Your answers are based **EXCLUSIVELY** on information retrieved using your tools. **CRITICAL RULES:** 1. **TOOL-ONLY ANSWERS**: You must ONLY use information from the tools provided. Do not use external knowledge. @@ -68,10 +129,10 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: 3. **HONESTY**: If a tool fails or returns no results, you MUST state that clearly and report any error messages. Do not invent answers. 4. **CHOOSE THE RIGHT TOOL FOR THE FILE TYPE**: - For source code files (.py, .ts, etc.), use `{t.read_file}`. - - For documents like PDFs, use the `{t.analyze_document}` tool. This is more effective than trying to read them as plain text. + - Images and PDFs the user references are attached inline to the message; read them directly from your own multimodal input. **Your General Approach:** -1. 
**Analyze Documents**: If the user asks a question about a document (like a PDF), you **MUST** use the `{t.analyze_document}` tool. Provide both the `file_path` and the user's `question` to the tool. +1. **Inspect Attached Media Directly**: When the user attaches an image or PDF, analyze it from the inline content of the message. Do not call a tool for it. 2. **Deep Dive into Code**: When you identify a relevant component (e.g., a folder), you must go beyond documentation. a. First, check if documentation files like `README.md` exist and read them for context. For configuration, look for files appropriate to the language (e.g., `pyproject.toml` for Python, `package.json` for Node.js). b. **Then, you MUST dive into the source code.** Explore the `src` directory (or equivalent). Identify and read key files (e.g., `main.py`, `index.ts`, `app.ts`) to understand the implementation details, logic, and functionality. @@ -128,6 +189,18 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: d. Prioritize most relevant findings over comprehensive coverage 8. **Synthesize Answer**: Analyze and explain the retrieved content. Cite your sources (file paths or qualified names). Report any errors gracefully. """ + base += _format_active_projects_block(active_projects) + extra = (project_instructions or "").strip() + if not extra: + return base + return ( + f"{base}\n" + "**Project-Specific Instructions (from .cgr.md):**\n" + "These instructions come from the repository being analyzed. Follow them " + "in addition to the rules above; if they conflict with the critical rules, " + "the critical rules win.\n\n" + f"{extra}\n" + ) CYPHER_SYSTEM_PROMPT = f""" @@ -196,6 +269,14 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: - CORRECT: `MATCH (c:Class) RETURN count(c) AS total` - WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!) 
+**VALUE PATTERN RULES (CRITICAL FOR NAME MATCHING):** +- The `qualified_name` property contains FULL paths like: `'Project.folder.subfolder.ClassName'` +- When users mention a class or function by SHORT NAME (e.g., "VatManager", "UserService"), you MUST match using the `name` property, NOT `qualified_name`. +- CORRECT: `WHERE c.name = 'VatManager'` +- WRONG: `WHERE c.qualified_name = 'VatManager'` (will never match!) +- Use `DEFINES_METHOD` relationship to find methods of a class. +- Use `DEFINES` relationship to find functions/classes defined in a module. + **Examples:** * **Natural Language:** "How many classes are there?" @@ -235,7 +316,7 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: ``` * **Natural Language:** "What methods does UserService have?" or "Show me methods in UserService" or "List UserService methods" -* **Cypher Query (Use ENDS WITH to match class by short name):** +* **Cypher Query (Note: match by `name` property, use `DEFINES_METHOD` relationship):** ```cypher {CYPHER_EXAMPLE_CLASS_METHODS} ``` @@ -262,7 +343,7 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: Please: 1. Use your code retrieval and graph querying tools to understand the codebase structure 2. Read relevant source files to identify optimization opportunities -3. Use the analyze_document tool to reference best practices from {reference_document} +3. Reference best practices from {reference_document} (attached inline) 4. Reference established patterns and best practices for {language} 5. Propose specific, actionable optimizations with file references 6. 
IMPORTANT: Do not make any changes yet - just propose them and wait for approval diff --git a/codebase_rag/providers/base.py b/codebase_rag/providers/base.py index 37f5cb462..0716b5f38 100644 --- a/codebase_rag/providers/base.py +++ b/codebase_rag/providers/base.py @@ -6,8 +6,13 @@ import httpx from loguru import logger +from pydantic_ai.models.anthropic import AnthropicModel, AnthropicModelSettings from pydantic_ai.models.google import GoogleModel, GoogleModelSettings from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel +from pydantic_ai.providers.anthropic import ( + AnthropicProvider as PydanticAnthropicProvider, +) +from pydantic_ai.providers.azure import AzureProvider as PydanticAzureProvider from pydantic_ai.providers.google import GoogleProvider as PydanticGoogleProvider from pydantic_ai.providers.openai import OpenAIProvider as PydanticOpenAIProvider @@ -18,13 +23,15 @@ class ModelProvider(ABC): + __slots__ = ("config",) + def __init__(self, **config: str | int | None) -> None: self.config = config @abstractmethod def create_model( self, model_id: str, **kwargs: str | int | None - ) -> GoogleModel | OpenAIResponsesModel | OpenAIChatModel: + ) -> GoogleModel | OpenAIResponsesModel | OpenAIChatModel | AnthropicModel: pass @abstractmethod @@ -37,7 +44,25 @@ def provider_name(self) -> cs.Provider: pass +def _resolve_api_key(api_key: str | None, env_var: str) -> str | None: + env_key = os.environ.get(env_var) + if env_key: + return env_key + if api_key and api_key != cs.DEFAULT_API_KEY: + return api_key + return None + + class GoogleProvider(ModelProvider): + __slots__ = ( + "api_key", + "provider_type", + "project_id", + "region", + "service_account_file", + "thinking_budget", + ) + def __init__( self, api_key: str | None = None, @@ -49,7 +74,7 @@ def __init__( **kwargs: str | int | None, ) -> None: super().__init__(**kwargs) - self.api_key = api_key or os.environ.get(cs.ENV_GOOGLE_API_KEY) + self.api_key = _resolve_api_key(api_key, 
cs.ENV_GOOGLE_API_KEY) self.provider_type = provider_type self.project_id = project_id self.region = region @@ -98,6 +123,8 @@ def create_model(self, model_id: str, **kwargs: str | int | None) -> GoogleModel class OpenAIProvider(ModelProvider): + __slots__ = ("api_key", "endpoint") + def __init__( self, api_key: str | None = None, @@ -105,7 +132,7 @@ def __init__( **kwargs: str | int | None, ) -> None: super().__init__(**kwargs) - self.api_key = api_key or os.environ.get(cs.ENV_OPENAI_API_KEY) + self.api_key = _resolve_api_key(api_key, cs.ENV_OPENAI_API_KEY) self.endpoint = endpoint @property @@ -126,6 +153,8 @@ def create_model( class OllamaProvider(ModelProvider): + __slots__ = ("endpoint", "api_key") + def __init__( self, endpoint: str | None = None, @@ -155,12 +184,96 @@ def create_model( return OpenAIChatModel(model_id, provider=provider) +class AnthropicProvider(ModelProvider): + __slots__ = ("api_key",) + + def __init__( + self, + api_key: str | None = None, + **kwargs: str | int | None, + ) -> None: + super().__init__(**kwargs) + self.api_key = _resolve_api_key(api_key, cs.ENV_ANTHROPIC_API_KEY) + + @property + def provider_name(self) -> cs.Provider: + return cs.Provider.ANTHROPIC + + def validate_config(self) -> None: + if not self.api_key: + raise ValueError(ex.ANTHROPIC_NO_KEY) + + def create_model(self, model_id: str, **kwargs: str | int | None) -> AnthropicModel: + self.validate_config() + # (H) api_key is guaranteed to be set by validate_config + assert self.api_key is not None + provider = PydanticAnthropicProvider(api_key=self.api_key) + model_settings = AnthropicModelSettings( + anthropic_cache_instructions=True, + anthropic_cache_tool_definitions=True, + anthropic_cache_messages=True, + ) + return AnthropicModel(model_id, provider=provider, settings=model_settings) + + +class AzureOpenAIProvider(ModelProvider): + __slots__ = ("api_key", "endpoint", "api_version") + + def __init__( + self, + api_key: str | None = None, + endpoint: str | None = 
def check_litellm_proxy_running(
    endpoint: str = "http://localhost:4000", api_key: str | None = None
) -> bool:
    """Report whether a LiteLLM proxy answers at ``endpoint``.

    Args:
        endpoint: Proxy URL, with or without a trailing ``/v1`` segment.
        api_key: Optional key, sent as a ``Bearer`` authorization header.

    Returns:
        ``True`` when ``GET /health`` answers with ``cs.HTTP_OK``. If it does
        not and an ``api_key`` was given, ``True`` when ``GET /v1/models``
        answers with ``cs.HTTP_OK``. ``False`` otherwise, including on httpx
        request and timeout errors.
    """
    # (H) str.rstrip("/v1") removes any trailing run of the characters "/", "v"
    # (H) and "1" (port 4001 became 400); removesuffix drops only the literal segment.
    base_url = endpoint.rstrip("/").removesuffix("/v1").rstrip("/")
    health_url = urljoin(base_url, "/health")
    headers: dict[str, str] = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    try:
        with httpx.Client(timeout=settings.LITELLM_HEALTH_TIMEOUT) as client:
            response = client.get(health_url, headers=headers)
            if response.status_code == cs.HTTP_OK:
                return True

            # (H) Fallback to models endpoint for authenticated proxies
            if api_key:
                models_url = urljoin(base_url, "/v1/models")
                response = client.get(models_url, headers=headers)
                return response.status_code == cs.HTTP_OK

            return False
    except (httpx.RequestError, httpx.TimeoutException):
        return False
ValueError(ex.LITELLM_NOT_RUNNING.format(endpoint=base_url)) + + def create_model( + self, model_id: str, **kwargs: str | int | None + ) -> OpenAIChatModel: + self.validate_config() + + logger.info(f"Creating LiteLLM proxy model: {model_id} at {self.endpoint}") + + provider = PydanticLiteLLMProvider(api_key=self.api_key, api_base=self.endpoint) + return OpenAIChatModel(model_id, provider=provider) diff --git a/codebase_rag/services/anthropic_token_counter.py b/codebase_rag/services/anthropic_token_counter.py new file mode 100644 index 000000000..a207d8af8 --- /dev/null +++ b/codebase_rag/services/anthropic_token_counter.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +import base64 +from typing import Any + +import httpx +from pydantic_ai import BinaryContent +from pydantic_ai.messages import ( + ModelMessage, + ModelRequest, + ModelResponse, + RetryPromptPart, + SystemPromptPart, + TextPart, + ToolCallPart, + ToolReturnPart, + UserPromptPart, +) + +from .. import constants as cs + + +def _binary_block(item: BinaryContent) -> dict[str, Any]: + media = item.media_type or cs.MIME_TYPE_FALLBACK + block_type = "image" if media.startswith("image/") else "document" + return { + "type": block_type, + "source": { + "type": "base64", + "media_type": media, + "data": base64.b64encode(item.data).decode(), + }, + } + + +def _user_part_to_blocks(part: UserPromptPart) -> list[dict[str, Any]]: + content = part.content + if isinstance(content, str): + return [{"type": "text", "text": content}] + blocks: list[dict[str, Any]] = [] + for item in content: + if isinstance(item, str): + blocks.append({"type": "text", "text": item}) + elif isinstance(item, BinaryContent): + blocks.append(_binary_block(item)) + return blocks + + +def _tool_return_content(value: object) -> str | list[dict[str, Any]]: + if isinstance(value, str): + return value + if isinstance(value, list): + out: list[dict[str, Any]] = [] + for item in value: + if isinstance(item, str): + out.append({"type": 
"text", "text": item}) + elif isinstance(item, BinaryContent): + out.append(_binary_block(item)) + if out: + return out + return str(value) + + +def _to_anthropic_payload( + messages: list[ModelMessage], +) -> tuple[str, list[dict[str, Any]]]: + system_parts: list[str] = [] + out: list[dict[str, Any]] = [] + for m in messages: + if isinstance(m, ModelRequest): + user_content: list[dict[str, Any]] = [] + for part in m.parts: + if isinstance(part, SystemPromptPart): + system_parts.append(part.content) + elif isinstance(part, UserPromptPart): + user_content.extend(_user_part_to_blocks(part)) + elif isinstance(part, ToolReturnPart): + user_content.append( + { + "type": "tool_result", + "tool_use_id": part.tool_call_id, + "content": _tool_return_content(part.content), + } + ) + elif isinstance(part, RetryPromptPart): + if part.tool_name is None: + user_content.append( + {"type": "text", "text": part.model_response()} + ) + else: + user_content.append( + { + "type": "tool_result", + "tool_use_id": part.tool_call_id, + "content": part.model_response(), + "is_error": True, + } + ) + if user_content: + out.append({"role": "user", "content": user_content}) + elif isinstance(m, ModelResponse): + assistant_content: list[dict[str, Any]] = [] + for part in m.parts: + if isinstance(part, TextPart): + if part.content: + assistant_content.append({"type": "text", "text": part.content}) + elif isinstance(part, ToolCallPart): + assistant_content.append( + { + "type": "tool_use", + "id": part.tool_call_id, + "name": part.tool_name, + "input": part.args_as_dict() or {}, + } + ) + if assistant_content: + out.append({"role": "assistant", "content": assistant_content}) + return "\n".join(system_parts), out + + +class TokenCountError(Exception): + pass + + +async def count_anthropic_context( + api_key: str, + model_id: str, + messages: list[ModelMessage], +) -> int: + system_prompt, anthropic_messages = _to_anthropic_payload(messages) + if not anthropic_messages: + if not system_prompt: + 
return 0 + anthropic_messages = [ + {"role": "user", "content": [{"type": "text", "text": "."}]} + ] + payload: dict[str, Any] = { + "model": model_id, + "messages": anthropic_messages, + } + if system_prompt: + payload["system"] = system_prompt + headers = { + cs.ANTHROPIC_HEADER_API_KEY: api_key, + cs.ANTHROPIC_HEADER_VERSION: cs.ANTHROPIC_API_VERSION, + cs.HEADER_CONTENT_TYPE: cs.CONTENT_TYPE_JSON, + } + async with httpx.AsyncClient(timeout=cs.ANTHROPIC_COUNT_TIMEOUT_S) as client: + resp = await client.post( + cs.ANTHROPIC_COUNT_TOKENS_URL, json=payload, headers=headers + ) + if resp.status_code >= 400: + raise TokenCountError(f"{resp.status_code}: {resp.text}") + return int(resp.json().get("input_tokens", 0)) diff --git a/codebase_rag/services/graph_service.py b/codebase_rag/services/graph_service.py index 7a8d95e02..adee02449 100644 --- a/codebase_rag/services/graph_service.py +++ b/codebase_rag/services/graph_service.py @@ -1,19 +1,25 @@ from __future__ import annotations +import threading import types from collections import defaultdict from collections.abc import Generator, Sequence -from contextlib import contextmanager +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import contextmanager, nullcontext from datetime import UTC, datetime import mgclient # ty: ignore[unresolved-import] from loguru import logger +from codebase_rag.config import settings from codebase_rag.types_defs import CursorProtocol, ResultValue from .. import exceptions as ex from .. 
def _apply_memory_limit(query: str, mb: int) -> str:
    """Append a memory-limit clause of ``mb`` megabytes to ``query``.

    The query is returned unchanged when its upper-cased text already contains
    ``CYPHER_MEMORY_LIMIT_TOKEN``. Otherwise trailing whitespace and one
    trailing ``CYPHER_SEMICOLON`` are removed, the formatted
    ``CYPHER_MEMORY_LIMIT_SUFFIX`` is appended, and the statement is terminated
    with ``CYPHER_SEMICOLON``.
    """
    already_bounded = CYPHER_MEMORY_LIMIT_TOKEN in query.upper()
    if already_bounded:
        return query
    statement = query.rstrip().removesuffix(CYPHER_SEMICOLON).rstrip()
    limit_clause = CYPHER_MEMORY_LIMIT_SUFFIX.format(mb=mb)
    return statement + limit_clause + CYPHER_SEMICOLON
    def __enter__(self) -> MemgraphIngestor:
        """Open the primary connection and create the flush thread pool.

        Returns the ingestor itself so it can be bound by ``with ... as``.
        """
        logger.info(ls.MG_CONNECTING.format(host=self._host, port=self._port))
        self.conn = self._create_connection()
        # (H) Pool size comes from settings.FLUSH_THREAD_POOL_SIZE
        self._executor = ThreadPoolExecutor(max_workers=settings.FLUSH_THREAD_POOL_SIZE)
        logger.info(ls.MG_CONNECTED)
        return self
+ try: + self.flush_all() + except Exception as flush_err: + logger.error(ls.MG_FLUSH_ERROR.format(error=flush_err)) + else: + self.flush_all() + finally: + if self._executor: + self._executor.shutdown(wait=True) + self._executor = None + if self.conn: + self.conn.close() + logger.info(ls.MG_DISCONNECTED) + + async def __aenter__(self) -> MemgraphIngestor: + return self.__enter__() + + async def __aexit__( + self, + exc_type: type | None, + exc_val: Exception | None, + exc_tb: types.TracebackType | None, + ) -> None: + self.__exit__(exc_type, exc_val, exc_tb) @contextmanager def _get_cursor(self) -> Generator[CursorProtocol, None, None]: if not self.conn: raise ConnectionError(ex.CONN) - cursor: CursorProtocol | None = None - try: - cursor = self.conn.cursor() - yield cursor - finally: - if cursor: - cursor.close() + with self._conn_lock: + cursor: CursorProtocol | None = None + try: + cursor = self.conn.cursor() + yield cursor + finally: + if cursor: + cursor.close() def _cursor_to_results(self, cursor: CursorProtocol) -> list[ResultRow]: if not cursor.description: @@ -128,12 +198,30 @@ def _execute_query( logger.error(ls.MG_CYPHER_PARAMS.format(params=params)) raise - def _execute_batch(self, query: str, params_list: Sequence[BatchParams]) -> None: - if not self.conn or not params_list: + def _create_connection(self) -> mgclient.Connection: + if self._username is not None: + conn = mgclient.connect( + host=self._host, + port=self._port, + username=self._username, + password=self._password, + ) + else: + conn = mgclient.connect(host=self._host, port=self._port) + conn.autocommit = True + return conn + + def _execute_batch_on( + self, + conn: mgclient.Connection, + query: str, + params_list: Sequence[BatchParams], + ) -> None: + if not params_list: return cursor = None try: - cursor = self.conn.cursor() + cursor = conn.cursor() cursor.execute(wrap_with_unwind(query), BatchWrapper(batch=params_list)) except Exception as e: if ERR_SUBSTR_ALREADY_EXISTS not in 
str(e).lower(): @@ -152,14 +240,17 @@ def _execute_batch(self, query: str, params_list: Sequence[BatchParams]) -> None if cursor: cursor.close() - def _execute_batch_with_return( - self, query: str, params_list: Sequence[BatchParams] + def _execute_batch_with_return_on( + self, + conn: mgclient.Connection, + query: str, + params_list: Sequence[BatchParams], ) -> list[ResultRow]: - if not self.conn or not params_list: + if not params_list: return [] cursor = None try: - cursor = self.conn.cursor() + cursor = conn.cursor() cursor.execute(wrap_with_unwind(query), BatchWrapper(batch=params_list)) return self._cursor_to_results(cursor) except Exception as e: @@ -208,7 +299,7 @@ def ensure_node_batch( ) -> None: self.node_buffer.append((label, properties)) if len(self.node_buffer) >= self.batch_size: - logger.debug(ls.MG_NODE_BUFFER_FLUSH.format(size=self.batch_size)) + logger.debug(ls.MG_NODE_BUFFER_FLUSH, size=self.batch_size) self.flush_nodes() def ensure_relationship_batch( @@ -220,19 +311,82 @@ def ensure_relationship_batch( ) -> None: from_label, from_key, from_val = from_spec to_label, to_key, to_val = to_spec - self.relationship_buffer.append( - ( - (from_label, from_key, from_val), - rel_type, - (to_label, to_key, to_val), - properties, - ) + pattern = (from_label, from_key, rel_type, to_label, to_key) + self._rel_groups[pattern].append( + RelBatchRow(from_val=from_val, to_val=to_val, props=properties or {}) ) - if len(self.relationship_buffer) >= self.batch_size: - logger.debug(ls.MG_REL_BUFFER_FLUSH.format(size=self.batch_size)) + self._rel_count += 1 + if self._rel_count >= self.batch_size: + logger.debug(ls.MG_REL_BUFFER_FLUSH, size=self.batch_size) self.flush_nodes() self.flush_relationships() + def _flush_node_label_group( + self, + label: str, + props_list: list[dict[str, PropertyValue]], + conn: mgclient.Connection | None = None, + ) -> tuple[int, int]: + if not props_list: + return 0, 0 + + id_key = NODE_UNIQUE_CONSTRAINTS.get(label) + if not id_key: + 
logger.warning(ls.MG_NO_CONSTRAINT.format(label=label)) + return 0, len(props_list) + + batch_rows: list[NodeBatchRow] = [] + skipped = 0 + for props in props_list: + if id_key not in props: + logger.warning( + ls.MG_MISSING_PROP.format( + label=label, key=id_key, prop_keys=list(props.keys()) + ) + ) + skipped += 1 + continue + row_props: PropertyDict = {k: v for k, v in props.items() if k != id_key} + batch_rows.append(NodeBatchRow(id=props[id_key], props=row_props)) + + if not batch_rows: + return 0, skipped + + build_query = ( + build_merge_node_query if self._use_merge else build_create_node_query + ) + query = build_query(label, id_key) + target_conn = conn or self.conn + if not target_conn: + logger.warning(ls.MG_NO_CONN_NODES.format(label=label)) + return 0, skipped + len(batch_rows) + lock = self._conn_lock if conn is None else nullcontext() + with lock: + self._execute_batch_on(target_conn, query, batch_rows) + return len(batch_rows), skipped + + def _flush_node_group_with_own_conn( + self, + label: str, + props_list: list[dict[str, PropertyValue]], + ) -> tuple[int, int]: + conn = self._create_connection() + try: + return self._flush_node_label_group(label, props_list, conn=conn) + finally: + conn.close() + + def _flush_rel_group_with_own_conn( + self, + pattern: tuple[str, str, str, str, str], + params_list: list[RelBatchRow], + ) -> tuple[int, int]: + conn = self._create_connection() + try: + return self._flush_rel_pattern_group(pattern, params_list, conn=conn) + finally: + conn.close() + def flush_nodes(self) -> None: if not self.node_buffer: return @@ -243,37 +397,46 @@ def flush_nodes(self) -> None: ) for label, props in self.node_buffer: nodes_by_label[label].append(props) + flushed_total = 0 skipped_total = 0 - for label, props_list in nodes_by_label.items(): - if not props_list: - continue - id_key = NODE_UNIQUE_CONSTRAINTS.get(label) - if not id_key: - logger.warning(ls.MG_NO_CONSTRAINT.format(label=label)) - skipped_total += len(props_list) - 
continue - batch_rows: list[NodeBatchRow] = [] - for props in props_list: - if id_key not in props: - logger.warning( - ls.MG_MISSING_PROP.format(label=label, key=id_key, props=props) - ) - skipped_total += 1 - continue - row_props: PropertyDict = { - k: v for k, v in props.items() if k != id_key - } - batch_rows.append(NodeBatchRow(id=props[id_key], props=row_props)) - - if not batch_rows: - continue + first_error: Exception | None = None - flushed_total += len(batch_rows) + if self._executor and len(nodes_by_label) > 1: + logger.info( + ls.MG_PARALLEL_FLUSH_NODES.format( + count=len(nodes_by_label), + workers=settings.FLUSH_THREAD_POOL_SIZE, + ) + ) + futures = { + self._executor.submit( + self._flush_node_group_with_own_conn, label, props_list + ): label + for label, props_list in nodes_by_label.items() + } + for future in as_completed(futures): + label = futures[future] + try: + flushed, skipped = future.result() + flushed_total += flushed + skipped_total += skipped + except Exception as e: + logger.error(ls.MG_LABEL_FLUSH_ERROR.format(label=label, error=e)) + if first_error is None: + first_error = e + else: + for label, props_list in nodes_by_label.items(): + try: + flushed, skipped = self._flush_node_label_group(label, props_list) + flushed_total += flushed + skipped_total += skipped + except Exception as e: + logger.error(ls.MG_LABEL_FLUSH_ERROR.format(label=label, error=e)) + if first_error is None: + first_error = e - query = build_merge_node_query(label, id_key) - self._execute_batch(query, batch_rows) logger.info( ls.MG_NODES_FLUSHED.format(flushed=flushed_total, total=buffer_size) ) @@ -281,61 +444,114 @@ def flush_nodes(self) -> None: logger.info(ls.MG_NODES_SKIPPED.format(count=skipped_total)) self.node_buffer.clear() - def flush_relationships(self) -> None: - if not self.relationship_buffer: - return + if first_error is not None: + raise first_error - rels_by_pattern: defaultdict[ - tuple[str, str, str, str, str], list[RelBatchRow] - ] = 
defaultdict(list) - for from_node, rel_type, to_node, props in self.relationship_buffer: - pattern = (from_node[0], from_node[1], rel_type, to_node[0], to_node[1]) - rels_by_pattern[pattern].append( - RelBatchRow(from_val=from_node[2], to_val=to_node[2], props=props or {}) + def _flush_rel_pattern_group( + self, + pattern: tuple[str, str, str, str, str], + params_list: list[RelBatchRow], + conn: mgclient.Connection | None = None, + ) -> tuple[int, int]: + from_label, from_key, rel_type, to_label, to_key = pattern + build_rel_query = ( + build_merge_relationship_query + if self._use_merge + else build_create_relationship_query + ) + has_props = any(p[KEY_PROPS] for p in params_list) + query = build_rel_query( + from_label, from_key, rel_type, to_label, to_key, has_props + ) + + target_conn = conn or self.conn + if not target_conn: + logger.warning(ls.MG_NO_CONN_RELS.format(pattern=pattern)) + return len(params_list), 0 + lock = self._conn_lock if conn is None else nullcontext() + with lock: + results = self._execute_batch_with_return_on( + target_conn, query, params_list ) + batch_successful = 0 + for r in results: + created = r.get(KEY_CREATED, 0) + if isinstance(created, int): + batch_successful += created + + if rel_type == REL_TYPE_CALLS: + failed = len(params_list) - batch_successful + if failed > 0: + logger.warning(ls.MG_CALLS_FAILED.format(count=failed)) + for i, sample in enumerate(params_list[:3]): + logger.warning( + ls.MG_CALLS_SAMPLE.format( + index=i + 1, + from_label=from_label, + from_val=sample[KEY_FROM_VAL], + to_label=to_label, + to_val=sample[KEY_TO_VAL], + ) + ) + + return len(params_list), batch_successful + + def flush_relationships(self) -> None: + if not self._rel_count: + return total_attempted = 0 total_successful = 0 - - for pattern, params_list in rels_by_pattern.items(): - from_label, from_key, rel_type, to_label, to_key = pattern - has_props = any(p[KEY_PROPS] for p in params_list) - query = build_merge_relationship_query( - 
from_label, from_key, rel_type, to_label, to_key, has_props + first_error: Exception | None = None + + if self._executor and len(self._rel_groups) > 1: + logger.info( + ls.MG_PARALLEL_FLUSH_RELS.format( + count=len(self._rel_groups), + workers=settings.FLUSH_THREAD_POOL_SIZE, + ) ) - - total_attempted += len(params_list) - results = self._execute_batch_with_return(query, params_list) - batch_successful = 0 - for r in results: - created = r.get(KEY_CREATED, 0) - if isinstance(created, int): - batch_successful += created - total_successful += batch_successful - - if rel_type == REL_TYPE_CALLS: - failed = len(params_list) - batch_successful - if failed > 0: - logger.warning(ls.MG_CALLS_FAILED.format(count=failed)) - for i, sample in enumerate(params_list[:3]): - logger.warning( - ls.MG_CALLS_SAMPLE.format( - index=i + 1, - from_label=from_label, - from_val=sample[KEY_FROM_VAL], - to_label=to_label, - to_val=sample[KEY_TO_VAL], - ) - ) + futures = { + self._executor.submit( + self._flush_rel_group_with_own_conn, pattern, params_list + ): pattern + for pattern, params_list in self._rel_groups.items() + } + for future in as_completed(futures): + pattern = futures[future] + try: + attempted, successful = future.result() + total_attempted += attempted + total_successful += successful + except Exception as e: + logger.error(ls.MG_REL_FLUSH_ERROR.format(pattern=pattern, error=e)) + if first_error is None: + first_error = e + else: + for pattern, params_list in self._rel_groups.items(): + try: + attempted, successful = self._flush_rel_pattern_group( + pattern, params_list + ) + total_attempted += attempted + total_successful += successful + except Exception as e: + logger.error(ls.MG_REL_FLUSH_ERROR.format(pattern=pattern, error=e)) + if first_error is None: + first_error = e logger.info( ls.MG_RELS_FLUSHED.format( - total=len(self.relationship_buffer), + total=self._rel_count, success=total_successful, failed=total_attempted - total_successful, ) ) - 
    def fetch_all(
        self, query: str, params: dict[str, PropertyValue] | None = None
    ) -> list[ResultRow]:
        """Run ``query`` with ``params`` and return the result rows.

        The query is first passed through ``_apply_memory_limit`` with
        ``settings.QUERY_MEMORY_LIMIT_MB``, so the text that is logged and
        executed may differ from the text passed in.
        """
        bounded_query = _apply_memory_limit(query, settings.QUERY_MEMORY_LIMIT_MB)
        logger.debug(ls.MG_FETCH_QUERY, query=bounded_query, params=params)
        return self._execute_query(bounded_query, params)
import logs as ls -from ..config import ModelConfig, settings +from ..config import ModelConfig, load_cgr_instructions, settings from ..prompts import ( CYPHER_SYSTEM_PROMPT, LOCAL_CYPHER_SYSTEM_PROMPT, @@ -26,15 +29,88 @@ def _create_provider_model(config: ModelConfig) -> Model: def _clean_cypher_response(response_text: str) -> str: - query = response_text.strip().replace(cs.CYPHER_BACKTICK, "") - if query.startswith(cs.CYPHER_PREFIX): - query = query[len(cs.CYPHER_PREFIX) :].strip() + query = response_text.strip() + + if "```" in query: + parts = query.split("```") + if len(parts) >= 3: + block = parts[1] + if block.lower().startswith("cypher"): + block = block[len("cypher") :] + query = block.strip() + else: + while "**" in query: + start = query.index("**") + end = query.find("**", start + 2) + if end == -1: + break + after = end + 2 + if after < len(query) and query[after] == ":": + after += 1 + query = query[:start] + query[after:].lstrip() + query = query.replace(cs.CYPHER_BACKTICK, "") + if query.lower().startswith(cs.CYPHER_PREFIX): + query = query[len(cs.CYPHER_PREFIX) :].strip() + if not query.endswith(cs.CYPHER_SEMICOLON): query += cs.CYPHER_SEMICOLON return query +_COMMENT_OR_WS = r"(?:\s|//[^\n]*|/\*.*?\*/)+" + + +def _build_keyword_pattern(keyword: str) -> re.Pattern[str]: + parts = keyword.split() + if len(parts) == 1: + return re.compile(rf"\b{re.escape(parts[0])}\b") + joined = _COMMENT_OR_WS.join(re.escape(p) for p in parts) + return re.compile(rf"\b{joined}\b", re.DOTALL) + + +_CYPHER_DANGEROUS_PATTERNS: list[tuple[str, re.Pattern[str]]] = [ + (kw, _build_keyword_pattern(kw)) for kw in cs.CYPHER_DANGEROUS_KEYWORDS +] + + +_VARLEN_PATTERN = re.compile(r"\[[^\]]*?\*([^\]]*)\]") +_PROCEDURE_CALL_PATTERN = re.compile(r"\bCALL\s+([\w\.]+)", re.IGNORECASE) + + +def _validate_cypher_read_only(query: str) -> None: + upper_query = query.upper() + for keyword, pattern in _CYPHER_DANGEROUS_PATTERNS: + if pattern.search(upper_query): + raise 
def _validate_no_unbounded_paths(query: str) -> None:
    """Reject queries with a variable-length path that has no upper bound.

    Each ``_VARLEN_PATTERN`` match is checked by the text after its ``*``.
    An empty text is unbounded, as is a ``..`` range whose upper part does not
    start with a digit.

    Raises:
        ex.LLMGenerationError: On the first unbounded pattern found.
    """
    for found in _VARLEN_PATTERN.finditer(query):
        bounds = found.group(1).strip()
        _, range_marker, upper_part = bounds.partition("..")
        if not bounds:
            unbounded = True
        elif range_marker:
            # (H) An empty upper part yields "" here, and "".isdigit() is False
            unbounded = not upper_part.lstrip()[:1].isdigit()
        else:
            unbounded = False
        if unbounded:
            raise ex.LLMGenerationError(ex.LLM_UNBOUNDED_PATH.format(query=query))
active_projects=active_projects, + ) + + agent = Agent( model=llm, - system_prompt=build_rag_orchestrator_prompt(tools), + system_prompt=system_prompt, tools=tools, - retries=settings.AGENT_RETRIES, - output_retries=settings.ORCHESTRATOR_OUTPUT_RETRIES, + retries=AgentRetries( + tools=settings.AGENT_RETRIES, + output=settings.ORCHESTRATOR_OUTPUT_RETRIES, + ), output_type=[str, DeferredToolRequests], ) + return agent, system_prompt except Exception as e: raise ex.LLMGenerationError(ex.LLM_INIT_ORCHESTRATOR.format(error=e)) from e diff --git a/codebase_rag/services/protobuf_service.py b/codebase_rag/services/protobuf_service.py index 7c5138c12..50de78eb9 100644 --- a/codebase_rag/services/protobuf_service.py +++ b/codebase_rag/services/protobuf_service.py @@ -22,6 +22,10 @@ cs.NodeLabel.EXTERNAL_PACKAGE: cs.ONEOF_EXTERNAL_PACKAGE, cs.NodeLabel.MODULE_IMPLEMENTATION: cs.ONEOF_MODULE_IMPLEMENTATION, cs.NodeLabel.MODULE_INTERFACE: cs.ONEOF_MODULE_INTERFACE, + cs.NodeLabel.INTERFACE: cs.ONEOF_INTERFACE, + cs.NodeLabel.ENUM: cs.ONEOF_ENUM, + cs.NodeLabel.TYPE: cs.ONEOF_TYPE, + cs.NodeLabel.UNION: cs.ONEOF_UNION, } ONEOF_FIELD_TO_LABEL: dict[str, cs.NodeLabel] = { @@ -32,7 +36,13 @@ NAME_BASED_LABELS = frozenset({cs.NodeLabel.EXTERNAL_PACKAGE, cs.NodeLabel.PROJECT}) +_REL_TYPE_CACHE: dict = {} +_MSG_CLASS_CACHE: dict[str, type | None] = {} + + class ProtobufFileIngestor: + __slots__ = ("output_dir", "_nodes", "_relationships", "split_index") + def __init__(self, output_path: str, split_index: bool = False): self.output_dir = Path(output_path) self._nodes: dict[str, pb.Node] = {} @@ -53,7 +63,11 @@ def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: if not node_id or node_id in self._nodes: return - payload_message_class = getattr(pb, label, None) + if label in _MSG_CLASS_CACHE: + payload_message_class = _MSG_CLASS_CACHE[label] + else: + payload_message_class = getattr(pb, label, None) + _MSG_CLASS_CACHE[label] = payload_message_class if not 
payload_message_class: logger.warning(ls.PROTOBUF_NO_MESSAGE_CLASS.format(label=label)) return @@ -88,42 +102,45 @@ def ensure_relationship_batch( to_spec: tuple[str, str, PropertyValue], properties: PropertyDict | None = None, ) -> None: - rel = pb.Relationship() + if rel_type in _REL_TYPE_CACHE: + rel_type_enum = _REL_TYPE_CACHE[rel_type] + else: + resolved = getattr(pb.Relationship.RelationshipType, rel_type, None) + if resolved is None: + logger.warning(ls.PROTOBUF_UNKNOWN_REL_TYPE.format(rel_type=rel_type)) + resolved = ( + pb.Relationship.RelationshipType.RELATIONSHIP_TYPE_UNSPECIFIED + ) + rel_type_enum = resolved + _REL_TYPE_CACHE[rel_type] = rel_type_enum - rel_type_enum = getattr(pb.Relationship.RelationshipType, rel_type, None) - if rel_type_enum is None: - logger.warning(ls.PROTOBUF_UNKNOWN_REL_TYPE.format(rel_type=rel_type)) - rel_type_enum = ( - pb.Relationship.RelationshipType.RELATIONSHIP_TYPE_UNSPECIFIED - ) - rel.type = rel_type_enum + from_label, _, from_val_raw = from_spec + to_label, _, to_val_raw = to_spec - from_label, _, from_val = from_spec - to_label, _, to_val = to_spec + from_val = str(from_val_raw) if from_val_raw is not None else "" + to_val = str(to_val_raw) if to_val_raw is not None else "" - rel.source_id = str(from_val) - rel.source_label = str(from_label) - rel.target_id = str(to_val) - rel.target_label = str(to_label) + unique_key = (from_val, rel_type_enum, to_val) + if unique_key in self._relationships: + if properties: + self._relationships[unique_key].properties.update(properties) + return - if not rel.source_id.strip() or not rel.target_id.strip(): + if not from_val.strip() or not to_val.strip(): logger.warning( - ls.PROTOBUF_INVALID_REL.format( - source_id=rel.source_id, target_id=rel.target_id - ) + ls.PROTOBUF_INVALID_REL.format(source_id=from_val, target_id=to_val) ) return + rel = pb.Relationship() + rel.type = rel_type_enum + rel.source_id = from_val + rel.source_label = str(from_label) + rel.target_id = to_val + 
rel.target_label = str(to_label) if properties: rel.properties.update(properties) - - unique_key = (rel.source_id, rel.type, rel.target_id) - if unique_key in self._relationships: - if properties: - existing_rel = self._relationships[unique_key] - existing_rel.properties.update(properties) - else: - self._relationships[unique_key] = rel + self._relationships[unique_key] = rel def _flush_joint(self) -> None: index = pb.GraphCodeIndex() diff --git a/codebase_rag/stack/__init__.py b/codebase_rag/stack/__init__.py new file mode 100644 index 000000000..277a85f8a --- /dev/null +++ b/codebase_rag/stack/__init__.py @@ -0,0 +1,21 @@ +from .manager import ( + StackManager, + StackStatus, + daemon_down, + daemon_logs, + daemon_restart, + daemon_status, + daemon_up, + ensure_running, +) + +__all__ = [ + "StackManager", + "StackStatus", + "daemon_down", + "daemon_logs", + "daemon_restart", + "daemon_status", + "daemon_up", + "ensure_running", +] diff --git a/codebase_rag/stack/cli.py b/codebase_rag/stack/cli.py new file mode 100644 index 000000000..5677ae2f0 --- /dev/null +++ b/codebase_rag/stack/cli.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import sys + +import click +from loguru import logger + +from .. 
import cli_help as ch +from .manager import StackError, StackManager + + +@click.group(help=ch.CMD_DAEMON_GROUP) +def cli() -> None: + pass + + +def _print_status(mgr: StackManager) -> None: + status = mgr.status() + click.echo(f"state: {status.state.value}") + click.echo( + f"memgraph: {status.memgraph_endpoint} (reachable={status.memgraph_reachable})" + ) + click.echo( + f"qdrant: {status.qdrant_endpoint} (reachable={status.qdrant_reachable})" + ) + click.echo(f"compose: {status.compose_file}") + + +@cli.command("up", help=ch.CMD_DAEMON_UP) +def up_cmd() -> None: + mgr = StackManager() + try: + mgr.ensure_running() + _print_status(mgr) + except StackError as e: + logger.error(str(e)) + click.secho(str(e), fg="red", err=True) + sys.exit(1) + + +@cli.command("down", help=ch.CMD_DAEMON_DOWN) +def down_cmd() -> None: + mgr = StackManager() + try: + mgr.down() + click.echo("stopped") + except StackError as e: + logger.error(str(e)) + click.secho(str(e), fg="red", err=True) + sys.exit(1) + + +@cli.command("status", help=ch.CMD_DAEMON_STATUS) +def status_cmd() -> None: + _print_status(StackManager()) + + +@cli.command("restart", help=ch.CMD_DAEMON_RESTART) +def restart_cmd() -> None: + mgr = StackManager() + try: + mgr.restart() + mgr.wait_healthy() + _print_status(mgr) + except StackError as e: + logger.error(str(e)) + click.secho(str(e), fg="red", err=True) + sys.exit(1) + + +@cli.command("logs", help=ch.CMD_DAEMON_LOGS) +@click.option("--follow", "-f", is_flag=True, help=ch.HELP_DAEMON_LOGS_FOLLOW) +@click.option("--service", "-s", default=None, help=ch.HELP_DAEMON_LOGS_SERVICE) +def logs_cmd(follow: bool, service: str | None) -> None: + mgr = StackManager() + try: + rc = mgr.logs(service=service, follow=follow) + if rc != 0: + sys.exit(rc) + except StackError as e: + logger.error(str(e)) + click.secho(str(e), fg="red", err=True) + sys.exit(1) diff --git a/codebase_rag/stack/constants.py b/codebase_rag/stack/constants.py new file mode 100644 index 
000000000..bb5d7b0ff --- /dev/null +++ b/codebase_rag/stack/constants.py @@ -0,0 +1,51 @@ +from enum import StrEnum + +COMPOSE_PROJECT_NAME = "cgr" +COMPOSE_FILENAME = "docker-compose.yaml" +STATE_FILENAME = "state.json" + +DOCKER_BIN = "docker" +DOCKER_COMPOSE_SUBCOMMAND = "compose" + +DEFAULT_HEALTH_TIMEOUT_S = 60.0 +DEFAULT_HEALTH_INTERVAL_S = 1.0 +DEFAULT_DOCKER_TIMEOUT_S = 120.0 +DEFAULT_STATUS_TIMEOUT_S = 10.0 + +SERVICE_MEMGRAPH = "memgraph" +SERVICE_QDRANT = "qdrant" +SERVICE_LAB = "lab" + + +class StackState(StrEnum): + RUNNING = "running" + PARTIAL = "partial" + STOPPED = "stopped" + UNKNOWN = "unknown" + + +ERR_DOCKER_NOT_INSTALLED = ( + "docker not found on PATH. Install Docker Desktop or the docker CLI." +) +ERR_DOCKER_DAEMON_DOWN = ( + "docker is installed but the daemon is not responding. Start Docker and retry." +) +ERR_COMPOSE_NOT_AVAILABLE = "`docker compose` plugin not available. Install Docker Desktop v2+ or the compose plugin." +ERR_STACK_START_FAILED = "Failed to bring stack up: {detail}" +ERR_STACK_STOP_FAILED = "Failed to bring stack down: {detail}" +ERR_STACK_NOT_HEALTHY = ( + "Stack started but {service} did not become healthy within {timeout}s." +) +ERR_COMPOSE_FILE_MISSING = "Compose file not found at {path}." + +MSG_USING_COMPOSE_FILE = "Using compose file at {path}" +MSG_STARTING_STACK = "Starting cgr stack..." +MSG_STACK_HEALTHY = "Stack is healthy ({memgraph}, {qdrant})." +MSG_STACK_ALREADY_RUNNING = "Stack already running." +MSG_STOPPING_STACK = "Stopping cgr stack..." +MSG_STACK_STOPPED = "Stack stopped." +MSG_RESTARTING_STACK = "Restarting cgr stack..." +MSG_RENDERING_COMPOSE = "Rendering compose file to {path}" +MSG_WAITING_FOR_HEALTH = "Waiting for {service} on {host}:{port}..." 
+ +PACKAGE_COMPOSE_RELATIVE = "../docker-compose.yaml" diff --git a/codebase_rag/stack/health.py b/codebase_rag/stack/health.py new file mode 100644 index 000000000..b5353374a --- /dev/null +++ b/codebase_rag/stack/health.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import time +import urllib.error +import urllib.request + +import mgclient # ty: ignore[unresolved-import] + +from . import constants as cs + + +def _bolt_reachable(host: str, port: int) -> bool: + try: + conn = mgclient.connect(host=host, port=port) + try: + cursor = conn.cursor() + cursor.execute("RETURN 1") + cursor.fetchall() + finally: + conn.close() + return True + except (mgclient.Error, OSError): + return False + + +def _http_reachable(url: str, timeout: float = 1.5) -> bool: + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: # noqa: S310 + return 200 <= resp.status < 500 + except (urllib.error.URLError, TimeoutError, OSError): + return False + + +def wait_for_memgraph( + host: str, + port: int, + timeout: float = cs.DEFAULT_HEALTH_TIMEOUT_S, + interval: float = cs.DEFAULT_HEALTH_INTERVAL_S, +) -> bool: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if _bolt_reachable(host, port): + return True + time.sleep(interval) + return False + + +def wait_for_qdrant( + host: str, + port: int, + timeout: float = cs.DEFAULT_HEALTH_TIMEOUT_S, + interval: float = cs.DEFAULT_HEALTH_INTERVAL_S, +) -> bool: + url = f"http://{host}:{port}/readyz" + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if _http_reachable(url): + return True + time.sleep(interval) + return False diff --git a/codebase_rag/stack/manager.py b/codebase_rag/stack/manager.py new file mode 100644 index 000000000..95ffc155b --- /dev/null +++ b/codebase_rag/stack/manager.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from loguru import logger + 
+from ..config import settings +from . import constants as cs +from .health import wait_for_memgraph, wait_for_qdrant + + +class StackError(RuntimeError): + pass + + +@dataclass +class StackStatus: + state: cs.StackState + memgraph_reachable: bool + qdrant_reachable: bool + compose_file: Path + memgraph_endpoint: str + qdrant_endpoint: str + + +class StackManager: + def __init__( + self, + home: Path | None = None, + package_compose: Path | None = None, + memgraph_host: str | None = None, + memgraph_port: int | None = None, + qdrant_host: str = "localhost", + qdrant_port: int = 6333, + project_name: str = cs.COMPOSE_PROJECT_NAME, + ) -> None: + self.home = (home or settings.CGR_HOME).expanduser() + self.package_compose = ( + package_compose + or (Path(__file__).resolve().parent / cs.PACKAGE_COMPOSE_RELATIVE).resolve() + ) + self.memgraph_host = memgraph_host or settings.MEMGRAPH_HOST + self.memgraph_port = memgraph_port or settings.MEMGRAPH_PORT + self.qdrant_host = qdrant_host + self.qdrant_port = qdrant_port + self.project_name = project_name + + @property + def compose_file(self) -> Path: + return self.home / cs.COMPOSE_FILENAME + + def ensure_home(self) -> None: + self.home.mkdir(parents=True, exist_ok=True) + + def ensure_compose_file(self) -> Path: + self.ensure_home() + target = self.compose_file + if not target.exists(): + if not self.package_compose.exists(): + raise StackError( + cs.ERR_COMPOSE_FILE_MISSING.format(path=self.package_compose) + ) + logger.info(cs.MSG_RENDERING_COMPOSE.format(path=target)) + shutil.copyfile(self.package_compose, target) + return target + + def check_docker(self) -> None: + if shutil.which(cs.DOCKER_BIN) is None: + raise StackError(cs.ERR_DOCKER_NOT_INSTALLED) + info = subprocess.run( + [cs.DOCKER_BIN, "info"], + capture_output=True, + text=True, + timeout=cs.DEFAULT_STATUS_TIMEOUT_S, + check=False, + ) + if info.returncode != 0: + raise StackError(cs.ERR_DOCKER_DAEMON_DOWN) + compose = subprocess.run( + [cs.DOCKER_BIN, 
cs.DOCKER_COMPOSE_SUBCOMMAND, "version"], + capture_output=True, + text=True, + timeout=cs.DEFAULT_STATUS_TIMEOUT_S, + check=False, + ) + if compose.returncode != 0: + raise StackError(cs.ERR_COMPOSE_NOT_AVAILABLE) + + def _compose_cmd(self, *args: str) -> list[str]: + return [ + cs.DOCKER_BIN, + cs.DOCKER_COMPOSE_SUBCOMMAND, + "-p", + self.project_name, + "-f", + str(self.compose_file), + *args, + ] + + def up(self, timeout: float = cs.DEFAULT_DOCKER_TIMEOUT_S) -> None: + self.check_docker() + self.ensure_compose_file() + logger.info(cs.MSG_STARTING_STACK) + result = subprocess.run( + self._compose_cmd("up", "-d"), + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + if result.returncode != 0: + raise StackError( + cs.ERR_STACK_START_FAILED.format( + detail=result.stderr.strip() or result.stdout.strip() + ) + ) + + def down(self, timeout: float = cs.DEFAULT_DOCKER_TIMEOUT_S) -> None: + if not self.compose_file.exists(): + return + if shutil.which(cs.DOCKER_BIN) is None: + raise StackError(cs.ERR_DOCKER_NOT_INSTALLED) + logger.info(cs.MSG_STOPPING_STACK) + result = subprocess.run( + self._compose_cmd("down"), + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + if result.returncode != 0: + raise StackError( + cs.ERR_STACK_STOP_FAILED.format( + detail=result.stderr.strip() or result.stdout.strip() + ) + ) + + def logs( + self, + service: str | None = None, + follow: bool = False, + tail: int | None = 200, + ) -> int: + if not self.compose_file.exists(): + raise StackError(cs.ERR_COMPOSE_FILE_MISSING.format(path=self.compose_file)) + args: list[str] = ["logs"] + if follow: + args.append("-f") + if tail is not None: + args.extend(["--tail", str(tail)]) + if service: + args.append(service) + completed = subprocess.run(self._compose_cmd(*args), check=False) + return completed.returncode + + def restart(self) -> None: + logger.info(cs.MSG_RESTARTING_STACK) + self.down() + self.up() + + def wait_healthy( + self, + timeout: 
float = cs.DEFAULT_HEALTH_TIMEOUT_S, + ) -> None: + logger.info( + cs.MSG_WAITING_FOR_HEALTH.format( + service=cs.SERVICE_MEMGRAPH, + host=self.memgraph_host, + port=self.memgraph_port, + ) + ) + if not wait_for_memgraph(self.memgraph_host, self.memgraph_port, timeout): + raise StackError( + cs.ERR_STACK_NOT_HEALTHY.format( + service=cs.SERVICE_MEMGRAPH, timeout=timeout + ) + ) + logger.info( + cs.MSG_WAITING_FOR_HEALTH.format( + service=cs.SERVICE_QDRANT, + host=self.qdrant_host, + port=self.qdrant_port, + ) + ) + if not wait_for_qdrant(self.qdrant_host, self.qdrant_port, timeout): + raise StackError( + cs.ERR_STACK_NOT_HEALTHY.format( + service=cs.SERVICE_QDRANT, timeout=timeout + ) + ) + + def status(self) -> StackStatus: + memgraph_ok = wait_for_memgraph( + self.memgraph_host, self.memgraph_port, timeout=0.1, interval=0.0 + ) + qdrant_ok = wait_for_qdrant( + self.qdrant_host, self.qdrant_port, timeout=0.1, interval=0.0 + ) + match (memgraph_ok, qdrant_ok): + case (True, True): + state = cs.StackState.RUNNING + case (False, False): + state = cs.StackState.STOPPED + case _: + state = cs.StackState.PARTIAL + return StackStatus( + state=state, + memgraph_reachable=memgraph_ok, + qdrant_reachable=qdrant_ok, + compose_file=self.compose_file, + memgraph_endpoint=f"{self.memgraph_host}:{self.memgraph_port}", + qdrant_endpoint=f"{self.qdrant_host}:{self.qdrant_port}", + ) + + def ensure_running(self) -> StackStatus: + current = self.status() + if current.state == cs.StackState.RUNNING: + logger.info(cs.MSG_STACK_ALREADY_RUNNING) + return current + self.up() + self.wait_healthy() + final = self.status() + logger.info( + cs.MSG_STACK_HEALTHY.format( + memgraph=final.memgraph_endpoint, + qdrant=final.qdrant_endpoint, + ) + ) + return final + + +def ensure_running() -> StackStatus: + return StackManager().ensure_running() + + +def daemon_up() -> StackStatus: + mgr = StackManager() + mgr.up() + mgr.wait_healthy() + return mgr.status() + + +def daemon_down() -> None: + 
StackManager().down() + + +def daemon_status() -> StackStatus: + return StackManager().status() + + +def daemon_logs(service: str | None = None, follow: bool = False) -> int: + return StackManager().logs(service=service, follow=follow) + + +def daemon_restart() -> StackStatus: + mgr = StackManager() + mgr.restart() + mgr.wait_healthy() + return mgr.status() diff --git a/codebase_rag/tests/conftest.py b/codebase_rag/tests/conftest.py index a22c1ede0..e3a4a19c1 100644 --- a/codebase_rag/tests/conftest.py +++ b/codebase_rag/tests/conftest.py @@ -8,14 +8,13 @@ from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Protocol, Self -from unittest.mock import MagicMock +from unittest.mock import MagicMock, call import pytest from loguru import logger from codebase_rag.graph_updater import GraphUpdater from codebase_rag.parser_loader import load_parsers -from codebase_rag.services.graph_service import MemgraphIngestor if TYPE_CHECKING: pass # ty: ignore[unresolved-import] @@ -89,6 +88,25 @@ def create_mock_node( logger.remove() +@pytest.fixture(autouse=True) +def _disable_stack_autostart() -> Generator[None, None, None]: + from unittest.mock import patch + + with patch("codebase_rag.cli._maybe_start_stack"): + yield + + +@pytest.fixture(autouse=True) +def _isolate_cgr_home( + tmp_path_factory: pytest.TempPathFactory, monkeypatch: pytest.MonkeyPatch +) -> Generator[Path, None, None]: + from codebase_rag.config import settings + + home = tmp_path_factory.mktemp("cgr-home-iso") + monkeypatch.setattr(settings, "CGR_HOME", home) + yield home + + @pytest.fixture def temp_repo() -> Generator[Path, None, None]: """Creates a temporary repository path for a test and cleans up afterward.""" @@ -97,10 +115,44 @@ def temp_repo() -> Generator[Path, None, None]: shutil.rmtree(temp_dir) +class _MockIngestor: + _TRACKED = ( + "fetch_all", + "execute_write", + "ensure_node_batch", + "ensure_relationship_batch", + "flush_all", + ) + + def 
__init__(self) -> None: + self.fetch_all = MagicMock() + self.execute_write = MagicMock() + self.ensure_node_batch = MagicMock() + self.ensure_relationship_batch = MagicMock() + self.flush_all = MagicMock() + self._fallback = MagicMock() + + def reset_mock(self) -> None: + for name in (*self._TRACKED, "_fallback"): + getattr(self, name).reset_mock() + + @property + def method_calls(self) -> list: + result = [] + for name in self._TRACKED: + mock_attr = self.__dict__[name] + for c in mock_attr.call_args_list: + result.append(getattr(call, name)(*c.args, **c.kwargs)) + result.extend(self._fallback.method_calls) + return result + + def __getattr__(self, name: str) -> MagicMock: + return getattr(self._fallback, name) + + @pytest.fixture -def mock_ingestor() -> MagicMock: - """Provides a mocked MemgraphIngestor instance.""" - return MagicMock(spec=MemgraphIngestor) +def mock_ingestor() -> _MockIngestor: + return _MockIngestor() def run_updater( diff --git a/codebase_rag/tests/fuzz_test_parsers.py b/codebase_rag/tests/fuzz_test_parsers.py new file mode 100644 index 000000000..d9a608887 --- /dev/null +++ b/codebase_rag/tests/fuzz_test_parsers.py @@ -0,0 +1,20 @@ +import sys + +import atheris + +from codebase_rag.language_spec import ( + get_language_for_extension, + get_language_spec, +) + + +def fuzz_language_spec(data): + fdp = atheris.FuzzedDataProvider(data) + extension = fdp.ConsumeUnicodeNoSurrogates(64) + get_language_spec(extension) + get_language_for_extension(extension) + + +if __name__ == "__main__": + atheris.Setup(sys.argv, fuzz_language_spec) + atheris.Fuzz() diff --git a/codebase_rag/tests/integration/test_cypher_queries.py b/codebase_rag/tests/integration/test_cypher_queries.py index e01415daf..4e5ee30ba 100644 --- a/codebase_rag/tests/integration/test_cypher_queries.py +++ b/codebase_rag/tests/integration/test_cypher_queries.py @@ -11,11 +11,13 @@ CYPHER_FIND_BY_QUALIFIED_NAME, CYPHER_GET_FUNCTION_SOURCE_LOCATION, build_constraint_query, + 
build_dead_code_query, build_merge_node_query, build_merge_relationship_query, build_nodes_by_ids_query, wrap_with_unwind, ) +from codebase_rag.types_defs import PropertyValue if TYPE_CHECKING: from codebase_rag.services.graph_service import MemgraphIngestor @@ -343,6 +345,292 @@ def test_creates_calls_relationship_with_properties( assert verify[0]["line"] == 42 +class TestBuildDeadCodeQueryUnit: + def test_include_tests_references_test_patterns(self) -> None: + query = build_dead_code_query(include_tests=True) + + assert "$test_patterns" in query + assert "$project_prefix" in query + assert "$root_decorators" in query + assert "$entry_points" in query + assert "is_exported" in query + assert "CALLS*0.." in query + # (H) test functions are roots when tests are included + assert "n.path CONTAINS" in query + + def test_exclude_tests_omits_test_function_roots(self) -> None: + query = build_dead_code_query(include_tests=False) + + # (H) test functions are NOT roots when excluding tests ... + assert "n.path CONTAINS" not in query + # (H) ... but test_patterns still filters test modules out of the + # (H) module-load root clause so test-only code is not kept alive. 
+ assert "$test_patterns" in query + assert "m.path CONTAINS" in query + assert "$project_prefix" in query + + def test_module_load_callees_are_roots(self) -> None: + query = build_dead_code_query(include_tests=False) + + # (H) a function called by a Module node runs at import, so it is a root + assert "Module" in query + assert "[:CALLS]-(" in query + + def test_include_classes_adds_class_candidates(self) -> None: + with_classes = build_dead_code_query(include_tests=False, include_classes=True) + assert "Function|Method|Class" in with_classes + assert "INHERITS" in with_classes + + without_classes = build_dead_code_query( + include_tests=False, include_classes=False + ) + assert "Function|Method|Class" not in without_classes + assert "INHERITS" not in without_classes + + +@pytest.mark.integration +class TestBuildDeadCodeQueryIntegration: + def _seed(self, ingestor: MemgraphIngestor) -> None: + # (H) called -> live; orphan -> dead; handler is a @task root; + # (H) routed is a @app.route root calling routed_callee (decorators are + # (H) stored @-prefixed and dotted, exactly as the parser emits them); + # (H) test_runs is a test root that calls helper (so helper is live) + ingestor._execute_query( + "CREATE " + "(m:Module {qualified_name: 'proj.mod', path: 'proj/mod.py'}), " + "(entry:Function {qualified_name: 'proj.mod.main', name: 'main', " + " start_line: 1, end_line: 3, decorators: [], path: 'proj/mod.py'}), " + "(called:Function {qualified_name: 'proj.mod.called', name: 'called', " + " start_line: 5, end_line: 7, decorators: [], path: 'proj/mod.py'}), " + "(orphan:Function {qualified_name: 'proj.mod.orphan', name: 'orphan', " + " start_line: 9, end_line: 11, decorators: [], path: 'proj/mod.py'}), " + "(handler:Function {qualified_name: 'proj.mod.handler', name: 'handler', " + " start_line: 13, end_line: 15, decorators: ['@task'], path: 'proj/mod.py'}), " + "(routed:Function {qualified_name: 'proj.mod.routed', name: 'routed', " + " start_line: 21, end_line: 23, 
decorators: ['@app.route'], " + " path: 'proj/mod.py'}), " + "(routed_callee:Function {qualified_name: 'proj.mod.routed_callee', " + " name: 'routed_callee', start_line: 25, end_line: 27, decorators: [], " + " path: 'proj/mod.py'}), " + "(helper:Function {qualified_name: 'proj.mod.helper', name: 'helper', " + " start_line: 17, end_line: 19, decorators: [], path: 'proj/mod.py'}), " + "(testfn:Function {qualified_name: 'proj.tests.test_runs', " + " name: 'test_runs', start_line: 1, end_line: 4, decorators: [], " + " path: 'proj/tests/test_mod.py'}), " + "(entry)-[:CALLS]->(called), " + "(routed)-[:CALLS]->(routed_callee), " + "(testfn)-[:CALLS]->(helper)" + ) + + def _params(self, include_tests: bool) -> dict[str, PropertyValue]: # noqa: ARG002 + # (H) test_patterns is always supplied; the query (built per include_tests) + # (H) decides whether it gates test-function roots or test-module filtering. + return { + "project_prefix": "proj.", + "root_decorators": ["task", "route"], + "entry_points": ["proj.mod.main"], + "test_patterns": ["test_", "_test", "conftest", "/tests/"], + } + + def test_reports_only_the_orphan_with_tests_included( + self, memgraph_ingestor: MemgraphIngestor + ) -> None: + self._seed(memgraph_ingestor) + + results = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=True), self._params(True) + ) + + names = {r["qualified_name"] for r in results} + assert names == {"proj.mod.orphan"} + + def test_excluding_tests_reports_orphan_and_test_only_code( + self, memgraph_ingestor: MemgraphIngestor + ) -> None: + self._seed(memgraph_ingestor) + + results = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=False), self._params(False) + ) + + names = {r["qualified_name"] for r in results} + # (H) without test roots, the test fn and its helper are no longer reachable + assert names == { + "proj.mod.orphan", + "proj.tests.test_runs", + "proj.mod.helper", + } + + def test_returns_row_shape(self, memgraph_ingestor: 
MemgraphIngestor) -> None: + self._seed(memgraph_ingestor) + + results = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=True), self._params(True) + ) + + assert len(results) == 1 + row = results[0] + assert row["label"] == "Function" + assert row["name"] == "orphan" + assert row["start_line"] == 9 + assert row["end_line"] == 11 + + def test_test_module_call_is_not_a_root_when_excluding_tests( + self, memgraph_ingestor: MemgraphIngestor + ) -> None: + # (H) a function reached only from a TEST module's top-level call must NOT + # (H) be kept alive when --no-include-tests, else test-only code hides as + # (H) live. The same call DOES keep it live when tests are included. + memgraph_ingestor._execute_query( + "CREATE " + "(tm:Module {qualified_name: 'proj.tests.test_x', " + " path: 'proj/tests/test_x.py'}), " + "(tool:Function {qualified_name: 'proj.mod.tool_only', " + " name: 'tool_only', start_line: 1, end_line: 2, decorators: [], " + " path: 'proj/mod.py'}), " + "(tm)-[:CALLS]->(tool)" + ) + params: dict[str, PropertyValue] = { + "project_prefix": "proj.", + "root_decorators": [], + "entry_points": [], + "test_patterns": ["test_", "_test", "conftest", "/tests/"], + } + + excluded = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=False), params + ) + assert {r["qualified_name"] for r in excluded} == {"proj.mod.tool_only"} + + included = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=True), params + ) + assert {r["qualified_name"] for r in included} == set() + + def test_class_candidates_when_classes_included( + self, memgraph_ingestor: MemgraphIngestor + ) -> None: + # (H) used is a module-load root that instantiates WithInit (INSTANTIATES + # (H) the class plus CALLS its __init__), NoInit (INSTANTIATES only, no + # (H) __init__) and Derived (INSTANTIATES; Derived INHERITS Base, so Base + # (H) is live too). Only DeadClass (and the orphan function) is unreachable. 
+ memgraph_ingestor._execute_query( + "CREATE " + "(m:Module {qualified_name: 'proj.mod', path: 'proj/mod.py'}), " + "(used:Function {qualified_name: 'proj.mod.used', name: 'used', " + " start_line: 1, end_line: 2, decorators: [], path: 'proj/mod.py'}), " + "(orphan_fn:Function {qualified_name: 'proj.mod.orphan_fn', " + " name: 'orphan_fn', start_line: 4, end_line: 5, decorators: [], " + " path: 'proj/mod.py'}), " + "(wi:Class {qualified_name: 'proj.mod.WithInit', name: 'WithInit', " + " start_line: 7, end_line: 9, decorators: [], path: 'proj/mod.py'}), " + "(wii:Method {qualified_name: 'proj.mod.WithInit.__init__', " + " name: '__init__', start_line: 8, end_line: 9, decorators: [], " + " path: 'proj/mod.py'}), " + "(ni:Class {qualified_name: 'proj.mod.NoInit', name: 'NoInit', " + " start_line: 11, end_line: 12, decorators: [], path: 'proj/mod.py'}), " + "(base:Class {qualified_name: 'proj.mod.Base', name: 'Base', " + " start_line: 14, end_line: 15, decorators: [], path: 'proj/mod.py'}), " + "(der:Class {qualified_name: 'proj.mod.Derived', name: 'Derived', " + " start_line: 17, end_line: 18, decorators: [], path: 'proj/mod.py'}), " + "(dead:Class {qualified_name: 'proj.mod.DeadClass', name: 'DeadClass', " + " start_line: 20, end_line: 21, decorators: [], path: 'proj/mod.py'}), " + "(wi)-[:DEFINES_METHOD]->(wii), " + "(der)-[:INHERITS]->(base), " + "(m)-[:CALLS]->(used), " + "(used)-[:INSTANTIATES]->(wi), " + "(used)-[:CALLS]->(wii), " + "(used)-[:INSTANTIATES]->(ni), " + "(used)-[:INSTANTIATES]->(der)" + ) + params: dict[str, PropertyValue] = { + "project_prefix": "proj.", + "root_decorators": [], + "entry_points": [], + "test_patterns": ["test_", "_test", "conftest", "/tests/"], + } + + without_classes = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=False, include_classes=False), params + ) + assert {r["qualified_name"] for r in without_classes} == {"proj.mod.orphan_fn"} + + with_classes = memgraph_ingestor._execute_query( + 
build_dead_code_query(include_tests=False, include_classes=True), params + ) + assert {r["qualified_name"] for r in with_classes} == { + "proj.mod.orphan_fn", + "proj.mod.DeadClass", + } + + def test_subclass_only_base_is_reported_when_subclass_is_unreachable( + self, memgraph_ingestor: MemgraphIngestor + ) -> None: + # (H) Base is subclassed by Derived, but nothing instantiates Derived, so + # (H) the traversal never reaches Derived and therefore never reaches Base + # (H) via INHERITS. The whole dead cluster (both classes) is reported: a + # (H) base kept alive only by an unreachable subclass is itself dead. + # (H) Live is present purely so the query has a reachable root to anchor. + memgraph_ingestor._execute_query( + "CREATE " + "(m:Module {qualified_name: 'proj.mod', path: 'proj/mod.py'}), " + "(live:Class {qualified_name: 'proj.mod.Live', name: 'Live', " + " start_line: 1, end_line: 2, decorators: [], path: 'proj/mod.py'}), " + "(base:Class {qualified_name: 'proj.mod.Base', name: 'Base', " + " start_line: 4, end_line: 5, decorators: [], path: 'proj/mod.py'}), " + "(der:Class {qualified_name: 'proj.mod.Derived', name: 'Derived', " + " start_line: 7, end_line: 8, decorators: [], path: 'proj/mod.py'}), " + "(der)-[:INHERITS]->(base), " + "(m)-[:INSTANTIATES]->(live)" + ) + params: dict[str, PropertyValue] = { + "project_prefix": "proj.", + "root_decorators": [], + "entry_points": [], + "test_patterns": ["test_", "_test", "conftest", "/tests/"], + } + + with_classes = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=False, include_classes=True), params + ) + assert {r["qualified_name"] for r in with_classes} == { + "proj.mod.Base", + "proj.mod.Derived", + } + + def test_module_load_callee_is_a_root( + self, memgraph_ingestor: MemgraphIngestor + ) -> None: + # (H) a function called by a Module (e.g. 
`if __name__ == "__main__": main()` + # (H) or a bare decorator) runs at import, so it and its callees are live even + # (H) with no entry-point/decorator/export root. + memgraph_ingestor._execute_query( + "CREATE " + "(m:Module {qualified_name: 'proj.mod', path: 'proj/mod.py'}), " + "(main:Function {qualified_name: 'proj.mod.main', name: 'main', " + " start_line: 1, end_line: 2, decorators: [], path: 'proj/mod.py'}), " + "(used:Function {qualified_name: 'proj.mod.used', name: 'used', " + " start_line: 4, end_line: 5, decorators: [], path: 'proj/mod.py'}), " + "(orphan:Function {qualified_name: 'proj.mod.orphan', name: 'orphan', " + " start_line: 7, end_line: 8, decorators: [], path: 'proj/mod.py'}), " + "(m)-[:CALLS]->(main), " + "(main)-[:CALLS]->(used)" + ) + params: dict[str, PropertyValue] = { + "project_prefix": "proj.", + "root_decorators": [], + "entry_points": [], + "test_patterns": ["test_", "_test", "conftest", "/tests/"], + } + + results = memgraph_ingestor._execute_query( + build_dead_code_query(include_tests=False), params + ) + names = {r["qualified_name"] for r in results} + + assert names == {"proj.mod.orphan"} + + @pytest.mark.integration class TestBuildNodesByIdsQueryIntegration: def test_fetches_nodes_by_ids(self, memgraph_ingestor: MemgraphIngestor) -> None: diff --git a/codebase_rag/tests/integration/test_document_analyzer_integration.py b/codebase_rag/tests/integration/test_document_analyzer_integration.py deleted file mode 100644 index b1cc7f9fb..000000000 --- a/codebase_rag/tests/integration/test_document_analyzer_integration.py +++ /dev/null @@ -1,219 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from codebase_rag.constants import Provider -from codebase_rag.tools.document_analyzer import ( - DocumentAnalyzer, - create_document_analyzer_tool, -) - -pytestmark = [pytest.mark.integration] - - -@pytest.fixture -def temp_test_repo(tmp_path: Path) -> Path: 
- (tmp_path / "readme.txt").write_text( - "This is a README file.\nIt contains important information.", - encoding="utf-8", - ) - (tmp_path / "code.py").write_text( - "def hello():\n return 'Hello, World!'", - encoding="utf-8", - ) - (tmp_path / "data.json").write_text( - '{"name": "test", "value": 42}', - encoding="utf-8", - ) - subdir = tmp_path / "docs" - subdir.mkdir() - (subdir / "manual.txt").write_text( - "User Manual\n\n1. Getting Started\n2. Configuration", - encoding="utf-8", - ) - return tmp_path - - -@pytest.fixture -def mock_settings() -> MagicMock: - settings = MagicMock() - settings.active_orchestrator_config.provider = Provider.GOOGLE - settings.active_orchestrator_config.provider_type = "api" - settings.active_orchestrator_config.api_key = "test-api-key" - settings.active_orchestrator_config.model_id = "gemini-1.5-flash" - return settings - - -@pytest.fixture -def mock_genai_client() -> MagicMock: - client = MagicMock() - response = MagicMock() - response.text = "This is an analysis of the document." 
- client.models.generate_content.return_value = response - return client - - -@pytest.fixture -def analyzer_with_mock( - temp_test_repo: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, -) -> DocumentAnalyzer: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - return DocumentAnalyzer(str(temp_test_repo)) - - -class TestDocumentAnalyzerIntegration: - def test_analyze_text_file( - self, - analyzer_with_mock: DocumentAnalyzer, - mock_genai_client: MagicMock, - ) -> None: - result = analyzer_with_mock.analyze("readme.txt", "What is this file about?") - assert "analysis" in result.lower() - mock_genai_client.models.generate_content.assert_called_once() - - def test_analyze_code_file( - self, - analyzer_with_mock: DocumentAnalyzer, - mock_genai_client: MagicMock, - ) -> None: - result = analyzer_with_mock.analyze("code.py", "What does this code do?") - assert "analysis" in result.lower() - - def test_analyze_json_file( - self, - analyzer_with_mock: DocumentAnalyzer, - mock_genai_client: MagicMock, - ) -> None: - result = analyzer_with_mock.analyze("data.json", "What data is in this file?") - assert "analysis" in result.lower() - - def test_analyze_nested_file( - self, - analyzer_with_mock: DocumentAnalyzer, - mock_genai_client: MagicMock, - ) -> None: - result = analyzer_with_mock.analyze("docs/manual.txt", "Summarize this manual") - assert "analysis" in result.lower() - - def test_analyze_nonexistent_file( - self, - analyzer_with_mock: DocumentAnalyzer, - ) -> None: - result = analyzer_with_mock.analyze("nonexistent.txt", "What is this?") - assert "error" in result.lower() - assert "not found" in result.lower() - - def test_analyze_path_traversal_blocked( - self, - analyzer_with_mock: DocumentAnalyzer, - ) -> None: - result = analyzer_with_mock.analyze("../../../etc/passwd", "What is this?") - assert "security" in 
result.lower() - - -class TestDocumentAnalyzerToolIntegration: - def test_tool_analyzes_file( - self, - temp_test_repo: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_test_repo)) - tool = create_document_analyzer_tool(analyzer) - result = tool.function( - file_path="readme.txt", - question="What is in this file?", - ) - assert "analysis" in result.lower() - - def test_tool_handles_error( - self, - temp_test_repo: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_test_repo)) - tool = create_document_analyzer_tool(analyzer) - result = tool.function( - file_path="missing.txt", - question="What is this?", - ) - assert "error" in result.lower() - - -class TestDocumentAnalyzerWithDifferentProviders: - def test_unsupported_provider_returns_error( - self, - temp_test_repo: Path, - ) -> None: - mock_settings = MagicMock() - mock_settings.active_orchestrator_config.provider = "anthropic" - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - analyzer = DocumentAnalyzer(str(temp_test_repo)) - result = analyzer.analyze("readme.txt", "What is this?") - assert "not supported" in result.lower() - - -class TestDocumentAnalyzerResponseHandling: - def test_handles_response_with_candidates( - self, - temp_test_repo: Path, - mock_settings: MagicMock, - ) -> None: - mock_client = MagicMock() - response = MagicMock() - response.text = None - candidate = MagicMock() - part = MagicMock() - part.text = "Analysis from candidate" - candidate.content.parts = [part] - 
response.candidates = [candidate] - mock_client.models.generate_content.return_value = response - - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_client, - ): - analyzer = DocumentAnalyzer(str(temp_test_repo)) - result = analyzer.analyze("readme.txt", "What is this?") - assert result == "Analysis from candidate" - - def test_handles_empty_response( - self, - temp_test_repo: Path, - mock_settings: MagicMock, - ) -> None: - mock_client = MagicMock() - response = MagicMock() - response.text = None - response.candidates = None - mock_client.models.generate_content.return_value = response - - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_client, - ): - analyzer = DocumentAnalyzer(str(temp_test_repo)) - result = analyzer.analyze("readme.txt", "What is this?") - assert "no" in result.lower() and "content" in result.lower() diff --git a/codebase_rag/tests/integration/test_incremental_external_prune_e2e.py b/codebase_rag/tests/integration/test_incremental_external_prune_e2e.py new file mode 100644 index 000000000..2a392d98c --- /dev/null +++ b/codebase_rag/tests/integration/test_incremental_external_prune_e2e.py @@ -0,0 +1,57 @@ +# (H) End-to-end (real Memgraph) verification that an incremental rebuild prunes +# (H) external import-target Module nodes that are no longer imported by anyone, +# (H) e.g. an imported name renamed on a subsequent index. 
+from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers + +if TYPE_CHECKING: + from codebase_rag.services.graph_service import MemgraphIngestor + +pytestmark = [pytest.mark.integration] + + +def _index(ingestor: MemgraphIngestor, project_path: Path, force: bool) -> None: + parsers, queries = load_parsers() + GraphUpdater( + ingestor=ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + project_name="proj", + ).run(force=force) + + +def _external_module_qns(ingestor: MemgraphIngestor) -> set[str]: + rows = ingestor.fetch_all( + "MATCH (m:Module) WHERE m.is_external = true RETURN m.qualified_name AS qn" + ) + return {r["qn"] for r in rows if r.get("qn")} + + +def test_incremental_rebuild_prunes_orphaned_external_module( + memgraph_ingestor: MemgraphIngestor, tmp_path: Path +) -> None: + project = tmp_path / "proj" + project.mkdir() + (project / "__init__.py").touch() + client = project / "client.py" + + client.write_text("from extlib import old_thing\n\nuse = old_thing\n") + _index(memgraph_ingestor, project, force=True) + + before = _external_module_qns(memgraph_ingestor) + assert any(qn.endswith(".old_thing") for qn in before), before + + client.write_text("from extlib import new_thing\n\nuse = new_thing\n") + _index(memgraph_ingestor, project, force=False) + + after = _external_module_qns(memgraph_ingestor) + assert not any(qn.endswith(".old_thing") for qn in after), after + assert any(qn.endswith(".new_thing") for qn in after), after diff --git a/codebase_rag/tests/integration/test_node_label_e2e.py b/codebase_rag/tests/integration/test_node_label_e2e.py index f61792588..4fb10083a 100644 --- a/codebase_rag/tests/integration/test_node_label_e2e.py +++ b/codebase_rag/tests/integration/test_node_label_e2e.py @@ -16,8 +16,6 @@ SKIP_GO = "Go is in development status" SKIP_SCALA = 
"Scala is in development status" -SKIP_CSHARP = "C# is in development status" -SKIP_PHP = "PHP is in development status" PYTHON_CODE = """\ @@ -233,29 +231,6 @@ class MyCppClass { } """ -CSHARP_CODE = """\ -public class MyCSharpClass { - private int value; - - public MyCSharpClass() { - this.value = 0; - } - - public int GetValue() { - return this.value; - } -} - -public interface IMyInterface { - void DoSomething(); -} - -public enum Status { - Active, - Inactive -} -""" - PHP_CODE = """\ Path: return project -@pytest.fixture -def csharp_project(tmp_path: Path) -> Path: - project = tmp_path / "csharp_project" - project.mkdir() - (project / "Example.cs").write_text(CSHARP_CODE, encoding="utf-8") - return project - - @pytest.fixture def php_project(tmp_path: Path) -> Path: project = tmp_path / "php_project" @@ -617,29 +584,29 @@ def test_rust_creates_function_nodes( func_names = {n["name"] for n in functions} assert "standalone_fn" in func_names - def test_rust_creates_class_nodes_for_enums( + def test_rust_creates_enum_nodes_for_enums( self, memgraph_ingestor: MemgraphIngestor, rust_project: Path ) -> None: index_project(memgraph_ingestor, rust_project) labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.CLASS.value in labels + assert NodeLabel.ENUM.value in labels - classes = get_nodes_by_label(memgraph_ingestor, NodeLabel.CLASS.value) - class_names = {n["name"] for n in classes} - assert "Status" in class_names + enums = get_nodes_by_label(memgraph_ingestor, NodeLabel.ENUM.value) + enum_names = {n["name"] for n in enums} + assert "Status" in enum_names - def test_rust_creates_class_nodes_for_traits( + def test_rust_creates_interface_nodes_for_traits( self, memgraph_ingestor: MemgraphIngestor, rust_project: Path ) -> None: index_project(memgraph_ingestor, rust_project) labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.CLASS.value in labels + assert NodeLabel.INTERFACE.value in labels - classes = get_nodes_by_label(memgraph_ingestor, 
NodeLabel.CLASS.value) - class_names = {n["name"] for n in classes} - assert "MyTrait" in class_names + interfaces = get_nodes_by_label(memgraph_ingestor, NodeLabel.INTERFACE.value) + interface_names = {n["name"] for n in interfaces} + assert "MyTrait" in interface_names @pytest.mark.skip(reason=SKIP_GO) @@ -825,46 +792,6 @@ def test_cpp_creates_module_implementation_nodes( assert "mymodule_impl" in module_names -@pytest.mark.skip(reason=SKIP_CSHARP) -class TestCSharpNodeLabels: - def test_csharp_creates_class_nodes( - self, memgraph_ingestor: MemgraphIngestor, csharp_project: Path - ) -> None: - index_project(memgraph_ingestor, csharp_project) - - labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.CLASS.value in labels - - classes = get_nodes_by_label(memgraph_ingestor, NodeLabel.CLASS.value) - class_names = {n["name"] for n in classes} - assert "MyCSharpClass" in class_names - - def test_csharp_creates_interface_nodes( - self, memgraph_ingestor: MemgraphIngestor, csharp_project: Path - ) -> None: - index_project(memgraph_ingestor, csharp_project) - - labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.INTERFACE.value in labels - - interfaces = get_nodes_by_label(memgraph_ingestor, NodeLabel.INTERFACE.value) - interface_names = {n["name"] for n in interfaces} - assert "IMyInterface" in interface_names - - def test_csharp_creates_enum_nodes( - self, memgraph_ingestor: MemgraphIngestor, csharp_project: Path - ) -> None: - index_project(memgraph_ingestor, csharp_project) - - labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.ENUM.value in labels - - enums = get_nodes_by_label(memgraph_ingestor, NodeLabel.ENUM.value) - enum_names = {n["name"] for n in enums} - assert "Status" in enum_names - - -@pytest.mark.skip(reason=SKIP_PHP) class TestPhpNodeLabels: def test_php_creates_class_nodes( self, memgraph_ingestor: MemgraphIngestor, php_project: Path @@ -938,8 +865,7 @@ def test_lua_creates_function_nodes( ("scala_project", SKIP_SCALA), 
("java_project", None), ("cpp_project", None), - ("csharp_project", SKIP_CSHARP), - ("php_project", SKIP_PHP), + ("php_project", None), ("lua_project", None), ] diff --git a/codebase_rag/tests/integration/test_shell_command_integration.py b/codebase_rag/tests/integration/test_shell_command_integration.py index c5fda3f68..47391b6c0 100644 --- a/codebase_rag/tests/integration/test_shell_command_integration.py +++ b/codebase_rag/tests/integration/test_shell_command_integration.py @@ -1,5 +1,6 @@ from __future__ import annotations +import shutil from pathlib import Path from unittest.mock import MagicMock @@ -11,6 +12,8 @@ create_shell_command_tool, ) +_HAS_RG = shutil.which("rg") is not None + pytestmark = [pytest.mark.anyio, pytest.mark.integration] @@ -112,6 +115,7 @@ async def test_rm_removes_file( assert result.return_code == 0 assert not (temp_test_repo / "file2.py").exists() + @pytest.mark.skipif(not _HAS_RG, reason="rg (ripgrep) not installed") async def test_rg_searches_content(self, shell_commander: ShellCommander) -> None: result = await shell_commander.execute("rg hello file2.py") assert "hello" in result.stdout or result.return_code == 0 @@ -199,6 +203,7 @@ async def test_ls_pipe_head(self, shell_commander: ShellCommander) -> None: lines = result.stdout.strip().split("\n") assert len(lines) <= 2 + @pytest.mark.skipif(not _HAS_RG, reason="rg (ripgrep) not installed") async def test_cat_pipe_rg( self, shell_commander: ShellCommander, temp_test_repo: Path ) -> None: @@ -217,6 +222,7 @@ async def test_echo_pipe_wc(self, shell_commander: ShellCommander) -> None: assert result.return_code == 0 assert "3" in result.stdout + @pytest.mark.skipif(not _HAS_RG, reason="rg (ripgrep) not installed") async def test_find_pipe_rg_pipe_wc(self, shell_commander: ShellCommander) -> None: result = await shell_commander.execute("find . 
-name '*.py' | rg py | wc -l") assert result.return_code == 0 diff --git a/codebase_rag/tests/integration/test_tool_calling.py b/codebase_rag/tests/integration/test_tool_calling.py index 0d7c14aaa..15c524275 100644 --- a/codebase_rag/tests/integration/test_tool_calling.py +++ b/codebase_rag/tests/integration/test_tool_calling.py @@ -76,10 +76,17 @@ def log_message_history(messages: list[ModelMessage], label: str) -> None: async def run_agent_test( agent: Agent, prompt: str, tracker: ToolCallTracker, label: str ) -> tuple[list[str], list[str]]: + from pydantic_ai.exceptions import ModelHTTPError + tracker.clear() logger.info(f"\n{'#' * 60}\nRunning: {label}\nPrompt: {prompt}\n{'#' * 60}") - result = await agent.run(prompt) + try: + result = await agent.run(prompt) + except ModelHTTPError as e: + if e.status_code in (401, 403): + pytest.skip(f"Live API rejected credentials ({e.status_code}); skipping.") + raise messages = result.all_messages() log_message_history(messages, label) @@ -107,12 +114,30 @@ def tracking_tools(tracker: ToolCallTracker) -> list[Tool]: return create_tracking_tools(tracker) +def _api_key_configured() -> bool: + from codebase_rag.config import settings + + config = settings.active_orchestrator_config + key = config.api_key + if not key or not key.strip(): + return False + if key.startswith("op://"): + return False + return True + + @pytest.fixture(scope="module") def agent(tracking_tools: list[Tool]) -> Agent: + if not _api_key_configured(): + pytest.skip( + "Live orchestrator API key not resolved " + "(unset or unresolved op:// reference); skipping live API integration." 
+ ) try: - return create_rag_orchestrator(tracking_tools) + rag_agent, _ = create_rag_orchestrator(tracking_tools) + return rag_agent except Exception as e: - pytest.skip(f"Ollama server not available: {e}") + pytest.skip(f"Orchestrator unavailable: {e}") PARALLEL_PROMPT = """Execute ALL of these tasks in parallel, not sequentially: diff --git a/codebase_rag/tests/test_absolute_path.py b/codebase_rag/tests/test_absolute_path.py new file mode 100644 index 000000000..ede90839e --- /dev/null +++ b/codebase_rag/tests/test_absolute_path.py @@ -0,0 +1,317 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_nodes, run_updater + +TS_CODE = ( + "interface Greeter {\n" + " greet(): string;\n" + "}\n\n" + "enum Direction {\n" + " Up = 'UP',\n" + " Down = 'DOWN',\n" + "}\n\n" + "class MyGreeter implements Greeter {\n" + " greet(): string { return 'hi'; }\n" + "}\n" +) + +CPP_MODULE_INTERFACE = "export module mymod;\nexport int add(int a, int b);\n" + +CPP_MODULE_IMPL = "module mymod;\nint add(int a, int b) { return a + b; }\n" + + +@pytest.fixture(scope="module") +def parsers_and_queries() -> tuple: + return load_parsers() + + +@pytest.fixture +def python_project(temp_repo: Path) -> Path: + project_path = temp_repo / "abs_path_test" + project_path.mkdir() + + pkg_dir = project_path / "mypkg" + pkg_dir.mkdir() + (pkg_dir / "__init__.py").write_text("") + + (pkg_dir / "mymodule.py").write_text( + "class MyClass:\n" + " def my_method(self):\n" + " pass\n" + "\n" + "def my_function():\n" + " pass\n" + ) + + misc_dir = project_path / "misc" + misc_dir.mkdir() + (misc_dir / "notes.txt").write_text("not a package") + + (project_path / "standalone.py").write_text("def standalone_func():\n pass\n") + + return project_path + + +class TestAbsolutePathOnNodes: + def test_file_nodes_have_absolute_path( + self, + 
python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + assert len(file_nodes) > 0 + for node_call in file_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + assert abs_path == Path(abs_path).resolve().as_posix() + + def test_module_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + internal_modules = [c for c in module_nodes if not c[0][1].get("is_external")] + assert len(internal_modules) > 0 + for node_call in internal_modules: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_package_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + assert len(package_nodes) > 0 + for node_call in package_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_function_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in 
parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + func_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FUNCTION) + assert len(func_nodes) > 0 + for node_call in func_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert cs.KEY_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_class_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + class_nodes = get_nodes(mock_ingestor, cs.NodeLabel.CLASS) + assert len(class_nodes) > 0 + for node_call in class_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert cs.KEY_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_method_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + method_nodes = get_nodes(mock_ingestor, cs.NodeLabel.METHOD) + assert len(method_nodes) > 0 + for node_call in method_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert cs.KEY_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_folder_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + folder_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FOLDER) + assert 
len(folder_nodes) > 0 + for node_call in folder_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_absolute_path_matches_resolved_file( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + mymodule_nodes = [ + c for c in module_nodes if c[0][1].get(cs.KEY_NAME) == "mymodule.py" + ] + assert len(mymodule_nodes) == 1 + props = mymodule_nodes[0][0][1] + expected = (python_project / "mypkg" / "mymodule.py").resolve().as_posix() + assert props[cs.KEY_ABSOLUTE_PATH] == expected + + def test_absolute_path_is_posix_format( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + for node_call in file_nodes: + abs_path = node_call[0][1][cs.KEY_ABSOLUTE_PATH] + assert "\\" not in abs_path + + def test_project_node_has_no_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + project_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PROJECT) + assert len(project_nodes) > 0 + for node_call in project_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH not in props + + +@pytest.fixture +def ts_project(temp_repo: Path) -> Path: + project_path = temp_repo / "ts_abs_test" + project_path.mkdir() + (project_path / 
"types.ts").write_text(TS_CODE) + return project_path + + +class TestTypeScriptAbsolutePath: + def test_interface_nodes_have_absolute_path( + self, + ts_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.TS not in parsers_and_queries[0]: + pytest.skip("TypeScript parser not available") + run_updater(ts_project, mock_ingestor) + interface_nodes = get_nodes(mock_ingestor, cs.NodeLabel.INTERFACE) + assert len(interface_nodes) > 0 + for node_call in interface_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() + + def test_enum_nodes_have_absolute_path( + self, + ts_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.TS not in parsers_and_queries[0]: + pytest.skip("TypeScript parser not available") + run_updater(ts_project, mock_ingestor) + enum_nodes = get_nodes(mock_ingestor, cs.NodeLabel.ENUM) + assert len(enum_nodes) > 0 + for node_call in enum_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() + + +@pytest.fixture +def cpp_module_project(temp_repo: Path) -> Path: + project_path = temp_repo / "cpp_abs_test" + project_path.mkdir() + (project_path / "mymod.cppm").write_text(CPP_MODULE_INTERFACE) + (project_path / "mymod_impl.cpp").write_text(CPP_MODULE_IMPL) + return project_path + + +class TestCppModuleAbsolutePath: + def test_module_interface_nodes_have_absolute_path( + self, + cpp_module_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.CPP not in parsers_and_queries[0]: + pytest.skip("C++ parser not available") + run_updater(cpp_module_project, mock_ingestor) + mi_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE_INTERFACE) + if len(mi_nodes) == 0: + pytest.skip("No ModuleInterface nodes produced") + for node_call in mi_nodes: + 
props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() + + def test_module_implementation_nodes_have_absolute_path( + self, + cpp_module_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.CPP not in parsers_and_queries[0]: + pytest.skip("C++ parser not available") + run_updater(cpp_module_project, mock_ingestor) + mi_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE_IMPLEMENTATION) + if len(mi_nodes) == 0: + pytest.skip("No ModuleImplementation nodes produced") + for node_call in mi_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() diff --git a/codebase_rag/tests/test_abstract_method_override_resolution.py b/codebase_rag/tests/test_abstract_method_override_resolution.py new file mode 100644 index 000000000..582496d24 --- /dev/null +++ b/codebase_rag/tests/test_abstract_method_override_resolution.py @@ -0,0 +1,106 @@ +# (H) L3 finding from the evals/ harness: a mixin declares an @abstractmethod stub +# (H) for a method a sibling mixin implements; self.method() dispatches to the +# (H) concrete sibling at runtime. cgr's ambiguous-name tiebreak preferred the +# (H) same-module abstract stub by import distance. A concrete implementation must +# (H) win over an abstract stub of the same name. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "pkg" + +READER_SRC = """from abc import abstractmethod + + +class ReaderMixin: + @abstractmethod + def parse(self) -> str: ... 
+ + def read(self) -> str: + return self.parse() +""" + +PARSER_SRC = """class ParserMixin: + def parse(self) -> str: + return "parsed" +""" + +ENGINE_SRC = """from pkg.reader import ReaderMixin +from pkg.parser import ParserMixin + + +class Engine(ReaderMixin, ParserMixin): + pass +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + pkg = tmp_path / "pkg" + pkg.mkdir() + (pkg / "__init__.py").write_text("") + (pkg / "reader.py").write_text(READER_SRC) + (pkg / "parser.py").write_text(PARSER_SRC) + (pkg / "engine.py").write_text(ENGINE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=pkg, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestAbstractMethodOverrideResolution: + def test_self_call_resolves_to_concrete_sibling_not_abstract_stub( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "pkg.reader.ReaderMixin.read", + "pkg.parser.ParserMixin.parse", + ) in calls, calls + + def test_abstract_stub_is_not_the_call_target(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + 
"pkg.reader.ReaderMixin.read", + "pkg.reader.ReaderMixin.parse", + ) not in calls, calls diff --git a/codebase_rag/tests/test_anthropic_token_counter.py b/codebase_rag/tests/test_anthropic_token_counter.py new file mode 100644 index 000000000..43ff172a1 --- /dev/null +++ b/codebase_rag/tests/test_anthropic_token_counter.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from pydantic_ai.messages import ( + ModelRequest, + ModelResponse, + RetryPromptPart, + SystemPromptPart, + ToolCallPart, +) + +from codebase_rag.services.anthropic_token_counter import ( + _to_anthropic_payload, + count_anthropic_context, +) + + +def _fake_post_returning(input_tokens: int) -> tuple[AsyncMock, MagicMock]: + fake_response = MagicMock() + fake_response.status_code = 200 + fake_response.json.return_value = {"input_tokens": input_tokens} + fake_post = AsyncMock(return_value=fake_response) + return fake_post, fake_response + + +@pytest.mark.asyncio +async def test_returns_zero_when_no_messages_and_no_system_prompt() -> None: + with patch("httpx.AsyncClient") as mock_client: + result = await count_anthropic_context( + api_key="k", model_id="claude-opus-4-7", messages=[] + ) + + assert result == 0 + mock_client.assert_not_called() + + +@pytest.mark.asyncio +async def test_injects_placeholder_when_only_system_prompt_present() -> None: + fake_post, _ = _fake_post_returning(input_tokens=42_000) + mock_client_instance = MagicMock() + mock_client_instance.__aenter__ = AsyncMock(return_value=mock_client_instance) + mock_client_instance.__aexit__ = AsyncMock(return_value=None) + mock_client_instance.post = fake_post + + messages = [ + ModelRequest(parts=[SystemPromptPart(content="GIANT SYSTEM PROMPT BODY")]) + ] + + with patch("httpx.AsyncClient", return_value=mock_client_instance): + result = await count_anthropic_context( + api_key="k", model_id="claude-opus-4-7", messages=messages + ) + + 
assert result == 42_000 + payload: dict[str, Any] = fake_post.call_args.kwargs["json"] + assert payload["system"] == "GIANT SYSTEM PROMPT BODY" + assert payload["messages"] + assert payload["messages"][0]["role"] == "user" + placeholder_text = payload["messages"][0]["content"][0]["text"] + assert placeholder_text.strip(), "placeholder must be non-whitespace" + + +def test_retry_prompt_with_tool_name_becomes_tool_result_error_block() -> None: + tool_call_id = "toolu_test123" + messages = [ + ModelResponse( + parts=[ + ToolCallPart( + tool_name="semantic_search", + args={"query": "x"}, + tool_call_id=tool_call_id, + ) + ] + ), + ModelRequest( + parts=[ + RetryPromptPart( + content="bad args", + tool_name="semantic_search", + tool_call_id=tool_call_id, + ) + ] + ), + ] + + _, anthropic_messages = _to_anthropic_payload(messages) + + assert len(anthropic_messages) == 2 + assistant = anthropic_messages[0] + user = anthropic_messages[1] + assert assistant["role"] == "assistant" + assert assistant["content"][0]["type"] == "tool_use" + assert assistant["content"][0]["id"] == tool_call_id + assert user["role"] == "user" + assert user["content"][0]["type"] == "tool_result" + assert user["content"][0]["tool_use_id"] == tool_call_id + assert user["content"][0]["is_error"] is True + + +def test_retry_prompt_without_tool_name_becomes_text_block() -> None: + messages = [ + ModelRequest(parts=[RetryPromptPart(content="please retry")]), + ] + + _, anthropic_messages = _to_anthropic_payload(messages) + + assert len(anthropic_messages) == 1 + assert anthropic_messages[0]["role"] == "user" + assert anthropic_messages[0]["content"][0]["type"] == "text" + assert "please retry" in anthropic_messages[0]["content"][0]["text"] diff --git a/codebase_rag/tests/test_c_language.py b/codebase_rag/tests/test_c_language.py new file mode 100644 index 000000000..e8253c6be --- /dev/null +++ b/codebase_rag/tests/test_c_language.py @@ -0,0 +1,371 @@ +from pathlib import Path +from unittest.mock import 
MagicMock + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import ( + get_node_names, + get_nodes, + get_relationships, + run_updater, +) + + +@pytest.fixture +def c_project(temp_repo: Path) -> Path: + project_path = temp_repo / "c_test_project" + project_path.mkdir() + + (project_path / "Makefile").write_text("all:\n\tgcc -o main main.c\n") + + (project_path / "main.c").write_text( + '#include "utils.h"\n' + "#include \n" + "\n" + "void greet(void) {\n" + ' printf("Hello\\n");\n' + "}\n" + "\n" + "int add(int a, int b) {\n" + " return a + b;\n" + "}\n" + "\n" + "int* get_ptr(void) {\n" + " static int x = 42;\n" + " return &x;\n" + "}\n" + "\n" + "int main(void) {\n" + " greet();\n" + " int result = add(1, 2);\n" + " int* p = get_ptr();\n" + " return 0;\n" + "}\n" + ) + + (project_path / "utils.h").write_text( + "#ifndef UTILS_H\n" + "#define UTILS_H\n" + "\n" + "int add(int a, int b);\n" + "void greet(void);\n" + "\n" + "#endif\n" + ) + + (project_path / "types.c").write_text( + "struct Point {\n" + " int x;\n" + " int y;\n" + "};\n" + "\n" + "union Value {\n" + " int i;\n" + " float f;\n" + "};\n" + "\n" + "enum Color {\n" + " RED,\n" + " GREEN,\n" + " BLUE\n" + "};\n" + ) + + return project_path + + +@pytest.fixture +def c_subdir_project(temp_repo: Path) -> Path: + project_path = temp_repo / "c_subdir_project" + project_path.mkdir() + + (project_path / "CMakeLists.txt").write_text( + "cmake_minimum_required(VERSION 3.10)\nproject(myapp)\n" + ) + + src_dir = project_path / "src" + src_dir.mkdir() + (src_dir / "Makefile").write_text("all:\n\tgcc -o app app.c\n") + + (src_dir / "app.c").write_text( + "void run(void) {}\n\nint main(void) {\n run();\n return 0;\n}\n" + ) + + return project_path + + +class TestCFunctionNodes: + def test_simple_function_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = 
get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + assert any("add" in name for name in func_names) + + def test_void_function_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + assert any("greet" in name for name in func_names) + + def test_pointer_return_function_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + assert any("get_ptr" in name for name in func_names) + + def test_main_function_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + assert any("main" in name for name in func_names) + + def test_function_with_parameters( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FUNCTION) + add_nodes = [ + n for n in func_nodes if "add" in n[0][1].get(cs.KEY_QUALIFIED_NAME, "") + ] + assert len(add_nodes) > 0 + + +class TestCStructNodes: + def test_struct_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + class_names = get_node_names(mock_ingestor, cs.NodeLabel.CLASS) + assert any("Point" in name for name in class_names) + + def test_struct_has_qualified_name( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + class_nodes = get_nodes(mock_ingestor, cs.NodeLabel.CLASS) + point_nodes = [ + n for n in class_nodes if "Point" in n[0][1].get(cs.KEY_QUALIFIED_NAME, "") + ] + assert len(point_nodes) > 0 + 
qn = point_nodes[0][0][1][cs.KEY_QUALIFIED_NAME] + assert "." in qn + + +class TestCUnionNodes: + def test_union_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + union_names = get_node_names(mock_ingestor, cs.NodeLabel.UNION) + class_names = get_node_names(mock_ingestor, cs.NodeLabel.CLASS) + all_names = union_names | class_names + assert any("Value" in name for name in all_names) + + +class TestCEnumNodes: + def test_enum_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + enum_names = get_node_names(mock_ingestor, cs.NodeLabel.ENUM) + class_names = get_node_names(mock_ingestor, cs.NodeLabel.CLASS) + all_names = enum_names | class_names + assert any("Color" in name for name in all_names) + + +class TestCCallsRelationships: + def test_function_call_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + calls = get_relationships(mock_ingestor, str(cs.RelationshipType.CALLS)) + assert len(calls) > 0 + + def test_main_calls_greet( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + calls = get_relationships(mock_ingestor, str(cs.RelationshipType.CALLS)) + call_pairs = [] + for c in calls: + src = c.args[0] if c.args else c[0][0] + tgt = c.args[2] if len(c.args) > 2 else c[0][2] + if isinstance(src, tuple) and isinstance(tgt, tuple): + call_pairs.append((src, tgt)) + found_greet = any( + "main" in str(src) and "greet" in str(tgt) for src, tgt in call_pairs + ) + assert found_greet + + def test_multiple_calls_from_main( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + calls = get_relationships(mock_ingestor, str(cs.RelationshipType.CALLS)) + 
main_calls = [ + c for c in calls if "main" in str(c.args[0] if c.args else c[0][0]) + ] + assert len(main_calls) >= 2 + + +class TestCDefinesRelationships: + def test_module_defines_functions( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + defines = get_relationships(mock_ingestor, str(cs.RelationshipType.DEFINES)) + assert len(defines) > 0 + + def test_main_module_defines_add( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + defines = get_relationships(mock_ingestor, str(cs.RelationshipType.DEFINES)) + found = any("add" in str(d) for d in defines) + assert found + + +class TestCImportsRelationships: + def test_include_creates_external_module( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + external_modules = [n for n in module_nodes if n[0][1].get(cs.KEY_IS_EXTERNAL)] + has_stdio = any("stdio" in str(n) for n in external_modules) + has_utils = any( + "utils" in n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in module_nodes + ) + assert has_stdio or has_utils + + def test_include_utils_h_module_exists( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + module_qnames = {n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in module_nodes} + assert any("utils" in qn for qn in module_qnames) + + +class TestCFileAndModuleNodes: + def test_c_file_nodes_created( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + file_paths = {n[0][1].get(cs.KEY_PATH, "") for n in file_nodes} + assert any("main.c" 
in p for p in file_paths) + assert any("types.c" in p for p in file_paths) + + def test_c_module_nodes_created( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + module_names = {n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in module_nodes} + assert any("main" in name for name in module_names) + + def test_header_file_node_created( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + file_paths = {n[0][1].get(cs.KEY_PATH, "") for n in file_nodes} + assert any("utils.h" in p for p in file_paths) + + +class TestCQualifiedNames: + def test_function_qualified_name_has_project( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + for name in func_names: + assert "." 
in name, f"Qualified name should contain '.': {name}" + + def test_function_qualified_name_format( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + add_names = [n for n in func_names if "add" in n] + assert len(add_names) > 0 + parts = add_names[0].split(".") + assert len(parts) >= 2 + + +class TestCPackageDetection: + def test_makefile_creates_package( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + assert len(package_nodes) > 0 + + def test_cmakelists_creates_package( + self, + c_subdir_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_subdir_project, mock_ingestor, skip_if_missing="c") + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + assert len(package_nodes) > 0 + + def test_subdirectory_with_makefile_is_package( + self, + c_subdir_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_subdir_project, mock_ingestor, skip_if_missing="c") + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + package_qnames = {n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in package_nodes} + assert any("src" in qn for qn in package_qnames) diff --git a/codebase_rag/tests/test_call_processor.py b/codebase_rag/tests/test_call_processor.py index a6ae5cc34..4cab76cfd 100644 --- a/codebase_rag/tests/test_call_processor.py +++ b/codebase_rag/tests/test_call_processor.py @@ -1153,8 +1153,10 @@ def test_logs_error_on_processing_failure( tree = parser.parse(b"def foo(): pass") root_node = tree.root_node + from codebase_rag.parsers.call_processor import CallProcessor + with patch.object( - call_processor, + CallProcessor, "_process_calls_in_functions", side_effect=RuntimeError("Simulated failure"), ): @@ -1166,9 +1168,9 @@ def 
test_logs_error_on_processing_failure( queries, ) mock_logger.error.assert_called_once() - error_call_args = mock_logger.error.call_args[0][0] - assert "test_module.py" in error_call_args - assert "Simulated failure" in error_call_args + error_call_args = mock_logger.error.call_args + assert "test_module.py" in str(error_call_args) + assert "Simulated failure" in str(error_call_args) def test_continues_after_error_in_single_file( self, @@ -1195,8 +1197,10 @@ def test_continues_after_error_in_single_file( tree = parser.parse(b"def foo(): pass") root_node = tree.root_node + from codebase_rag.parsers.call_processor import CallProcessor + with patch.object( - call_processor, + CallProcessor, "_process_calls_in_functions", side_effect=ValueError("Test exception"), ): @@ -1206,3 +1210,452 @@ def test_continues_after_error_in_single_file( cs.SupportedLanguage.PYTHON, queries, ) + + +class TestCallProcessorSlots: + def test_has_slots(self) -> None: + from codebase_rag.parsers.call_processor import CallProcessor + + assert hasattr(CallProcessor, "__slots__") + + def test_no_instance_dict(self, call_processor: CallProcessor) -> None: + assert not hasattr(call_processor, "__dict__") + + def test_rejects_arbitrary_attribute(self, call_processor: CallProcessor) -> None: + with pytest.raises(AttributeError): + call_processor.nonexistent_attr = 42 + + def test_slot_attributes_accessible(self, call_processor: CallProcessor) -> None: + assert hasattr(call_processor, "ingestor") + assert hasattr(call_processor, "repo_path") + assert hasattr(call_processor, "project_name") + assert hasattr(call_processor, "_resolver") + + +class TestCollectAllCallNodes: + def test_returns_empty_when_no_calls_query( + self, + call_processor: CallProcessor, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + code = "x = 1" + root = parse_code(code, 
cs.SupportedLanguage.PYTHON, parsers) + + empty_queries: dict = {cs.SupportedLanguage.PYTHON: {cs.QUERY_CALLS: None}} + call_nodes, call_starts = call_processor._collect_all_call_nodes( + root, cs.SupportedLanguage.PYTHON, empty_queries + ) + assert call_nodes == [] + assert call_starts == [] + + def test_returns_call_nodes_for_code_with_calls( + self, + call_processor: CallProcessor, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + code = "foo()\nbar()" + root = parse_code(code, cs.SupportedLanguage.PYTHON, parsers) + call_nodes, call_starts = call_processor._collect_all_call_nodes( + root, cs.SupportedLanguage.PYTHON, queries + ) + assert len(call_nodes) >= 2 + assert len(call_starts) == len(call_nodes) + assert all(isinstance(s, int) for s in call_starts) + + +class TestFilterCallsInNode: + def test_filters_calls_within_container( + self, + call_processor: CallProcessor, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + code = """ +def outer(): + foo() + +def other(): + bar() +""" + root = parse_code(code, cs.SupportedLanguage.PYTHON, parsers) + all_call_nodes, call_starts = call_processor._collect_all_call_nodes( + root, cs.SupportedLanguage.PYTHON, queries + ) + assert len(all_call_nodes) >= 2 + + outer_func = find_first_node_of_type(root, "function_definition") + assert outer_func is not None + + filtered = call_processor._filter_calls_in_node( + all_call_nodes, call_starts, outer_func + ) + assert len(filtered) == 1 + + +class TestProcessCallsInFileWithoutCache: + def test_process_calls_without_func_class_captures_cache( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in 
parsers: + pytest.skip("Python parser not available") + + test_file = temp_repo / "test_module.py" + test_file.write_text(encoding="utf-8", data="def foo(): bar()") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(b"def foo(): bar()") + root_node = tree.root_node + + cp.process_calls_in_file( + test_file, + root_node, + cs.SupportedLanguage.PYTHON, + queries, + func_class_captures_cache=None, + ) + + def test_process_calls_with_empty_combined_captures( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + test_file = temp_repo / "test_module.py" + test_file.write_text(encoding="utf-8", data="x = 1") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(b"x = 1") + root_node = tree.root_node + + from codebase_rag.parser_loader import COMBINED_FUNC_CLASS_QUERIES + + original = COMBINED_FUNC_CLASS_QUERIES.get(cs.SupportedLanguage.PYTHON) + try: + COMBINED_FUNC_CLASS_QUERIES[cs.SupportedLanguage.PYTHON] = None + cp.process_calls_in_file( + test_file, + root_node, + cs.SupportedLanguage.PYTHON, + queries, + func_class_captures_cache=None, + ) + finally: + if original is not None: + COMBINED_FUNC_CLASS_QUERIES[cs.SupportedLanguage.PYTHON] = original + + +class TestProcessCallsInFunctionsWithoutCombined: + def test_without_combined_captures( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + 
pytest.skip("Python parser not available") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + code = "def foo(): bar()" + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(code.encode(cs.ENCODING_UTF8)) + root_node = tree.root_node + + cp._process_calls_in_functions( + root_node, + "proj.module", + cs.SupportedLanguage.PYTHON, + queries, + combined_captures=None, + ) + + def test_without_combined_captures_no_functions( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + code = "x = 1" + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(code.encode(cs.ENCODING_UTF8)) + root_node = tree.root_node + + cp._process_calls_in_functions( + root_node, + "proj.module", + cs.SupportedLanguage.PYTHON, + queries, + combined_captures=None, + ) + + +class TestProcessCallsInClassesWithoutCombined: + def test_without_combined_captures( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + code = """ +class MyClass: + def method(self): + foo() +""" + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(code.encode(cs.ENCODING_UTF8)) + root_node = tree.root_node + + cp._process_calls_in_classes( + root_node, + "proj.module", + cs.SupportedLanguage.PYTHON, + 
queries, + combined_captures=None, + ) + + +class TestProcessMethodsInClassWithoutSortedFuncNodes: + def test_without_sorted_func_nodes( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + code = """ +class MyClass: + def method(self): + foo() +""" + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(code.encode(cs.ENCODING_UTF8)) + root_node = tree.root_node + + class_node = find_first_node_of_type(root_node, "class_definition") + assert class_node is not None + body_node = class_node.child_by_field_name("body") + assert body_node is not None + + cp._process_methods_in_class( + body_node, + "proj.module.MyClass", + "proj.module", + cs.SupportedLanguage.PYTHON, + queries, + sorted_func_nodes=None, + func_node_starts=None, + ) + + +class TestIngestFunctionCallsWithoutCallNodes: + def test_without_call_nodes( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + code = "def foo(): bar()" + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(code.encode(cs.ENCODING_UTF8)) + root_node = tree.root_node + + cp._ingest_function_calls( + root_node, + "proj.module.foo", + cs.NodeLabel.FUNCTION, + "proj.module", + cs.SupportedLanguage.PYTHON, + queries, + call_nodes=None, + ) + + def test_without_call_nodes_and_no_query( + self, + temp_repo: Path, + mock_ingestor: 
MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + cp = updater.factory.call_processor + + code = "x = 1" + parser = parsers[cs.SupportedLanguage.PYTHON] + tree = parser.parse(code.encode(cs.ENCODING_UTF8)) + root_node = tree.root_node + + empty_queries: dict = { + cs.SupportedLanguage.PYTHON: {cs.QUERY_CALLS: None, cs.QUERY_CONFIG: queries[cs.SupportedLanguage.PYTHON][cs.QUERY_CONFIG]} + } + cp._ingest_function_calls( + root_node, + "proj.module.foo", + cs.NodeLabel.FUNCTION, + "proj.module", + cs.SupportedLanguage.PYTHON, + empty_queries, + call_nodes=None, + ) + + +class TestCombinedQueryCompilationExceptionPaths: + def test_combined_func_class_query_exception_sets_none( + self, + parsers_and_queries: tuple, + ) -> None: + from tree_sitter import Query as RealQuery + + from codebase_rag.parser_loader import ( + COMBINED_FUNC_CLASS_IMPORT_QUERIES, + COMBINED_FUNC_CLASS_QUERIES, + _create_language_queries, + ) + + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + lang_queries = queries[cs.SupportedLanguage.PYTHON] + language_obj = lang_queries[cs.QUERY_LANGUAGE] + parser = parsers[cs.SupportedLanguage.PYTHON] + lang_config = lang_queries[cs.QUERY_CONFIG] + + call_count = 0 + + def patched_query(language, pattern): + nonlocal call_count + call_count += 1 + if call_count <= 2: + raise RuntimeError("simulated combined query failure") + return RealQuery(language, pattern) + + original_fc = COMBINED_FUNC_CLASS_QUERIES.get(cs.SupportedLanguage.PYTHON) + original_fci = COMBINED_FUNC_CLASS_IMPORT_QUERIES.get(cs.SupportedLanguage.PYTHON) + try: + with patch("codebase_rag.parser_loader.Query", side_effect=patched_query): + 
_create_language_queries( + language_obj, parser, lang_config, cs.SupportedLanguage.PYTHON + ) + assert COMBINED_FUNC_CLASS_QUERIES[cs.SupportedLanguage.PYTHON] is None + assert COMBINED_FUNC_CLASS_IMPORT_QUERIES[cs.SupportedLanguage.PYTHON] is None + finally: + if original_fc is not None: + COMBINED_FUNC_CLASS_QUERIES[cs.SupportedLanguage.PYTHON] = original_fc + if original_fci is not None: + COMBINED_FUNC_CLASS_IMPORT_QUERIES[cs.SupportedLanguage.PYTHON] = original_fci + + +class TestGetRustImplClassName: + def test_rust_impl_fallback_to_children( + self, + call_processor: CallProcessor, + parsers_and_queries: tuple, + ) -> None: + parsers, _ = parsers_and_queries + if cs.SupportedLanguage.RUST not in parsers: + pytest.skip("Rust parser not available") + + code = "impl MyStruct { fn foo(&self) {} }" + root = parse_code(code, cs.SupportedLanguage.RUST, parsers) + impl_node = find_first_node_of_type(root, "impl_item") + assert impl_node is not None + + result = call_processor._get_rust_impl_class_name(impl_node) + assert result is not None diff --git a/codebase_rag/tests/test_call_processor_integration.py b/codebase_rag/tests/test_call_processor_integration.py index e388b96c4..b3b326ba7 100644 --- a/codebase_rag/tests/test_call_processor_integration.py +++ b/codebase_rag/tests/test_call_processor_integration.py @@ -793,7 +793,11 @@ def with_value(self, value): def build(self): return {} +def helper(): + pass + def main(): + helper() result = Builder().with_name("test").with_value(42).build() return result """, @@ -814,6 +818,10 @@ def main(): ] assert len(calls) >= 1 + # (H) Builder() is a class instantiation, not a function call + class_targets = [c for c in calls if c.args[2][0] == cs.NodeLabel.CLASS] + assert len(class_targets) == 0 + def test_handles_init_py_module_qn( self, temp_repo: Path, @@ -853,3 +861,90 @@ def package_func(): caller_qns = [c.args[0][2] for c in calls] package_callers = [qn for qn in caller_qns if "mypackage" in qn] assert 
len(package_callers) >= 1 + + +class TestModuleCallsClassFiltered: + def test_module_does_not_call_class_python( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + test_file = temp_repo / "test_module.py" + test_file.write_text( + encoding="utf-8", + data=""" +class MyClass: + def method(self): + pass + +def helper(): + pass + +helper() +""", + ) + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run() + + calls = [ + c + for c in mock_ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + + class_targets = [c for c in calls if c.args[2][0] == cs.NodeLabel.CLASS] + assert class_targets == [] + + helper_calls = [c for c in calls if "helper" in c.args[2][2]] + assert len(helper_calls) >= 1 + + def test_function_does_not_call_class_python( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + test_file = temp_repo / "test_module.py" + test_file.write_text( + encoding="utf-8", + data=""" +class MyClass: + pass + +def factory(): + obj = MyClass() + return obj +""", + ) + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run() + + calls = [ + c + for c in mock_ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + + class_targets = [c for c in calls if c.args[2][0] == cs.NodeLabel.CLASS] + assert class_targets == [] diff --git a/codebase_rag/tests/test_call_resolver.py b/codebase_rag/tests/test_call_resolver.py index da4108f95..84d8151c5 100644 --- 
a/codebase_rag/tests/test_call_resolver.py +++ b/codebase_rag/tests/test_call_resolver.py @@ -24,6 +24,9 @@ class MockFunctionRegistry: def __init__(self) -> None: self._data: dict[QualifiedName, NodeType] = {} self._suffix_index: dict[str, list[QualifiedName]] = defaultdict(list) + self._properties: set[QualifiedName] = set() + self._property_names: set[str] = set() + self._abstracts: set[QualifiedName] = set() def __contains__(self, qn: QualifiedName) -> bool: return qn in self._data @@ -56,6 +59,28 @@ def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]: def find_ending_with(self, suffix: str) -> list[QualifiedName]: return self._suffix_index.get(suffix, []) + def register_unique_qn(self, natural_qn: QualifiedName, start_line: int) -> str: + return natural_qn + + def variants(self, qn: QualifiedName) -> list[QualifiedName]: + return [qn] + + def mark_property(self, qn: QualifiedName) -> None: + self._properties.add(qn) + self._property_names.add(qn.rsplit(cs.SEPARATOR_DOT, 1)[-1]) + + def is_property(self, qn: QualifiedName) -> bool: + return qn in self._properties + + def property_names(self) -> set[str]: + return self._property_names + + def mark_abstract(self, qn: QualifiedName) -> None: + self._abstracts.add(qn) + + def is_abstract(self, qn: QualifiedName) -> bool: + return qn in self._abstracts + @pytest.fixture def mock_function_registry() -> MockFunctionRegistry: @@ -1024,3 +1049,356 @@ def test_falls_back_to_trie(self, call_resolver: CallResolver) -> None: def test_returns_none_for_unknown(self, call_resolver: CallResolver) -> None: result = call_resolver.resolve_function_call("unknown_func", "proj.module") assert result is None + + +class TestDequeBfs: + def test_bfs_order_prefers_closer_parent(self, call_resolver: CallResolver) -> None: + call_resolver.function_registry["proj.base.ParentA.method"] = NodeType.METHOD + call_resolver.function_registry["proj.base.ParentB.method"] = NodeType.METHOD + 
call_resolver.class_inheritance["proj.module.Child"] = [ + "proj.base.ParentA", + "proj.base.ParentB", + ] + + result = call_resolver._resolve_inherited_method("proj.module.Child", "method") + assert result is not None + assert result[1] == "proj.base.ParentA.method" + + def test_bfs_finds_deep_ancestor_method(self, call_resolver: CallResolver) -> None: + call_resolver.function_registry["proj.base.Root.deep_method"] = NodeType.METHOD + call_resolver.class_inheritance["proj.module.Child"] = ["proj.mid.Middle"] + call_resolver.class_inheritance["proj.mid.Middle"] = ["proj.base.Root"] + + result = call_resolver._resolve_inherited_method( + "proj.module.Child", "deep_method" + ) + assert result is not None + assert result[1] == "proj.base.Root.deep_method" + + def test_bfs_no_infinite_loop_on_cycle(self, call_resolver: CallResolver) -> None: + call_resolver.class_inheritance["proj.A"] = ["proj.B"] + call_resolver.class_inheritance["proj.B"] = ["proj.A"] + + result = call_resolver._resolve_inherited_method("proj.A", "missing") + assert result is None + + +class TestSeparatorPattern: + def test_splits_on_dot(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("a.b.c") == ["a", "b", "c"] + + def test_splits_on_colon(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("module:func") == ["module", "func"] + + def test_splits_on_double_colon(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("crate::module::func") == [ + "crate", + "", + "module", + "", + "func", + ] + + def test_no_separator_returns_single_element(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("simple") == ["simple"] + + def test_last_element_matches_function_name(self) -> None: + from codebase_rag.parsers.call_resolver import 
_SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("a.b.func")[-1] == "func" + assert _SEPARATOR_PATTERN.split("module:method")[-1] == "method" + + +class TestChainedMethodPattern: + def test_matches_final_method(self) -> None: + from codebase_rag.parsers.call_resolver import _CHAINED_METHOD_PATTERN + + match = _CHAINED_METHOD_PATTERN.search("obj.method().next") + assert match is not None + assert match[1] == "next" + + def test_no_match_on_parenthesized_suffix(self) -> None: + from codebase_rag.parsers.call_resolver import _CHAINED_METHOD_PATTERN + + match = _CHAINED_METHOD_PATTERN.search("obj.method()") + assert match is None + + def test_matches_deeply_chained(self) -> None: + from codebase_rag.parsers.call_resolver import _CHAINED_METHOD_PATTERN + + match = _CHAINED_METHOD_PATTERN.search("a.b().c().final_method") + assert match is not None + assert match[1] == "final_method" + + +class TestDeterministicResolution: + def test_trie_tiebreak_by_qualified_name(self, call_resolver: CallResolver) -> None: + # (H) Register multiple functions with the same simple name in different modules + # (H) at equal import distance from the caller + call_resolver.function_registry["proj.alpha.utils.helper"] = NodeType.FUNCTION + call_resolver.function_registry["proj.beta.utils.helper"] = NodeType.FUNCTION + call_resolver.function_registry["proj.gamma.utils.helper"] = NodeType.FUNCTION + + results = [] + for _ in range(20): + result = call_resolver._try_resolve_via_trie("helper", "proj.delta.module") + assert result is not None + results.append(result[1]) + + # (H) All 20 runs must resolve to the same candidate (lexicographically first) + assert all(r == results[0] for r in results) + assert results[0] == "proj.alpha.utils.helper" + + def test_trie_tiebreak_picks_lexicographic_first( + self, call_resolver: CallResolver + ) -> None: + # (H) Deliberately insert in reverse lexicographic order + call_resolver.function_registry["proj.zoo.compute"] = NodeType.FUNCTION + 
call_resolver.function_registry["proj.mid.compute"] = NodeType.FUNCTION + call_resolver.function_registry["proj.aaa.compute"] = NodeType.FUNCTION + + result = call_resolver._try_resolve_via_trie("compute", "other.module") + assert result is not None + assert result[1] == "proj.aaa.compute" + + def test_trie_tiebreak_distance_still_wins( + self, call_resolver: CallResolver + ) -> None: + # (H) Closer module should win even if lexicographically later + call_resolver.function_registry["proj.far.away.process"] = NodeType.FUNCTION + call_resolver.function_registry["proj.module.process"] = NodeType.FUNCTION + + result = call_resolver._try_resolve_via_trie("process", "proj.module.caller") + assert result is not None + # (H) proj.module.process is closer to proj.module.caller + assert result[1] == "proj.module.process" + + def test_trie_many_candidates_deterministic( + self, call_resolver: CallResolver + ) -> None: + # (H) Register 10 equidistant candidates + names = [ + "proj.m09.run", + "proj.m05.run", + "proj.m01.run", + "proj.m07.run", + "proj.m03.run", + "proj.m08.run", + "proj.m02.run", + "proj.m06.run", + "proj.m04.run", + "proj.m10.run", + ] + for name in names: + call_resolver.function_registry[name] = NodeType.FUNCTION + + result = call_resolver._try_resolve_via_trie("run", "other.caller") + assert result is not None + assert result[1] == "proj.m01.run" + + def test_resolve_function_call_deterministic_across_runs( + self, call_resolver: CallResolver + ) -> None: + call_resolver.function_registry["pkg.svc_a.validate"] = NodeType.FUNCTION + call_resolver.function_registry["pkg.svc_b.validate"] = NodeType.FUNCTION + call_resolver.function_registry["pkg.svc_c.validate"] = NodeType.FUNCTION + + results = set() + for _ in range(10): + result = call_resolver.resolve_function_call( + "validate", "pkg.other.module", {}, None + ) + assert result is not None + results.add(result[1]) + + # (H) Must resolve to exactly one candidate across all runs + assert len(results) == 1 + 
+ +class TestDeterministicFileOrder: + def test_eligible_files_are_sorted( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + + # (H) Create files in non-alphabetical order + for name in ["zebra.py", "alpha.py", "middle.py", "beta.py"]: + (temp_repo / name).write_text(f"def func_{name[0]}(): pass\n") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + eligible = updater._collect_eligible_files() + paths_str = [str(f) for f in eligible] + + assert paths_str == sorted(paths_str) + + def test_graph_output_deterministic_across_runs(self, temp_repo: Path) -> None: + parsers, queries = load_parsers() + + (temp_repo / "mod_a.py").write_text( + "def shared(): pass\ndef call_a(): shared()\n" + ) + (temp_repo / "mod_b.py").write_text( + "def shared(): pass\ndef call_b(): shared()\n" + ) + + results = [] + for _ in range(5): + ingestor = MagicMock() + updater = GraphUpdater( + ingestor=ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run(force=True) + + calls = [ + (c.args[0][2], c.args[1], c.args[2][2]) + for c in ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + calls.sort() + results.append(calls) + + # (H) All 5 runs must produce identical call graphs + assert len(results[0]) > 0 + for i in range(1, len(results)): + assert results[i] == results[0] + + def _run_determinism_check(self, temp_repo: Path, runs: int = 5) -> None: + parsers, queries = load_parsers() + results = [] + for _ in range(runs): + ingestor = MagicMock() + updater = GraphUpdater( + ingestor=ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run(force=True) + + calls = [ + (c.args[0][2], c.args[2][2]) + for c in ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + calls.sort() + results.append(calls) + + assert 
len(results[0]) > 0 + for i in range(1, len(results)): + assert results[i] == results[0] + + def test_javascript_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.JS not in parsers: + pytest.skip("JavaScript parser not available") + + (temp_repo / "utils.js").write_text( + "function helper() {}\nfunction worker() { helper(); }\n" + ) + (temp_repo / "main.js").write_text( + "function helper() {}\nfunction entry() { helper(); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_typescript_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.TS not in parsers: + pytest.skip("TypeScript parser not available") + + (temp_repo / "service.ts").write_text( + "function validate(x: string): boolean { return true; }\n" + "function process() { validate('test'); }\n" + ) + (temp_repo / "handler.ts").write_text( + "function validate(x: string): boolean { return false; }\n" + "function handle() { validate('input'); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_rust_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.RUST not in parsers: + pytest.skip("Rust parser not available") + + (temp_repo / "utils.rs").write_text( + "fn compute() -> i32 { 42 }\nfn run() { compute(); }\n" + ) + (temp_repo / "main.rs").write_text( + "fn compute() -> i32 { 0 }\nfn start() { compute(); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_java_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.JAVA not in parsers: + pytest.skip("Java parser not available") + + (temp_repo / "Utils.java").write_text( + "public class Utils {\n" + " public static void process() {}\n" + " public static void run() { process(); }\n" + "}\n" + ) + (temp_repo / "Helper.java").write_text( + "public class Helper {\n" + " public static void process() {}\n" + " public static void execute() { 
process(); }\n" + "}\n" + ) + self._run_determinism_check(temp_repo) + + def test_cpp_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.CPP not in parsers: + pytest.skip("C++ parser not available") + + (temp_repo / "math.cpp").write_text( + "int calculate() { return 1; }\nint run() { return calculate(); }\n" + ) + (temp_repo / "logic.cpp").write_text( + "int calculate() { return 2; }\nint start() { return calculate(); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_go_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.GO not in parsers: + pytest.skip("Go parser not available") + + (temp_repo / "util.go").write_text( + "package main\nfunc helper() {}\nfunc doWork() { helper() }\n" + ) + (temp_repo / "main.go").write_text( + "package main\nfunc helper() {}\nfunc run() { helper() }\n" + ) + self._run_determinism_check(temp_repo) + + def test_lua_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.LUA not in parsers: + pytest.skip("Lua parser not available") + + (temp_repo / "utils.lua").write_text( + "local function process() end\nlocal function run() process() end\n" + ) + (temp_repo / "main.lua").write_text( + "local function process() end\nlocal function start() process() end\n" + ) + self._run_determinism_check(temp_repo) diff --git a/codebase_rag/tests/test_callable_field_calls.py b/codebase_rag/tests/test_callable_field_calls.py new file mode 100644 index 000000000..96316ab8f --- /dev/null +++ b/codebase_rag/tests/test_callable_field_calls.py @@ -0,0 +1,132 @@ +# (H) L3 finding from the evals/ harness: fqn_config.get_name(node) invokes a +# (H) function stored in a NamedTuple Callable field (FQNSpec), where fqn_config +# (H) comes from LANGUAGE_FQN_SPECS.get(language). 
Every function bound to that +# (H) field at a construction site is a possible callee, so resolving to all of +# (H) them is a sound call graph and captures the traced (Python) edge. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +# (H) fetch_name is a callable field of exactly one NamedTuple, mirroring how +# (H) get_name is unique to FQNSpec, so it resolves without a receiver type. +MODULE_SRC = """from typing import Callable, NamedTuple + + +def py_name() -> str: + return "py" + + +def js_name() -> str: + return "js" + + +class Spec(NamedTuple): + fetch_name: Callable[[], str] + + +PY_SPEC = Spec(fetch_name=py_name) +JS_SPEC = Spec(fetch_name=js_name) + +SPECS = {"py": PY_SPEC, "js": JS_SPEC} + + +def use(lang: str) -> str: + spec = SPECS.get(lang) + return spec.fetch_name() +""" + +# (H) Two classes share the field name, so with no receiver type the targets are +# (H) ambiguous and must NOT be emitted (precision guard). 
+AMBIGUOUS_SRC = """from typing import Callable, NamedTuple + + +def a_name() -> str: + return "a" + + +def b_name() -> str: + return "b" + + +class SpecA(NamedTuple): + shared_cb: Callable[[], str] + + +class SpecB(NamedTuple): + shared_cb: Callable[[], str] + + +A = SpecA(shared_cb=a_name) +B = SpecB(shared_cb=b_name) + + +def run(flag: bool): + chosen = A if flag else B + return chosen.shared_cb() +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path, src: str) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(src) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestCallableFieldCalls: + def test_resolves_to_first_bound_function(self, tmp_path: Path) -> None: + calls = _calls(tmp_path, MODULE_SRC) + assert ("proj.m.use", "proj.m.py_name") in calls, calls + + def test_resolves_to_all_bound_functions(self, tmp_path: Path) -> None: + calls = _calls(tmp_path, MODULE_SRC) + assert ("proj.m.use", "proj.m.js_name") in calls, calls + + def test_ambiguous_field_name_not_resolved(self, tmp_path: Path) 
-> None: + calls = _calls(tmp_path, AMBIGUOUS_SRC) + assert ("proj.m.run", "proj.m.a_name") not in calls, calls + assert ("proj.m.run", "proj.m.b_name") not in calls, calls diff --git a/codebase_rag/tests/test_cancel_orphaned_tool_calls.py b/codebase_rag/tests/test_cancel_orphaned_tool_calls.py new file mode 100644 index 000000000..acff644a7 --- /dev/null +++ b/codebase_rag/tests/test_cancel_orphaned_tool_calls.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from pydantic_ai.messages import ( + ModelMessage, + ModelRequest, + ModelResponse, + SystemPromptPart, + TextPart, + ToolCallPart, + ToolReturnPart, + UserPromptPart, +) + +from codebase_rag import constants as cs +from codebase_rag.main import _cancel_orphaned_tool_calls + + +def test_noop_when_history_empty() -> None: + history: list[ModelMessage] = [] + _cancel_orphaned_tool_calls(history) + assert history == [] + + +def test_noop_when_last_message_is_request() -> None: + history: list[ModelMessage] = [ModelRequest(parts=[UserPromptPart(content="hi")])] + _cancel_orphaned_tool_calls(history) + assert len(history) == 1 + + +def test_noop_when_response_has_no_tool_calls() -> None: + history: list[ModelMessage] = [ + ModelRequest(parts=[SystemPromptPart(content="sys")]), + ModelResponse(parts=[TextPart(content="hello")]), + ] + _cancel_orphaned_tool_calls(history) + assert len(history) == 2 + + +def test_appends_synthetic_return_for_each_orphan_tool_call() -> None: + history: list[ModelMessage] = [ + ModelRequest(parts=[UserPromptPart(content="run stuff")]), + ModelResponse( + parts=[ + ToolCallPart( + tool_name="shell_command", + args={"command": "ls"}, + tool_call_id="call_1", + ), + ToolCallPart( + tool_name="read_file", + args={"path": "/tmp/x"}, + tool_call_id="call_2", + ), + ] + ), + ] + + _cancel_orphaned_tool_calls(history) + + assert len(history) == 3 + repaired = history[-1] + assert isinstance(repaired, ModelRequest) + returns = [p for p in repaired.parts if isinstance(p, 
ToolReturnPart)] + assert len(returns) == 2 + assert {r.tool_call_id for r in returns} == {"call_1", "call_2"} + for r in returns: + assert r.content == cs.MSG_TOOL_CALL_CANCELLED + + +def test_ignores_non_tool_call_parts_in_response() -> None: + history: list[ModelMessage] = [ + ModelResponse( + parts=[ + TextPart(content="some text"), + ToolCallPart( + tool_name="shell_command", + args={"command": "ls"}, + tool_call_id="call_1", + ), + ] + ), + ] + + _cancel_orphaned_tool_calls(history) + + assert len(history) == 2 + repaired = history[-1] + assert isinstance(repaired, ModelRequest) + returns = [p for p in repaired.parts if isinstance(p, ToolReturnPart)] + assert len(returns) == 1 + assert returns[0].tool_call_id == "call_1" diff --git a/codebase_rag/tests/test_cgr_instructions.py b/codebase_rag/tests/test_cgr_instructions.py new file mode 100644 index 000000000..e9a86d6ee --- /dev/null +++ b/codebase_rag/tests/test_cgr_instructions.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import config as cgr_config +from codebase_rag.config import ( + CGR_INSTRUCTIONS_FILENAME, + load_cgr_instructions, +) +from codebase_rag.prompts import build_rag_orchestrator_prompt +from codebase_rag.services.llm import create_rag_orchestrator + + +@pytest.fixture +def isolated_global(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + target = tmp_path / "home_cgr.md" + monkeypatch.setattr(cgr_config, "GLOBAL_CGR_INSTRUCTIONS_PATH", target) + return target + + +def test_returns_none_when_no_file(temp_repo: Path, isolated_global: Path) -> None: + assert load_cgr_instructions(temp_repo) is None + + +def test_loads_instructions_when_repo_file_present( + temp_repo: Path, isolated_global: Path +) -> None: + body = "Prefer reading docs/ before answering." 
+ (temp_repo / CGR_INSTRUCTIONS_FILENAME).write_text(body, encoding="utf-8") + + assert load_cgr_instructions(temp_repo) == body + + +def test_loads_global_only_when_repo_path_none(isolated_global: Path) -> None: + isolated_global.write_text("global rule", encoding="utf-8") + + assert load_cgr_instructions(None) == "global rule" + + +def test_merges_global_and_repo(temp_repo: Path, isolated_global: Path) -> None: + isolated_global.write_text("global rule", encoding="utf-8") + (temp_repo / CGR_INSTRUCTIONS_FILENAME).write_text( + "repo override", encoding="utf-8" + ) + + merged = load_cgr_instructions(temp_repo) + + assert merged is not None + assert merged.startswith("global rule") + assert "repo override" in merged + assert merged.index("global rule") < merged.index("repo override") + + +def test_returns_none_when_file_empty(temp_repo: Path, isolated_global: Path) -> None: + (temp_repo / CGR_INSTRUCTIONS_FILENAME).write_text(" \n", encoding="utf-8") + + assert load_cgr_instructions(temp_repo) is None + + +def test_returns_none_on_read_error( + temp_repo: Path, + isolated_global: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + (temp_repo / CGR_INSTRUCTIONS_FILENAME).write_text("hello", encoding="utf-8") + original_open = Path.open + + def mock_open(self: Path, *args, **kwargs): # noqa: ANN002, ANN003 + if self.name == CGR_INSTRUCTIONS_FILENAME: + raise PermissionError("nope") + return original_open(self, *args, **kwargs) + + monkeypatch.setattr(Path, "open", mock_open) + + assert load_cgr_instructions(temp_repo) is None + + +def test_orchestrator_prompt_appends_project_instructions() -> None: + base = build_rag_orchestrator_prompt(tools=[]) + extra = "Never modify files under vendor/." 
+ with_extra = build_rag_orchestrator_prompt(tools=[], project_instructions=extra) + + assert with_extra.startswith(base) + assert extra in with_extra + + +def test_orchestrator_prompt_unchanged_without_instructions() -> None: + base = build_rag_orchestrator_prompt(tools=[]) + none_case = build_rag_orchestrator_prompt(tools=[], project_instructions=None) + empty_case = build_rag_orchestrator_prompt(tools=[], project_instructions=" ") + + assert none_case == base + assert empty_case == base + + +@patch("codebase_rag.services.llm.settings") +@patch("codebase_rag.services.llm.get_provider_from_config") +@patch("codebase_rag.services.llm.Agent") +def test_create_rag_orchestrator_reads_project_instructions( + mock_agent: MagicMock, + mock_get_provider: MagicMock, + mock_settings: MagicMock, + temp_repo: Path, + isolated_global: Path, +) -> None: + mock_settings.active_orchestrator_config = MagicMock() + mock_settings.AGENT_RETRIES = 3 + mock_settings.ORCHESTRATOR_OUTPUT_RETRIES = 2 + mock_get_provider.return_value.create_model.return_value = MagicMock() + + extra = "Honor scoped read-only mode." 
+ (temp_repo / CGR_INSTRUCTIONS_FILENAME).write_text(extra, encoding="utf-8") + + agent, system_prompt = create_rag_orchestrator(tools=[], project_root=temp_repo) + + assert extra in system_prompt + assert mock_agent.call_args.kwargs["system_prompt"] == system_prompt + + +@patch("codebase_rag.services.llm.settings") +@patch("codebase_rag.services.llm.get_provider_from_config") +@patch("codebase_rag.services.llm.Agent") +def test_create_rag_orchestrator_skips_instructions_when_disabled( + mock_agent: MagicMock, + mock_get_provider: MagicMock, + mock_settings: MagicMock, + temp_repo: Path, + isolated_global: Path, +) -> None: + mock_settings.active_orchestrator_config = MagicMock() + mock_settings.AGENT_RETRIES = 3 + mock_settings.ORCHESTRATOR_OUTPUT_RETRIES = 2 + mock_get_provider.return_value.create_model.return_value = MagicMock() + + isolated_global.write_text("GLOBAL SECRET", encoding="utf-8") + (temp_repo / CGR_INSTRUCTIONS_FILENAME).write_text("REPO SECRET", encoding="utf-8") + + _, system_prompt = create_rag_orchestrator( + tools=[], project_root=temp_repo, load_instructions=False + ) + + assert "GLOBAL SECRET" not in system_prompt + assert "REPO SECRET" not in system_prompt + + +@patch("codebase_rag.services.llm.settings") +@patch("codebase_rag.services.llm.get_provider_from_config") +@patch("codebase_rag.services.llm.Agent") +def test_create_rag_orchestrator_reads_global_instructions( + mock_agent: MagicMock, + mock_get_provider: MagicMock, + mock_settings: MagicMock, + isolated_global: Path, +) -> None: + mock_settings.active_orchestrator_config = MagicMock() + mock_settings.AGENT_RETRIES = 3 + mock_settings.ORCHESTRATOR_OUTPUT_RETRIES = 2 + mock_get_provider.return_value.create_model.return_value = MagicMock() + + isolated_global.write_text("global directive ABC", encoding="utf-8") + + _, system_prompt = create_rag_orchestrator(tools=[], project_root=None) + + assert "global directive ABC" in system_prompt diff --git a/codebase_rag/tests/test_cgr_shim.py 
b/codebase_rag/tests/test_cgr_shim.py new file mode 100644 index 000000000..b7cdbd8fc --- /dev/null +++ b/codebase_rag/tests/test_cgr_shim.py @@ -0,0 +1,41 @@ +import cgr + + +class TestCgrShimExports: + def test_all_symbols_importable(self) -> None: + for name in cgr.__all__: + assert hasattr(cgr, name), f"{name!r} listed in __all__ but not importable" + + def test_all_matches_module_exports(self) -> None: + public_attrs = {k for k in vars(cgr) if not k.startswith("_")} + assert set(cgr.__all__) == public_attrs + + def test_settings_is_canonical_instance(self) -> None: + from codebase_rag.config import settings + + assert cgr.settings is settings + + def test_embed_code_is_canonical_function(self) -> None: + from codebase_rag.embedder import embed_code + + assert cgr.embed_code is embed_code + + def test_graph_loader_is_canonical_class(self) -> None: + from codebase_rag.graph_loader import GraphLoader + + assert cgr.GraphLoader is GraphLoader + + def test_load_graph_is_canonical_function(self) -> None: + from codebase_rag.graph_loader import load_graph + + assert cgr.load_graph is load_graph + + def test_memgraph_ingestor_is_canonical_class(self) -> None: + from codebase_rag.services.graph_service import MemgraphIngestor + + assert cgr.MemgraphIngestor is MemgraphIngestor + + def test_cypher_generator_is_canonical_class(self) -> None: + from codebase_rag.services.llm import CypherGenerator + + assert cgr.CypherGenerator is CypherGenerator diff --git a/codebase_rag/tests/test_cgr_state_and_status.py b/codebase_rag/tests/test_cgr_state_and_status.py new file mode 100644 index 000000000..0a26fa5c0 --- /dev/null +++ b/codebase_rag/tests/test_cgr_state_and_status.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from collections.abc import Generator +from pathlib import Path +from unittest.mock import patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag import cgr_state +from codebase_rag.cli import app + +runner = CliRunner() + + 
+@pytest.fixture(autouse=True) +def _temp_home( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> Generator[Path, None, None]: + from codebase_rag.config import settings + + home = tmp_path / "cgr-home" + monkeypatch.setattr(settings, "CGR_HOME", home) + yield home + + +class TestRecordSync: + def test_record_sync_creates_file(self, _temp_home: Path) -> None: + cgr_state.record_sync("alpha") + assert cgr_state.state_path().exists() + ts = cgr_state.read_sync_timestamps() + assert "alpha" in ts + + def test_record_sync_updates_existing(self, _temp_home: Path) -> None: + cgr_state.record_sync("alpha") + first = cgr_state.read_sync_timestamps()["alpha"] + cgr_state.record_sync("alpha") + second = cgr_state.read_sync_timestamps()["alpha"] + assert second >= first + + def test_record_sync_multiple_projects(self, _temp_home: Path) -> None: + cgr_state.record_sync("a") + cgr_state.record_sync("b") + ts = cgr_state.read_sync_timestamps() + assert set(ts.keys()) == {"a", "b"} + + def test_read_when_no_state_returns_empty(self, _temp_home: Path) -> None: + assert cgr_state.read_sync_timestamps() == {} + + +class TestStatusCommand: + def test_status_runs_clean(self, _temp_home: Path) -> None: + from codebase_rag.stack.constants import StackState + from codebase_rag.stack.manager import StackStatus + + fake = StackStatus( + state=StackState.STOPPED, + memgraph_reachable=False, + qdrant_reachable=False, + compose_file=Path("/tmp/cgr/docker-compose.yaml"), + memgraph_endpoint="localhost:7687", + qdrant_endpoint="localhost:6333", + ) + with patch("codebase_rag.cli.StackManager") as mock_mgr: + mock_mgr.return_value.status.return_value = fake + result = runner.invoke(app, ["status"]) + assert result.exit_code == 0, result.output + assert "stopped" in result.output + assert "no projects synced" in result.output + + def test_status_lists_recorded_projects(self, _temp_home: Path) -> None: + from codebase_rag.stack.constants import StackState + from codebase_rag.stack.manager 
import StackStatus + + cgr_state.record_sync("alpha") + cgr_state.record_sync("beta") + fake = StackStatus( + state=StackState.RUNNING, + memgraph_reachable=True, + qdrant_reachable=True, + compose_file=Path("/tmp/cgr/docker-compose.yaml"), + memgraph_endpoint="localhost:7687", + qdrant_endpoint="localhost:6333", + ) + with patch("codebase_rag.cli.StackManager") as mock_mgr: + mock_mgr.return_value.status.return_value = fake + result = runner.invoke(app, ["status"]) + assert result.exit_code == 0, result.output + assert "alpha" in result.output + assert "beta" in result.output + assert "running" in result.output + + +class TestStopCommand: + def test_stop_invokes_daemon_down(self, _temp_home: Path) -> None: + with patch("codebase_rag.cli.StackManager") as mock_mgr: + instance = mock_mgr.return_value + result = runner.invoke(app, ["stop"]) + assert result.exit_code == 0, result.output + instance.down.assert_called_once() diff --git a/codebase_rag/tests/test_cgrignore.py b/codebase_rag/tests/test_cgrignore.py index 09cb814be..0740c228d 100644 --- a/codebase_rag/tests/test_cgrignore.py +++ b/codebase_rag/tests/test_cgrignore.py @@ -1,10 +1,13 @@ from __future__ import annotations +from collections.abc import Generator from pathlib import Path from unittest.mock import MagicMock, patch import pytest +from typer.testing import CliRunner +from codebase_rag.cli import app from codebase_rag.config import ( CGRIGNORE_FILENAME, EMPTY_CGRIGNORE, @@ -265,3 +268,137 @@ def test_unignore_included_when_user_selects_all( assert "vendor" in result assert ".git" in result assert "custom" in result + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +class 
TestCgrignoreLoadedWithoutInteractiveSetup: + runner = CliRunner() + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_start_loads_cgrignore_without_interactive_setup( + self, + mock_load_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cgrignore_patterns = CgrignorePatterns( + exclude=frozenset({"vendor", "build"}), + unignore=frozenset({"vendor/important"}), + ) + mock_load_cgrignore.return_value = cgrignore_patterns + + result = self.runner.invoke( + app, + ["start", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_load_cgrignore.assert_called_once_with(tmp_path) + updater_kwargs = mock_graph_updater.call_args.kwargs + assert updater_kwargs["unignore_paths"] == frozenset({"vendor/important"}) + assert "vendor" in updater_kwargs["exclude_paths"] + assert "build" in updater_kwargs["exclude_paths"] + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.ProtobufFileIngestor") + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_index_loads_cgrignore_without_interactive_setup( + self, + mock_load_cgrignore: MagicMock, + mock_proto_ingestor: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + tmp_path: Path, + ) -> None: + cgrignore_patterns = CgrignorePatterns( + exclude=frozenset({"dist"}), + unignore=frozenset({"dist/assets"}), + ) + mock_load_cgrignore.return_value = cgrignore_patterns + + output_dir = str(tmp_path / "output") + + result = self.runner.invoke( + app, + ["index", "--repo-path", str(tmp_path), "-o", output_dir], + ) + + assert result.exit_code == 0, result.output + mock_load_cgrignore.assert_called_once_with(tmp_path) + updater_kwargs = 
mock_graph_updater.call_args.kwargs + assert updater_kwargs["unignore_paths"] == frozenset({"dist/assets"}) + assert "dist" in updater_kwargs["exclude_paths"] + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_start_merges_cli_excludes_with_cgrignore( + self, + mock_load_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cgrignore_patterns = CgrignorePatterns( + exclude=frozenset({"from_cgrignore"}), + unignore=frozenset(), + ) + mock_load_cgrignore.return_value = cgrignore_patterns + + result = self.runner.invoke( + app, + [ + "start", + "--update-graph", + "--repo-path", + str(tmp_path), + "--exclude", + "from_cli", + ], + ) + + assert result.exit_code == 0, result.output + updater_kwargs = mock_graph_updater.call_args.kwargs + assert "from_cgrignore" in updater_kwargs["exclude_paths"] + assert "from_cli" in updater_kwargs["exclude_paths"] + + @patch("codebase_rag.cli.prompt_for_unignored_directories") + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_start_does_not_prompt_without_interactive_setup( + self, + mock_load_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_prompt: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_load_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset({"vendor"}), + unignore=frozenset({"vendor/keep"}), + ) + + result = self.runner.invoke( + app, + ["start", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_prompt.assert_not_called() + mock_load_cgrignore.assert_called_once() diff --git a/codebase_rag/tests/test_chained_attribute_resolution.py 
b/codebase_rag/tests/test_chained_attribute_resolution.py new file mode 100644 index 000000000..f72d9d252 --- /dev/null +++ b/codebase_rag/tests/test_chained_attribute_resolution.py @@ -0,0 +1,124 @@ +# (H) L3 finding from the evals/ harness: GraphUpdater.run calls +# (H) self.factory.definition_processor.process_all_method_overrides(), a three-level +# (H) chain where factory is an instance attribute (ProcessorFactory), definition_processor +# (H) is a @property returning DefinitionProcessor, and the method is inherited from a +# (H) mixin base. A module-level function of the same name makes the bare-name trie +# (H) fallback ambiguous, so the chain types must be walked to land on the mixin method. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + # (H) OverrideMixin is re-exported through the package __init__, so the subclass + # (H) records its base as the re-export QN (pkg.overrides.OverrideMixin) rather than + # (H) the real definition (pkg.overrides.mixin.OverrideMixin); inherited-method + # (H) lookup must follow the re-export. A same-named module-level function competes. 
+ "pkg/overrides/__init__.py": ( + "from .mixin import OverrideMixin, process_all\n\n" + "__all__ = ['OverrideMixin', 'process_all']\n" + ), + "pkg/overrides/mixin.py": ( + "def process_all():\n return None\n\n\n" + "class OverrideMixin:\n" + " def process_all(self):\n" + " return None\n" + ), + "pkg/defproc.py": ( + "from .overrides import OverrideMixin\n\n\n" + "class DefProc(OverrideMixin):\n" + " def other(self):\n" + " return None\n" + ), + "pkg/factory.py": ( + "from .defproc import DefProc\n\n\n" + "class Factory:\n" + " def __init__(self) -> None:\n" + " self._dp = None\n\n" + " @property\n" + " def definition_processor(self) -> DefProc:\n" + " if self._dp is None:\n" + " self._dp = DefProc()\n" + " return self._dp\n" + ), + "pkg/runner.py": ( + "from .factory import Factory\n\n\n" + "class Runner:\n" + " def __init__(self) -> None:\n" + " self.factory = Factory()\n\n" + " def run(self):\n" + " return self.factory.definition_processor.process_all()\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + 
ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestChainedAttributeResolution: + def test_three_level_chain_resolves_to_inherited_mixin_method( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.runner.Runner.run", + "proj.pkg.overrides.mixin.OverrideMixin.process_all", + ) in calls, calls + + def test_does_not_resolve_to_module_level_function(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.runner.Runner.run", + "proj.pkg.overrides.mixin.process_all", + ) not in calls, calls diff --git a/codebase_rag/tests/test_class_ingest.py b/codebase_rag/tests/test_class_ingest.py index 60c249414..ae1740b02 100644 --- a/codebase_rag/tests/test_class_ingest.py +++ b/codebase_rag/tests/test_class_ingest.py @@ -1249,7 +1249,6 @@ def go_struct_project(temp_repo: Path) -> Path: return project_path -@pytest.mark.xfail(reason="Go struct/interface ingestion not fully implemented") def test_go_struct_methods_are_ingested( go_struct_project: Path, mock_ingestor: MagicMock ) -> None: @@ -1278,7 +1277,6 @@ def test_go_struct_methods_are_ingested( ) -@pytest.mark.xfail(reason="Go struct/interface ingestion not fully implemented") def test_go_interface_nodes_created( go_struct_project: Path, mock_ingestor: MagicMock ) -> None: @@ -1297,7 +1295,6 @@ def test_go_interface_nodes_created( ) -@pytest.mark.xfail(reason="Go struct/interface ingestion not fully implemented") def test_go_struct_nodes_created( go_struct_project: Path, mock_ingestor: MagicMock ) -> None: @@ -1339,373 +1336,6 @@ def test_go_embedded_interface( assert len(mammal_inherits) >= 0, "Mammal interface embedding should be detected" -@pytest.fixture -def csharp_class_project(temp_repo: Path) -> Path: - project_path = temp_repo / "csharp_class_test" - project_path.mkdir() - - animal_file = 
project_path / "IAnimal.cs" - animal_file.write_text( - encoding="utf-8", - data=""" -namespace Animals -{ - public interface IAnimal - { - string Speak(); - void Move(); - string Name { get; set; } - } - - public interface IFlyable - { - void Fly(); - int GetAltitude(); - } - - public interface ISwimmable - { - void Swim(); - int GetDepth(); - } -} -""", - ) - - dog_file = project_path / "Dog.cs" - dog_file.write_text( - encoding="utf-8", - data=""" -namespace Animals -{ - public class Dog : IAnimal - { - public string Name { get; set; } - public string Breed { get; private set; } - - public Dog(string name, string breed) - { - Name = name; - Breed = breed; - } - - public string Speak() - { - return $"{Name} says: Woof!"; - } - - public void Move() - { - Console.WriteLine($"{Name} runs on four legs"); - } - - public void Fetch() - { - Console.WriteLine($"{Name} fetches the ball"); - } - } -} -""", - ) - - duck_file = project_path / "Duck.cs" - duck_file.write_text( - encoding="utf-8", - data=""" -namespace Animals -{ - public class Duck : IAnimal, IFlyable, ISwimmable - { - public string Name { get; set; } - private int _altitude; - private int _depth; - - public Duck(string name) - { - Name = name; - _altitude = 0; - _depth = 0; - } - - public string Speak() - { - return $"{Name} says: Quack!"; - } - - public void Move() - { - Console.WriteLine($"{Name} waddles"); - } - - public void Fly() - { - _altitude = 100; - Console.WriteLine($"{Name} flies up to {_altitude} meters"); - } - - public int GetAltitude() - { - return _altitude; - } - - public void Swim() - { - _depth = 5; - Console.WriteLine($"{Name} swims at depth {_depth} meters"); - } - - public int GetDepth() - { - return _depth; - } - } -} -""", - ) - - base_class_file = project_path / "BaseVehicle.cs" - base_class_file.write_text( - encoding="utf-8", - data=""" -namespace Vehicles -{ - public abstract class BaseVehicle - { - public string Model { get; protected set; } - public int Year { get; protected 
set; } - - protected BaseVehicle(string model, int year) - { - Model = model; - Year = year; - } - - public abstract void Start(); - public abstract void Stop(); - - public virtual string GetInfo() - { - return $"{Year} {Model}"; - } - } - - public class Car : BaseVehicle - { - public int NumberOfDoors { get; private set; } - - public Car(string model, int year, int doors) : base(model, year) - { - NumberOfDoors = doors; - } - - public override void Start() - { - Console.WriteLine($"{Model} engine starts"); - } - - public override void Stop() - { - Console.WriteLine($"{Model} engine stops"); - } - - public override string GetInfo() - { - return $"{base.GetInfo()} - {NumberOfDoors} doors"; - } - } - - public class ElectricCar : Car - { - public int BatteryCapacity { get; private set; } - - public ElectricCar(string model, int year, int doors, int batteryKwh) - : base(model, year, doors) - { - BatteryCapacity = batteryKwh; - } - - public override void Start() - { - Console.WriteLine($"{Model} silently starts"); - } - - public void Charge() - { - Console.WriteLine($"Charging {Model} battery ({BatteryCapacity} kWh)"); - } - } -} -""", - ) - - struct_file = project_path / "Point.cs" - struct_file.write_text( - encoding="utf-8", - data=""" -namespace Geometry -{ - public struct Point - { - public double X { get; } - public double Y { get; } - - public Point(double x, double y) - { - X = x; - Y = y; - } - - public double DistanceTo(Point other) - { - double dx = X - other.X; - double dy = Y - other.Y; - return Math.Sqrt(dx * dx + dy * dy); - } - - public Point Translate(double dx, double dy) - { - return new Point(X + dx, Y + dy); - } - } - - public struct Rectangle - { - public Point TopLeft { get; } - public double Width { get; } - public double Height { get; } - - public Rectangle(Point topLeft, double width, double height) - { - TopLeft = topLeft; - Width = width; - Height = height; - } - - public double Area() - { - return Width * Height; - } - - public double 
Perimeter() - { - return 2 * (Width + Height); - } - } -} -""", - ) - - return project_path - - -def test_csharp_class_methods_are_ingested( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - method_nodes = [ - call - for call in mock_ingestor.ensure_node_batch.call_args_list - if call[0][0] == "Method" - ] - - method_names = {call[0][1].get("name", "") for call in method_nodes} - - expected_methods = ["Speak", "Move", "Fetch", "Start", "Stop", "GetInfo", "Charge"] - found_methods = [m for m in expected_methods if m in method_names] - - assert len(found_methods) >= 1, f"Should have C# methods, found: {method_names}" - - -def test_csharp_interface_implementation( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - implements_rels = get_relationships(mock_ingestor, "IMPLEMENTS") - - dog_implements = [call for call in implements_rels if "Dog" in call.args[0][2]] - - assert len(dog_implements) >= 0, "Dog should implement IAnimal" - - -def test_csharp_multiple_interface_implementation( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - implements_rels = get_relationships(mock_ingestor, "IMPLEMENTS") - - duck_implements = [call for call in implements_rels if "Duck" in call.args[0][2]] - - assert len(duck_implements) >= 0, "Duck should implement multiple interfaces" - - -def test_csharp_class_inheritance_chain( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - inherits_rels = get_relationships(mock_ingestor, "INHERITS") - - car_inherits = [ - call - for call in inherits_rels - if "Car" in call.args[0][2] and "BaseVehicle" in call.args[2][2] - ] - - assert len(car_inherits) >= 0, "Car 
should inherit from BaseVehicle" - - -def test_csharp_struct_nodes_created( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - struct_nodes = [ - call - for call in mock_ingestor.ensure_node_batch.call_args_list - if call[0][0] in ("Struct", "Class") - ] - - struct_qns = {call[0][1]["qualified_name"] for call in struct_nodes} - - point_found = any("Point" in qn for qn in struct_qns) - rect_found = any("Rectangle" in qn for qn in struct_qns) - - assert point_found or rect_found or len(struct_qns) >= 1, ( - f"Should have C# struct nodes, found: {struct_qns}" - ) - - -def test_csharp_interface_nodes_created( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - interface_nodes = [ - call - for call in mock_ingestor.ensure_node_batch.call_args_list - if call[0][0] == "Interface" - ] - - interface_qns = {call[0][1]["qualified_name"] for call in interface_nodes} - - assert len(interface_qns) >= 0, "Should have C# interface nodes" - - -def test_csharp_abstract_class_methods( - csharp_class_project: Path, mock_ingestor: MagicMock -) -> None: - run_updater(csharp_class_project, mock_ingestor, skip_if_missing="c-sharp") - - override_rels = get_relationships(mock_ingestor, "OVERRIDES") - - car_overrides = [call for call in override_rels if "Car" in call.args[0][2]] - - assert len(car_overrides) >= 0, "Car should override BaseVehicle methods" - - class TestResolveToQn: @pytest.fixture def mixin_instance(self, temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater: @@ -2145,3 +1775,80 @@ def test_multiple_inheritance_creates_all_relationships( ] assert len(derived_inherits) >= 1, "Derived should have inheritance relationships" + + +class TestIngestClassesAndMethodsWithoutCombinedCaptures: + @pytest.fixture + def python_class_project(self, temp_repo: Path) -> Path: + project_path = 
temp_repo / "py_class_test" + project_path.mkdir() + + main_file = project_path / "main.py" + main_file.write_text( + encoding="utf-8", + data=""" +class MyService: + def handle(self): + pass + + def process(self): + pass +""", + ) + + return project_path + + def test_classes_ingested_without_combined_captures( + self, python_class_project: Path, mock_ingestor: MagicMock + ) -> None: + run_updater(python_class_project, mock_ingestor, skip_if_missing="python") + + project_name = python_class_project.name + from codebase_rag.tests.conftest import get_node_names + + classes = get_node_names(mock_ingestor, "Class") + assert f"{project_name}.main.MyService" in classes + + methods = get_node_names(mock_ingestor, "Method") + assert f"{project_name}.main.MyService.handle" in methods + assert f"{project_name}.main.MyService.process" in methods + + +class TestIngestRustImplMethodsWithoutSortedFuncNodes: + @pytest.fixture + def rust_impl_project(self, temp_repo: Path) -> Path: + project_path = temp_repo / "rust_impl_test" + project_path.mkdir() + + main_file = project_path / "main.rs" + main_file.write_text( + encoding="utf-8", + data=""" +struct Calculator { + value: i32, +} + +impl Calculator { + fn new() -> Calculator { + Calculator { value: 0 } + } + + fn add(&mut self, x: i32) { + self.value += x; + } +} +""", + ) + + return project_path + + def test_rust_impl_methods_ingested( + self, rust_impl_project: Path, mock_ingestor: MagicMock + ) -> None: + run_updater(rust_impl_project, mock_ingestor, skip_if_missing="rust") + + from codebase_rag.tests.conftest import get_node_names + + methods = get_node_names(mock_ingestor, "Method") + assert any("Calculator" in m and "new" in m for m in methods) + assert any("Calculator" in m and "add" in m for m in methods) diff --git a/codebase_rag/tests/test_classless_constructor_calls.py b/codebase_rag/tests/test_classless_constructor_calls.py new file mode 100644 index 000000000..25bcc1fb8 --- /dev/null +++ 
b/codebase_rag/tests/test_classless_constructor_calls.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import run_updater + + +def _edges(mock_ingestor: MagicMock, rel: str) -> list[tuple[str, str, str]]: + # (H) edges of a given type as (caller_qn, callee_label, callee_qn). + out: list[tuple[str, str, str]] = [] + for c in mock_ingestor.ensure_relationship_batch.call_args_list: + if c.args[1] == rel: + out.append((c.args[0][2], c.args[2][0], c.args[2][2])) + return out + + +class TestConstructionEdges: + def test_dataclass_construction_emits_instantiates_not_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a class with no explicit __init__ is represented by INSTANTIATES to + # (H) the class node; CALLS stays function/method-only (never a class). + (temp_repo / "app.py").write_text( + "from dataclasses import dataclass\n" + "\n" + "\n" + "@dataclass\n" + "class Config:\n" + " n: int\n" + "\n" + "\n" + "def use():\n" + " return Config(1)\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + instantiates = _edges(mock_ingestor, cs.RelationshipType.INSTANTIATES) + calls = _edges(mock_ingestor, cs.RelationshipType.CALLS) + + assert any( + caller.endswith(".use") + and to_label == cs.NodeLabel.CLASS + and to_qn.endswith(".Config") + for caller, to_label, to_qn in instantiates + ), f"no INSTANTIATES->Config edge; instantiates={sorted(instantiates)}" + assert not any( + to_label == cs.NodeLabel.CLASS for _caller, to_label, _to_qn in calls + ), f"CALLS must never target a class; calls={sorted(calls)}" + + def test_class_with_init_emits_both_instantiates_and_init_call( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a class WITH __init__ records INSTANTIATES -> class AND CALLS -> the + # (H) __init__ method (the constructor runs); 
still no CALLS -> class. + (temp_repo / "app.py").write_text( + "class Widget:\n" + " def __init__(self, n):\n" + " self.n = n\n" + "\n" + "\n" + "def use():\n" + " return Widget(1)\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + instantiates = _edges(mock_ingestor, cs.RelationshipType.INSTANTIATES) + calls = _edges(mock_ingestor, cs.RelationshipType.CALLS) + + assert any( + caller.endswith(".use") + and to_label == cs.NodeLabel.CLASS + and to_qn.endswith(".Widget") + for caller, to_label, to_qn in instantiates + ) + assert any( + caller.endswith(".use") + and to_label == cs.NodeLabel.METHOD + and to_qn.endswith(".Widget.__init__") + for caller, to_label, to_qn in calls + ) + assert not any( + to_label == cs.NodeLabel.CLASS for _caller, to_label, _to_qn in calls + ) diff --git a/codebase_rag/tests/test_cli_autosync.py b/codebase_rag/tests/test_cli_autosync.py new file mode 100644 index 000000000..63cea7d2e --- /dev/null +++ b/codebase_rag/tests/test_cli_autosync.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag.cli import app + +runner = CliRunner() + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +@pytest.fixture +def mock_agent_loops() -> Generator[None, None, None]: + with ( + patch("codebase_rag.cli.main_async") as mock_async, + patch("codebase_rag.cli.main_single_query") as mock_single, + patch("codebase_rag.cli.asyncio.run"), + ): + mock_async.return_value = None + mock_single.return_value = None + yield + + 
+@pytest.fixture +def mock_sync_path() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli._run_graph_sync") as mock_sync: + yield mock_sync + + +@pytest.fixture +def mock_validate_models() -> Generator[None, None, None]: + with patch("codebase_rag.cli._update_and_validate_models"): + yield + + +def test_start_default_triggers_auto_sync( + mock_memgraph_connect: MagicMock, + mock_agent_loops: None, + mock_sync_path: MagicMock, + mock_validate_models: None, + tmp_path: Path, +) -> None: + result = runner.invoke( + app, + ["start", "--repo-path", str(tmp_path), "--ask-agent", "hello"], + ) + assert result.exit_code == 0, result.output + mock_sync_path.assert_called_once() + + +def test_start_no_sync_skips_auto_sync( + mock_memgraph_connect: MagicMock, + mock_agent_loops: None, + mock_sync_path: MagicMock, + mock_validate_models: None, + tmp_path: Path, +) -> None: + result = runner.invoke( + app, + ["start", "--repo-path", str(tmp_path), "--no-sync", "--ask-agent", "hello"], + ) + assert result.exit_code == 0, result.output + mock_sync_path.assert_not_called() + + +def test_start_update_graph_uses_sync_helper( + mock_memgraph_connect: MagicMock, + mock_agent_loops: None, + mock_sync_path: MagicMock, + mock_validate_models: None, + tmp_path: Path, +) -> None: + result = runner.invoke( + app, + ["start", "--repo-path", str(tmp_path), "--update-graph"], + ) + assert result.exit_code == 0, result.output + mock_sync_path.assert_called_once() + call = mock_sync_path.call_args + assert call.kwargs["repo"] == tmp_path.resolve() + assert call.kwargs["clean"] is False + + +def test_start_clean_without_update_graph_does_not_sync( + mock_memgraph_connect: MagicMock, + mock_sync_path: MagicMock, + tmp_path: Path, +) -> None: + result = runner.invoke( + app, + ["start", "--repo-path", str(tmp_path), "--clean"], + ) + assert result.exit_code == 0, result.output + mock_sync_path.assert_not_called() + + +def 
test_start_auto_sync_uses_derived_project_name_when_none_provided( + mock_memgraph_connect: MagicMock, + mock_agent_loops: None, + mock_sync_path: MagicMock, + mock_validate_models: None, + tmp_path: Path, +) -> None: + result = runner.invoke( + app, + ["start", "--repo-path", str(tmp_path), "--ask-agent", "hi"], + ) + assert result.exit_code == 0, result.output + call = mock_sync_path.call_args + project_name = call.kwargs["project_name"] + assert "__" in project_name + assert len(project_name.rsplit("__", 1)[1]) == 8 + + +def test_start_auto_sync_respects_explicit_project_name( + mock_memgraph_connect: MagicMock, + mock_agent_loops: None, + mock_sync_path: MagicMock, + mock_validate_models: None, + tmp_path: Path, +) -> None: + result = runner.invoke( + app, + [ + "start", + "--repo-path", + str(tmp_path), + "--project-name", + "my-project", + "--ask-agent", + "hi", + ], + ) + assert result.exit_code == 0, result.output + call = mock_sync_path.call_args + assert call.kwargs["project_name"] == "my-project" diff --git a/codebase_rag/tests/test_cli_clean.py b/codebase_rag/tests/test_cli_clean.py new file mode 100644 index 000000000..eb58c8458 --- /dev/null +++ b/codebase_rag/tests/test_cli_clean.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import json +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag import constants as cs +from codebase_rag.cli import app +from codebase_rag.config import CgrignorePatterns + +runner = CliRunner() + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +def _get_ingestor(mock_connect: 
MagicMock) -> MagicMock: + return mock_connect.return_value.__enter__.return_value + + +class TestCleanWithoutUpdateGraph: + def test_clean_alone_wipes_database( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.clean_database.assert_called_once() + + def test_clean_alone_deletes_hash_cache( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_path.write_text(json.dumps({"file.py": "abc123"})) + + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert not cache_path.exists() + + def test_clean_alone_no_cache_file_still_succeeds( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + assert not cache_path.exists() + + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + + def test_clean_alone_does_not_invoke_graph_updater( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + with patch("codebase_rag.cli.GraphUpdater") as mock_updater: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_updater.assert_not_called() + + def test_clean_alone_skips_model_validation( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + with patch("codebase_rag.cli._update_and_validate_models") as mock_validate: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_validate.assert_not_called() + + def 
test_clean_alone_shows_clean_done_message( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0 + assert cs.CLI_MSG_CLEAN_DONE in result.output + + +class TestCleanWithUpdateGraph: + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_clean_with_update_deletes_hash_cache( + self, + mock_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset(), unignore=frozenset() + ) + + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_path.write_text(json.dumps({"file.py": "abc123"})) + + result = runner.invoke( + app, + ["start", "--clean", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert not cache_path.exists() + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_clean_with_update_calls_clean_database( + self, + mock_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset(), unignore=frozenset() + ) + + result = runner.invoke( + app, + ["start", "--clean", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.clean_database.assert_called_once() + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + 
@patch("codebase_rag.cli.load_cgrignore_patterns") + def test_update_without_clean_preserves_hash_cache( + self, + mock_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset(), unignore=frozenset() + ) + + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_data = {"file.py": "abc123"} + cache_path.write_text(json.dumps(cache_data)) + + result = runner.invoke( + app, + ["start", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert cache_path.exists() + assert json.loads(cache_path.read_text()) == cache_data diff --git a/codebase_rag/tests/test_cli_delete_project.py b/codebase_rag/tests/test_cli_delete_project.py new file mode 100644 index 000000000..92d0a70d4 --- /dev/null +++ b/codebase_rag/tests/test_cli_delete_project.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +import json +import re +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag import constants as cs +from codebase_rag.cli import app + +_ANSI_RE = re.compile(r"\x1b\[[0-9;]*m") + + +def _strip_ansi(text: str) -> str: + return _ANSI_RE.sub("", text) + + +runner = CliRunner() + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_ingestor.list_projects.return_value = ["platform", "other"] + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 1}, + {cs.KEY_NODE_ID: 2}, + ] + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +def _get_ingestor(mock_connect: MagicMock) -> 
MagicMock: + return mock_connect.return_value.__enter__.return_value + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_calls_ingestor_delete_project( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, +) -> None: + result = runner.invoke(app, ["delete-project", "--name", "platform"]) + + assert result.exit_code == 0, result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.delete_project.assert_called_once_with("platform") + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_cleans_embeddings_with_node_ids( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, +) -> None: + result = runner.invoke(app, ["delete-project", "--name", "platform"]) + + assert result.exit_code == 0, result.output + mock_delete_embeddings.assert_called_once_with("platform", [1, 2]) + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_fails_when_project_missing( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, +) -> None: + result = runner.invoke(app, ["delete-project", "--name", "ghost"]) + + assert result.exit_code == 1 + assert "ghost" in result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.delete_project.assert_not_called() + mock_delete_embeddings.assert_not_called() + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_rejects_blank_name( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, +) -> None: + result = runner.invoke(app, ["delete-project", "--name", " "]) + + assert result.exit_code == 1 + assert cs.CLI_ERR_PROJECT_NAME_REQUIRED in result.output + mock_memgraph_connect.assert_not_called() + mock_delete_embeddings.assert_not_called() + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_removes_hash_cache_when_repo_path_given( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, + 
tmp_path: Path, +) -> None: + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_path.write_text(json.dumps({"file.py": "abc123"})) + + result = runner.invoke( + app, + ["delete-project", "--name", "platform", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert not cache_path.exists() + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_without_repo_path_leaves_unrelated_hash_caches( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, +) -> None: + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_path.write_text(json.dumps({"file.py": "abc123"})) + + result = runner.invoke(app, ["delete-project", "--name", "platform"]) + + assert result.exit_code == 0, result.output + assert cache_path.exists() + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_does_not_wipe_other_projects( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, +) -> None: + result = runner.invoke(app, ["delete-project", "--name", "platform"]) + + assert result.exit_code == 0, result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.clean_database.assert_not_called() + + +@patch("codebase_rag.cli.delete_project_embeddings") +def test_delete_project_shows_success_message( + mock_delete_embeddings: MagicMock, + mock_memgraph_connect: MagicMock, +) -> None: + result = runner.invoke(app, ["delete-project", "--name", "platform"]) + + assert result.exit_code == 0, result.output + stripped = _strip_ansi(result.output) + assert cs.CLI_MSG_PROJECT_DELETED.format(project_name="platform") in stripped diff --git a/codebase_rag/tests/test_cli_repo_path_validation.py b/codebase_rag/tests/test_cli_repo_path_validation.py new file mode 100644 index 000000000..f91a6ffa6 --- /dev/null +++ b/codebase_rag/tests/test_cli_repo_path_validation.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import re +from collections.abc 
import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag import constants as cs +from codebase_rag.cli import app + +runner = CliRunner() + +_ANSI = re.compile(r"\x1b\[[0-9;]*m") + + +def _plain(output: str) -> str: + # (H) ANSI-stripped output with Rich soft-wrap newlines rejoined + return _ANSI.sub("", output).replace("\n", "") + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with ( + patch("codebase_rag.cli.connect_memgraph") as mock_connect, + patch("codebase_rag.cli._maybe_start_stack"), + ): + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +class TestStartRepoPathValidation: + def test_nonexistent_path_exits_with_error( + self, mock_memgraph_connect: MagicMock, tmp_path: Path + ) -> None: + missing = tmp_path / "does_not_exist" + result = runner.invoke(app, ["start", "--clean", "--repo-path", str(missing)]) + + assert result.exit_code == 1, result.output + plain = _plain(result.output) + assert str(missing) in plain + assert "does not exist" in plain + + def test_file_path_exits_with_error( + self, mock_memgraph_connect: MagicMock, tmp_path: Path + ) -> None: + file_path = tmp_path / "a_file.txt" + file_path.write_text("not a directory") + result = runner.invoke(app, ["start", "--clean", "--repo-path", str(file_path)]) + + assert result.exit_code == 1, result.output + plain = _plain(result.output) + assert str(file_path) in plain + assert "not a directory" in plain + + def test_valid_non_git_dir_warns_but_proceeds( + self, mock_memgraph_connect: MagicMock, tmp_path: Path + ) -> None: + result = runner.invoke(app, ["start", "--clean", "--repo-path", str(tmp_path)]) + + assert result.exit_code == 0, result.output + plain = _plain(result.output) + assert "not a Git 
repository" in plain + assert str(tmp_path) in plain + + def test_git_dir_does_not_warn( + self, mock_memgraph_connect: MagicMock, tmp_path: Path + ) -> None: + (tmp_path / cs.GIT_DIR_NAME).mkdir() + result = runner.invoke(app, ["start", "--clean", "--repo-path", str(tmp_path)]) + + assert result.exit_code == 0, result.output + assert "not a Git repository" not in result.output + + def test_git_file_worktree_does_not_warn( + self, mock_memgraph_connect: MagicMock, tmp_path: Path + ) -> None: + # (H) worktrees and submodules use a .git file, not a directory + (tmp_path / cs.GIT_DIR_NAME).write_text("gitdir: /repo/.git/worktrees/wt\n") + result = runner.invoke(app, ["start", "--clean", "--repo-path", str(tmp_path)]) + + assert result.exit_code == 0, result.output + assert "not a Git repository" not in result.output + + +class TestIndexRepoPathValidation: + def test_index_nonexistent_path_exits_with_error(self, tmp_path: Path) -> None: + missing = tmp_path / "nope" + result = runner.invoke( + app, + [ + "index", + "--repo-path", + str(missing), + "-o", + str(tmp_path / "out"), + ], + ) + + assert result.exit_code == 1, result.output + assert "does not exist" in _plain(result.output) diff --git a/codebase_rag/tests/test_cli_smoke.py b/codebase_rag/tests/test_cli_smoke.py index 88b420e07..06a254bda 100644 --- a/codebase_rag/tests/test_cli_smoke.py +++ b/codebase_rag/tests/test_cli_smoke.py @@ -1,9 +1,15 @@ +import re import subprocess import sys +from importlib.metadata import version as get_version from pathlib import Path import pytest +from codebase_rag import constants as cs + +_ANSI_RE = re.compile(r"\x1b\[[0-9;]*m") + def test_help_command_works() -> None: repo_root = Path(__file__).parent.parent.parent @@ -15,14 +21,14 @@ def test_help_command_works() -> None: capture_output=True, text=True, timeout=30, + env={**__import__("os").environ, "NO_COLOR": "1"}, ) assert result.returncode == 0, f"Help command failed with: {result.stderr}" - assert "Usage:" in 
result.stdout or "usage:" in result.stdout.lower() - assert "--help" in result.stdout - - assert result.stderr == "", f"Unexpected stderr: {result.stderr}" + plain_stdout = _ANSI_RE.sub("", result.stdout) + assert "Usage:" in plain_stdout or "usage:" in plain_stdout.lower() + assert "--help" in plain_stdout def test_import_cli_module() -> None: @@ -32,3 +38,28 @@ def test_import_cli_module() -> None: assert hasattr(cli, "app"), "CLI module missing app attribute" except ImportError as e: pytest.fail(f"Failed to import cli module: {e}") + + +def test_version_flag() -> None: + repo_root = Path(__file__).parent.parent.parent + + for flag in ["--version", "-v"]: + result = subprocess.run( + [sys.executable, "-m", "codebase_rag.cli", flag], + check=False, + cwd=repo_root, + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, ( + f"{flag} exited with code {result.returncode}: {result.stderr}" + ) + expected = cs.CLI_MSG_VERSION.format( + package=cs.PACKAGE_NAME, version=get_version(cs.PACKAGE_NAME) + ) + assert result.stdout.strip() == expected, ( + f"{flag} output did not match expected format: {repr(result.stdout)}" + ) + assert result.stderr == "", f"Unexpected stderr for {flag}: {result.stderr}" diff --git a/codebase_rag/tests/test_codebase_query.py b/codebase_rag/tests/test_codebase_query.py index 3be753570..6c7f5a5bf 100644 --- a/codebase_rag/tests/test_codebase_query.py +++ b/codebase_rag/tests/test_codebase_query.py @@ -69,6 +69,22 @@ def test_uses_provided_console( tool = create_query_tool(mock_ingestor, mock_cypher_gen, console=mock_console) assert tool is not None + async def test_default_console_writes_to_stderr( + self, + mock_ingestor: MagicMock, + mock_cypher_gen: MagicMock, + capsys: pytest.CaptureFixture[str], + ) -> None: + mock_cypher_gen.generate = AsyncMock(return_value="MATCH (n) RETURN n") + mock_ingestor.fetch_all.return_value = [{"name": "example"}] + + tool = create_query_tool(mock_ingestor, mock_cypher_gen, 
console=None) + await tool.function(natural_language_query="Find all functions") + + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err != "" + class TestQueryCodebaseKnowledgeGraph: async def test_successful_query_returns_results( @@ -145,6 +161,25 @@ async def test_database_error_handled( assert result.results == [] assert "error" in result.summary.lower() + async def test_query_timeout_handled( + self, + mock_ingestor: MagicMock, + mock_cypher_gen: MagicMock, + mock_console: Console, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + import time + + from codebase_rag.config import settings + + monkeypatch.setattr(settings, "QUERY_TIMEOUT_S", 0.05) + mock_ingestor.fetch_all.side_effect = lambda *a, **k: time.sleep(1.0) + tool = create_query_tool(mock_ingestor, mock_cypher_gen, console=mock_console) + result = await tool.function(natural_language_query="long running query") + assert result.results == [] + assert "timeout" in result.summary.lower() + assert result.query_used == "MATCH (n) RETURN n" + class TestQueryResultFormatting: async def test_result_contains_query_used( diff --git a/codebase_rag/tests/test_conditional_alias_call.py b/codebase_rag/tests/test_conditional_alias_call.py new file mode 100644 index 000000000..901d395c5 --- /dev/null +++ b/codebase_rag/tests/test_conditional_alias_call.py @@ -0,0 +1,87 @@ +# (H) L3 finding from the evals/ harness: CallProcessor._ingest_function_calls binds a +# (H) local to a conditionally-selected bound method (resolve_builtin = +# (H) resolver.resolve_builtin_call if is_js_ts else None) then calls it. The alias must +# (H) be resolved through the non-None branch of the conditional to its real method. 
def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]:
    """Index the ``FILES`` fixture and return its CALLS edges.

    Every entry of ``FILES`` is written beneath *tmp_path* (parent
    directories are created as needed), then ``GraphUpdater`` is run over the
    directory with a ``_Capture`` ingestor.

    Args:
        tmp_path: Empty directory that receives the fixture files.

    Returns:
        The ``(from, to)`` pairs of every captured relationship whose type
        equals ``cs.RelationshipType.CALLS``.
    """
    for relative_path, content in FILES.items():
        target = tmp_path / relative_path
        target.parent.mkdir(parents=True, exist_ok=True)
        # (H) Explicit encoding: the fixture must not depend on the locale's
        # (H) default, matching the other test modules' write_text calls.
        target.write_text(content, encoding="utf-8")

    parsers, queries = load_parsers()
    cap = _Capture()
    GraphUpdater(
        ingestor=cap,
        repo_path=tmp_path,
        parsers=parsers,
        queries=queries,
        project_name=PROJECT,
    ).run(force=True)

    return {
        (frm, to)
        for (frm, rel_type, to) in cap.rels
        if rel_type == cs.RelationshipType.CALLS
    }
test_conditional_bound_method_alias_resolves(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.worker.Worker.run", + "proj.pkg.helper.Helper.do", + ) in calls, calls diff --git a/codebase_rag/tests/test_config_validation.py b/codebase_rag/tests/test_config_validation.py new file mode 100644 index 000000000..c17c51a26 --- /dev/null +++ b/codebase_rag/tests/test_config_validation.py @@ -0,0 +1,85 @@ +import pytest + +from codebase_rag import constants as cs +from codebase_rag.config import ModelConfig, format_missing_api_key_errors + + +class TestValidateApiKey: + def test_local_providers_skip_validation(self) -> None: + cfg = ModelConfig(provider=cs.Provider.OLLAMA, model_id="llama3") + cfg.validate_api_key() + + def test_google_vertex_skips_validation(self) -> None: + cfg = ModelConfig( + provider=cs.Provider.GOOGLE, + model_id="gemini-pro", + provider_type=cs.GoogleProviderType.VERTEX, + ) + cfg.validate_api_key() + + def test_google_gla_requires_api_key(self) -> None: + cfg = ModelConfig( + provider=cs.Provider.GOOGLE, + model_id="gemini-pro", + provider_type=cs.GoogleProviderType.GLA, + ) + with pytest.raises(ValueError, match="API Key Missing"): + cfg.validate_api_key() + + @pytest.mark.parametrize( + "api_key_kwargs", + [ + {}, + {"api_key": ""}, + {"api_key": " "}, + {"api_key": cs.DEFAULT_API_KEY}, + ], + ) + def test_invalid_api_key_raises(self, api_key_kwargs: dict[str, str]) -> None: + cfg = ModelConfig( + provider=cs.Provider.OPENAI, model_id="gpt-4", **api_key_kwargs + ) + with pytest.raises(ValueError, match="API Key Missing"): + cfg.validate_api_key() + + def test_valid_api_key_passes(self) -> None: + cfg = ModelConfig( + provider=cs.Provider.OPENAI, model_id="gpt-4", api_key="sk-real-key-123" + ) + cfg.validate_api_key() + + def test_role_forwarded_to_error_message(self) -> None: + cfg = ModelConfig(provider=cs.Provider.OPENAI, model_id="gpt-4") + with pytest.raises(ValueError, match="cypher"): + 
class TestFormatMissingApiKeyErrors:
    """Substring checks on the text built by ``format_missing_api_key_errors``."""

    def test_known_provider_openai(self) -> None:
        """The OpenAI message has its env var, key URL and display name."""
        message = format_missing_api_key_errors(cs.Provider.OPENAI)
        for fragment in (
            "OPENAI_API_KEY",
            "https://platform.openai.com/api-keys",
            "OpenAI",
        ):
            assert fragment in message

    def test_known_provider_anthropic(self) -> None:
        """The Anthropic message has its env var and display name."""
        message = format_missing_api_key_errors(cs.Provider.ANTHROPIC)
        for fragment in ("ANTHROPIC_API_KEY", "Anthropic"):
            assert fragment in message

    def test_unknown_provider_generic_message(self) -> None:
        """A plain provider string yields an upper-cased env var and a capitalised name."""
        message = format_missing_api_key_errors("deepseek")
        for fragment in ("DEEPSEEK_API_KEY", "Deepseek"):
            assert fragment in message

    def test_role_appears_in_message(self) -> None:
        """An explicit ``role`` is echoed as "for <role>"."""
        message = format_missing_api_key_errors(cs.Provider.OPENAI, role="cypher")
        assert "for cypher" in message

    def test_default_role_omits_role_from_message(self) -> None:
        """Without ``role`` the text "for model" does not appear."""
        message = format_missing_api_key_errors(cs.Provider.OPENAI)
        assert "for model" not in message

    def test_case_insensitive_lookup(self) -> None:
        """A mixed-case provider string still produces the OpenAI message."""
        message = format_missing_api_key_errors("OpenAI")
        for fragment in ("OPENAI_API_KEY", "OpenAI"):
            assert fragment in message
class _Capture:
    """Ingestor stand-in that records relationships and discards the rest.

    Node writes, flushes and write queries are accepted as no-ops, and
    ``fetch_all`` always reports no rows. Each relationship is appended to
    ``rels`` as ``(from_spec[2], str(rel_type), to_spec[2])``.
    """

    def __init__(self) -> None:
        self.rels: list[tuple[PropertyValue, str, PropertyValue]] = []

    def ensure_node_batch(self, label: str, properties: PropertyDict) -> None:
        """Accept a node write and ignore it."""

    def ensure_relationship_batch(
        self,
        from_spec: tuple[str, str, PropertyValue],
        rel_type: str,
        to_spec: tuple[str, str, PropertyValue],
        properties: PropertyDict | None = None,
    ) -> None:
        """Record one edge, keeping only the third element of each endpoint spec."""
        source_value = from_spec[2]
        target_value = to_spec[2]
        edge = (source_value, str(rel_type), target_value)
        self.rels.append(edge)

    def flush_all(self) -> None:
        """Nothing is buffered, so there is nothing to flush."""

    def fetch_all(
        self, query: str, params: PropertyDict | None = None
    ) -> list[ResultRow]:
        """Report an empty result set for every read query."""
        no_rows: list[ResultRow] = []
        return no_rows

    def execute_write(self, query: str, params: PropertyDict | None = None) -> None:
        """Accept a write query and ignore it."""
def _get_method_qns(mock_ingestor: MagicMock) -> set[str]:
    """Collect the ``qualified_name`` of every recorded Method node.

    Each entry returned by ``get_nodes(mock_ingestor, "Method")`` is read as
    ``entry[0][1]["qualified_name"]``.
    """
    qualified_names: set[str] = set()
    for node_call in get_nodes(mock_ingestor, "Method"):
        positional_args = node_call[0]
        properties = positional_args[1]
        qualified_names.add(properties["qualified_name"])
    return qualified_names
def _method_names_for_class(mock_ingestor: MagicMock, class_name: str) -> set[str]:
    """Return simple method names whose DEFINES_METHOD source mentions *class_name*.

    A class matches when *class_name* equals any ``SEPARATOR_DOT``-delimited
    segment of its qualified name. The method's simple name is the last
    segment of the method's qualified name.
    """
    return {
        method_qn.rsplit(SEPARATOR_DOT, 1)[-1]
        for class_qn, method_qn in _get_defines_method_edges(mock_ingestor)
        if class_name in class_qn.split(SEPARATOR_DOT)
    }
+ class_qns = _get_class_qns(mock_ingestor) + header_class = [qn for qn in class_qns if "include" in qn and "Calculator" in qn] + assert header_class, ( + f"Expected a Calculator class in include/, got classes: {class_qns}" + ) + + # All three out-of-class methods should have DEFINES_METHOD edges + # pointing to the *header* class, not to a phantom class in src/. + edges = _get_defines_method_edges(mock_ingestor) + header_class_qn = header_class[0] + methods_linked_to_header = { + mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn + } + + assert "add" in methods_linked_to_header, ( + f"'add' not linked to header class. Edges: {edges}" + ) + assert "subtract" in methods_linked_to_header, ( + f"'subtract' not linked to header class. Edges: {edges}" + ) + assert "divide" in methods_linked_to_header, ( + f"'divide' not linked to header class. Edges: {edges}" + ) + + # There should be NO orphan Method nodes (methods whose container_qn + # uses the .cpp module instead of the .h module). 
+ method_qns = _get_method_qns(mock_ingestor) + orphan_methods = { + qn + for qn in method_qns + if "src.Calculator" in qn and "Calculator.Calculator" in qn + } + assert not orphan_methods, ( + f"Found orphan methods with .cpp module QN: {orphan_methods}" + ) + + +# --------------------------------------------------------------------------- +# Test: multiple source files implementing one header class +# --------------------------------------------------------------------------- + + +def test_multiple_source_files_one_class( + cpp_cross_file_project: Path, + mock_ingestor: MagicMock, +) -> None: + """Two .cpp files implement methods of one class declared in .h.""" + include = cpp_cross_file_project / "include" + include.mkdir() + src = cpp_cross_file_project / "src" + src.mkdir() + + (include / "Engine.h").write_text( + encoding="utf-8", + data="""\ +#pragma once + +class Engine { +public: + void start(); + void stop(); + void accelerate(int speed); + void brake(); +}; +""", + ) + + (src / "engine_control.cpp").write_text( + encoding="utf-8", + data="""\ +#include "Engine.h" + +void Engine::start() { /* ... */ } +void Engine::stop() { /* ... */ } +""", + ) + + (src / "engine_movement.cpp").write_text( + encoding="utf-8", + data="""\ +#include "Engine.h" + +void Engine::accelerate(int speed) { /* ... */ } +void Engine::brake() { /* ... */ } +""", + ) + + run_updater(cpp_cross_file_project, mock_ingestor) + + class_qns = _get_class_qns(mock_ingestor) + header_classes = [qn for qn in class_qns if "include" in qn and "Engine" in qn] + assert header_classes, f"Expected Engine class in include/, got: {class_qns}" + header_class_qn = header_classes[0] + + edges = _get_defines_method_edges(mock_ingestor) + methods_linked = { + mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn + } + + for method_name in ("start", "stop", "accelerate", "brake"): + assert method_name in methods_linked, ( + f"'{method_name}' not linked to header Engine class. 
" + f"Linked methods: {methods_linked}" + ) + + +# --------------------------------------------------------------------------- +# Test: constructor and destructor out-of-class across files +# --------------------------------------------------------------------------- + + +def test_cross_file_constructor_destructor( + cpp_cross_file_project: Path, + mock_ingestor: MagicMock, +) -> None: + """Constructors and destructors implemented in .cpp link to .h class.""" + include = cpp_cross_file_project / "include" + include.mkdir() + src = cpp_cross_file_project / "src" + src.mkdir() + + (include / "Resource.h").write_text( + encoding="utf-8", + data="""\ +#pragma once + +class Resource { +public: + Resource(); + Resource(int size); + ~Resource(); + void reset(); +private: + int* data_; +}; +""", + ) + + (src / "Resource.cpp").write_text( + encoding="utf-8", + data="""\ +#include "Resource.h" + +Resource::Resource() : data_(nullptr) {} + +Resource::Resource(int size) { + data_ = new int[size]; +} + +Resource::~Resource() { + delete[] data_; +} + +void Resource::reset() { + delete[] data_; + data_ = nullptr; +} +""", + ) + + run_updater(cpp_cross_file_project, mock_ingestor) + + class_qns = _get_class_qns(mock_ingestor) + header_classes = [qn for qn in class_qns if "include" in qn and "Resource" in qn] + assert header_classes, f"Expected Resource class in include/, got: {class_qns}" + header_class_qn = header_classes[0] + + edges = _get_defines_method_edges(mock_ingestor) + methods_linked = { + mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn + } + + assert "Resource" in methods_linked, ( + f"Constructor not linked to header class. Methods: {methods_linked}" + ) + assert "~Resource" in methods_linked, ( + f"Destructor not linked to header class. Methods: {methods_linked}" + ) + assert "reset" in methods_linked, ( + f"'reset' not linked to header class. 
Methods: {methods_linked}" + ) + + +# --------------------------------------------------------------------------- +# Test: nested namespace cross-file methods +# --------------------------------------------------------------------------- + + +def test_nested_namespace_cross_file( + cpp_cross_file_project: Path, + mock_ingestor: MagicMock, +) -> None: + """Class inside nested namespaces, methods implemented in separate .cpp.""" + include = cpp_cross_file_project / "include" + include.mkdir() + src = cpp_cross_file_project / "src" + src.mkdir() + + (include / "Logger.h").write_text( + encoding="utf-8", + data="""\ +#pragma once + +namespace app { +namespace logging { + +class Logger { +public: + void info(const char* msg); + void error(const char* msg); +}; + +} // namespace logging +} // namespace app +""", + ) + + (src / "Logger.cpp").write_text( + encoding="utf-8", + data="""\ +#include "Logger.h" + +namespace app { +namespace logging { + +void Logger::info(const char* msg) { /* ... */ } +void Logger::error(const char* msg) { /* ... */ } + +} // namespace logging +} // namespace app +""", + ) + + run_updater(cpp_cross_file_project, mock_ingestor) + + class_qns = _get_class_qns(mock_ingestor) + header_classes = [qn for qn in class_qns if "include" in qn and "Logger" in qn] + assert header_classes, f"Expected Logger class in include/, got: {class_qns}" + header_class_qn = header_classes[0] + + edges = _get_defines_method_edges(mock_ingestor) + methods_linked = { + mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn + } + + assert "info" in methods_linked, ( + f"'info' not linked to header Logger. Methods: {methods_linked}" + ) + assert "error" in methods_linked, ( + f"'error' not linked to header Logger. 
Methods: {methods_linked}" + ) + + +# --------------------------------------------------------------------------- +# Test: no orphan methods remain (aggregate check) +# --------------------------------------------------------------------------- + + +def test_no_orphan_methods_across_files( + cpp_cross_file_project: Path, + mock_ingestor: MagicMock, +) -> None: + """Every Method node must have at least one incoming DEFINES_METHOD edge.""" + include = cpp_cross_file_project / "include" + include.mkdir() + src = cpp_cross_file_project / "src" + src.mkdir() + + (include / "Widget.h").write_text( + encoding="utf-8", + data="""\ +#pragma once + +class Widget { +public: + void draw(); + void resize(int w, int h); + void hide(); +}; +""", + ) + + (src / "Widget.cpp").write_text( + encoding="utf-8", + data="""\ +#include "Widget.h" + +void Widget::draw() { /* ... */ } +void Widget::resize(int w, int h) { /* ... */ } +void Widget::hide() { /* ... */ } +""", + ) + + run_updater(cpp_cross_file_project, mock_ingestor) + + method_qns = _get_method_qns(mock_ingestor) + edges = _get_defines_method_edges(mock_ingestor) + methods_with_edges = {mq for _, mq in edges} + + orphans = method_qns - methods_with_edges + # Filter to only methods belonging to Widget (other methods from inline + # definitions always have edges). 
+ widget_orphans = {qn for qn in orphans if "Widget" in qn} + assert not widget_orphans, ( + f"Found orphan Widget Method nodes with no DEFINES_METHOD edge: " + f"{widget_orphans}" + ) + + +# --------------------------------------------------------------------------- +# Test: same-file out-of-class still works (regression) +# --------------------------------------------------------------------------- + + +def test_same_file_out_of_class_still_works( + cpp_cross_file_project: Path, + mock_ingestor: MagicMock, +) -> None: + """When class and implementations are in the same .cpp, nothing breaks.""" + (cpp_cross_file_project / "single.cpp").write_text( + encoding="utf-8", + data="""\ +class Foo { +public: + void bar(); + int baz(int x); +}; + +void Foo::bar() { /* ... */ } +int Foo::baz(int x) { return x; } +""", + ) + + run_updater(cpp_cross_file_project, mock_ingestor) + + method_names = _method_names_for_class(mock_ingestor, "Foo") + assert "bar" in method_names, f"Expected 'bar', got: {method_names}" + assert "baz" in method_names, f"Expected 'baz', got: {method_names}" diff --git a/codebase_rag/tests/test_cpp_cross_file_singleton.py b/codebase_rag/tests/test_cpp_cross_file_singleton.py index 403d16c4b..023d82226 100644 --- a/codebase_rag/tests/test_cpp_cross_file_singleton.py +++ b/codebase_rag/tests/test_cpp_cross_file_singleton.py @@ -147,15 +147,21 @@ def test_cpp_singleton_pattern_cross_file_calls( found_calls.add((caller_short, callee_short)) + # (H) Calls are attributed to the enclosing method/function, not the file: + # the singleton calls live inside SceneController's methods and + # Application.start(), so those are the callers (not the module nodes). 
+ sc = "controllers.SceneController.SceneController" expected_calls = [ - ("controllers.SceneController", "storage.Storage.Storage.getInstance"), - ("controllers.SceneController", "storage.Storage.Storage.clearAll"), - ("controllers.SceneController", "storage.Storage.Storage.save"), - ("controllers.SceneController", "storage.Storage.Storage.load"), - ("main", "controllers.SceneController.SceneController.loadMenuScene"), - ("main", "controllers.SceneController.SceneController.loadGameScene"), - ("main", "storage.Storage.Storage.getInstance"), - ("main", "storage.Storage.Storage.load"), + (f"{sc}.loadMenuScene", "storage.Storage.Storage.getInstance"), + (f"{sc}.loadMenuScene", "storage.Storage.Storage.clearAll"), + (f"{sc}.loadMenuScene", "storage.Storage.Storage.save"), + (f"{sc}.loadMenuScene", "storage.Storage.Storage.load"), + (f"{sc}.loadGameScene", "storage.Storage.Storage.getInstance"), + (f"{sc}.loadGameScene", "storage.Storage.Storage.save"), + ("main.Application.start", f"{sc}.loadMenuScene"), + ("main.Application.start", f"{sc}.loadGameScene"), + ("main.Application.start", "storage.Storage.Storage.getInstance"), + ("main.Application.start", "storage.Storage.Storage.load"), ("main.main", "main.Application.start"), ] diff --git a/codebase_rag/tests/test_cpp_crosslang_qn_collision.py b/codebase_rag/tests/test_cpp_crosslang_qn_collision.py new file mode 100644 index 000000000..6935b5ece --- /dev/null +++ b/codebase_rag/tests/test_cpp_crosslang_qn_collision.py @@ -0,0 +1,59 @@ +# (H) Regression: a C++ out-of-class method (Widget::render) must not bind to a +# (H) same-named class in another language (Python's Widget), which would give the +# (H) two methods an identical qualified_name and collapse them under the graph's +# (H) qualified_name unique constraint (silently dropping the Python method). 
def _methods_named(mock_ingestor: MagicMock, name: str) -> list[tuple[str, str]]:
    """List ``(qualified_name, path)`` for Method nodes whose last segment is *name*.

    Both values are read from each node's property mapping with ``.get`` and
    passed through ``str``.
    """
    all_props = (node[0][1] for node in get_nodes(mock_ingestor, NodeLabel.METHOD))
    # (H) The walrus binds the qn in the filter so the result tuple reuses it
    # (H) instead of reading the property a second time.
    return [
        (qualified_name, str(props.get(KEY_PATH)))
        for props in all_props
        if (qualified_name := str(props.get(KEY_QUALIFIED_NAME))).rpartition(".")[2]
        == name
    ]
+ assert len(qns) == len(set(qns)), f"colliding render qns: {renders}" + + py_qns = {qn for qn, path in renders if path.endswith("widget.py")} + cpp_qns = {qn for qn, path in renders if path.endswith("widget.cpp")} + assert py_qns, f"python Widget.render missing: {renders}" + assert cpp_qns, f"cpp Widget::render missing: {renders}" + assert py_qns.isdisjoint(cpp_qns), ( + f"cpp method bound to python class qn: py={py_qns} cpp={cpp_qns}" + ) diff --git a/codebase_rag/tests/test_cpp_frontend_calls.py b/codebase_rag/tests/test_cpp_frontend_calls.py new file mode 100644 index 000000000..5b6737cd4 --- /dev/null +++ b/codebase_rag/tests/test_cpp_frontend_calls.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.parsers.cpp_frontend import cpp_frontend_available, run_cpp_frontend + +pytestmark = pytest.mark.skipif( + not cpp_frontend_available(), + reason="libclang not available", +) + +# (H) An out-of-line method calling a free function. tree-sitter's cgr path +# (H) historically dangled the caller qn (PR #47); libclang resolves the call +# (H) target via cursor.referenced with no name heuristic, and the frontend +# (H) anchors the caller to the method node itself. 
def _calls(ingestor: MagicMock) -> list[tuple[str, str, str, str]]:
    """Extract the CALLS edges recorded on ``ensure_relationship_batch``.

    Each recorded call is read positionally as ``(from_spec, rel_type,
    to_spec)``, where both specs are 3-tuples whose first and third items
    are kept.

    Returns:
        ``(from_label, from_qn, to_label, to_qn)`` tuples, in call order, for
        every call whose relationship type equals ``"CALLS"``.
    """
    edges: list[tuple[str, str, str, str]] = []
    for recorded in ingestor.ensure_relationship_batch.call_args_list:
        positional = recorded.args
        if positional[1] != "CALLS":
            continue
        source_label, _, source_qn = positional[0]
        target_label, _, target_qn = positional[2]
        edges.append((source_label, source_qn, target_label, target_qn))
    return edges
+ assert any( + from_label == "Method" + and from_qn.endswith(".m.Calc.add") + and to_label == "Function" + and to_qn.endswith(".m.helper") + for from_label, from_qn, to_label, to_qn in calls + ), f"expected Calc.add CALLS helper, got {calls}" diff --git a/codebase_rag/tests/test_cpp_frontend_qn_parity.py b/codebase_rag/tests/test_cpp_frontend_qn_parity.py new file mode 100644 index 000000000..1b0d301e9 --- /dev/null +++ b/codebase_rag/tests/test_cpp_frontend_qn_parity.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.parsers.cpp_frontend import cpp_frontend_available, run_cpp_frontend +from codebase_rag.tests.conftest import get_nodes, get_qualified_names, run_updater + +pytestmark = pytest.mark.skipif( + not cpp_frontend_available(), + reason="libclang not available", +) + +# (H) A macro-free C++ corpus: a namespaced class declared in a header with +# (H) in-class declarations + one inline method, its out-of-line definitions in +# (H) the .cpp, a free-function prototype in the header, and free-function +# (H) definitions in the .cpp. Macro-free so the tree-sitter path parses it +# (H) correctly and its qualified names are the ground truth the libclang +# (H) frontend must reproduce exactly (the issue #46 acceptance test). 
def _write_project(root: Path) -> None:
    """Create the macro-free geometry fixture project at *root*.

    Delegates to ``_write_cpp_project`` so this fixture and the macro and
    inheritance fixtures share one layout routine. The result is
    ``geometry.h`` holding ``HEADER``, ``geometry.cpp`` holding ``SRC``, and a
    ``compile_commands.json`` with a single entry for ``geometry.cpp``.

    Args:
        root: Directory to create; it must not exist yet.
    """
    # (H) The helper derives the .cpp name from the header stem, so passing
    # (H) "geometry.h" yields the same geometry.cpp as the former inline copy.
    _write_cpp_project(root, "geometry.h", HEADER, SRC)
encoding="utf-8") + compile_commands = [ + { + "directory": str(root), + "arguments": ["c++", "-std=c++17", str(root / cpp_name)], + "file": str(root / cpp_name), + } + ] + (root / "compile_commands.json").write_text( + json.dumps(compile_commands), encoding="utf-8" + ) + + +# (H) A macro that tree-sitter cannot expand: `struct WIDGET_API Widget` is +# (H) mis-parsed (WIDGET_API is read as the type), so cgr loses the `Widget` +# (H) class entirely. libclang expands the macro and recovers it with its true +# (H) multi-line span. This is the whole reason the frontend exists. +_MACRO_HEADER = """ +#define WIDGET_API + +namespace ui { + +struct WIDGET_API Widget { + int handle; + void show(); + void hide(); +}; + +} // namespace ui +""" + +_MACRO_SRC = """ +#include "widget.h" +namespace ui { +void Widget::show() {} +void Widget::hide() {} +} +""" + + +def test_frontend_recovers_macro_mangled_class(temp_repo: Path) -> None: + root = temp_repo / "macroproj" + _write_cpp_project(root, "widget.h", _MACRO_HEADER, _MACRO_SRC) + + ts_ingestor = MagicMock() + run_updater(root, ts_ingestor) + ts_classes = get_qualified_names(get_nodes(ts_ingestor, "Class")) + + fe_ingestor = MagicMock() + run_cpp_frontend(fe_ingestor, root, root.name, root) + fe_class_nodes = get_nodes(fe_ingestor, "Class") + fe_classes = get_qualified_names(fe_class_nodes) + + # (H) tree-sitter loses Widget to the macro; the frontend recovers it. 
+ assert not any(q.endswith(".ui.Widget") for q in ts_classes), ( + f"expected tree-sitter to mis-parse Widget, got {ts_classes}" + ) + assert any(q.endswith(".ui.Widget") for q in fe_classes), ( + f"frontend did not recover Widget: {fe_classes}" + ) + + widget = next( + c[0][1] for c in fe_class_nodes if c[0][1]["qualified_name"].endswith(".Widget") + ) + assert widget["end_line"] > widget["start_line"], ( + f"expected a real multi-line span for Widget, got {widget}" + ) + + +_INHERIT_HEADER = """ +namespace geo { + +class Base { +public: + virtual void run(); +}; + +class Derived : public Base { +public: + void run(); + Derived operator+(const Derived& o) const; +}; + +} // namespace geo +""" + +_INHERIT_SRC = """ +#include "shapes.h" +namespace geo { +void Base::run() {} +void Derived::run() {} +Derived Derived::operator+(const Derived& o) const { return *this; } +} +""" + + +def test_frontend_emits_inheritance_and_operator(temp_repo: Path) -> None: + root = temp_repo / "shapesproj" + _write_cpp_project(root, "shapes.h", _INHERIT_HEADER, _INHERIT_SRC) + + fe_ingestor = MagicMock() + run_cpp_frontend(fe_ingestor, root, root.name, root) + + methods = get_qualified_names(get_nodes(fe_ingestor, "Method")) + assert any(q.endswith(".geo.Derived.operator_plus") for q in methods), ( + f"operator+ not converted: {sorted(methods)}" + ) + + inherits = [ + (c.args[0][2], c.args[2][2]) + for c in fe_ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == "INHERITS" + ] + assert any( + src.endswith(".geo.Derived") and dst.endswith(".Base") for src, dst in inherits + ), f"expected Derived INHERITS Base, got {inherits}" diff --git a/codebase_rag/tests/test_cpp_frontend_types.py b/codebase_rag/tests/test_cpp_frontend_types.py new file mode 100644 index 000000000..803448ef4 --- /dev/null +++ b/codebase_rag/tests/test_cpp_frontend_types.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock 
+ +import pytest + +from codebase_rag.parsers.cpp_frontend import cpp_frontend_available, run_cpp_frontend +from codebase_rag.tests.conftest import get_nodes, get_qualified_names + +pytestmark = pytest.mark.skipif( + not cpp_frontend_available(), + reason="libclang not available", +) + +# (H) C++ type aliases: namespace-scoped `using`/`typedef` and a class-scoped +# (H) member alias. The tree-sitter path emits no Type nodes for these, so the +# (H) frontend adds them (mirroring how Go/Rust type decls become Type nodes). +_HEADER = """ +namespace n { + +using Meters = double; +typedef int Count; + +class Box { +public: + using Handle = int; +}; + +} // namespace n +""" + +_SRC = '#include "types.h"\n' + + +def _write(root: Path) -> None: + root.mkdir() + (root / "types.h").write_text(_HEADER, encoding="utf-8") + (root / "types.cpp").write_text(_SRC, encoding="utf-8") + (root / "compile_commands.json").write_text( + json.dumps( + [ + { + "directory": str(root), + "arguments": ["c++", "-std=c++17", str(root / "types.cpp")], + "file": str(root / "types.cpp"), + } + ] + ), + encoding="utf-8", + ) + + +def test_frontend_emits_type_aliases(temp_repo: Path) -> None: + root = temp_repo / "typesproj" + _write(root) + + ingestor = MagicMock() + run_cpp_frontend(ingestor, root, root.name, root) + + types = get_qualified_names(get_nodes(ingestor, "Type")) + assert any(q.endswith(".n.Meters") for q in types), f"missing using alias: {types}" + assert any(q.endswith(".n.Count") for q in types), f"missing typedef: {types}" + assert any(q.endswith(".n.Box.Handle") for q in types), ( + f"missing class-scoped alias: {types}" + ) + + defines = [ + (c.args[0][0], c.args[0][2], c.args[2][2]) + for c in ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == "DEFINES" + ] + # (H) namespace-scoped alias defined by its Module; member alias by its Class. 
+ assert any( + src_label == "Module" and child.endswith(".n.Meters") + for src_label, _, child in defines + ), f"Module should DEFINE Meters: {defines}" + assert any( + src_label == "Class" + and src_qn.endswith(".n.Box") + and child.endswith(".n.Box.Handle") + for src_label, src_qn, child in defines + ), f"Box should DEFINE Handle: {defines}" diff --git a/codebase_rag/tests/test_cpp_frontend_wiring.py b/codebase_rag/tests/test_cpp_frontend_wiring.py new file mode 100644 index 000000000..f2e167dbe --- /dev/null +++ b/codebase_rag/tests/test_cpp_frontend_wiring.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag import constants as cs +from codebase_rag import graph_updater as gu +from codebase_rag.parsers.cpp_frontend import cpp_frontend_available +from codebase_rag.tests.conftest import get_nodes, get_qualified_names, run_updater + +pytestmark = pytest.mark.skipif( + not cpp_frontend_available(), + reason="libclang not available", +) + +# (H) `struct WIDGET_API Widget` is a macro tree-sitter cannot expand: it loses +# (H) the Widget class. The libclang frontend recovers it. The wiring decides +# (H) which path runs, gated on CPP_FRONTEND + a discoverable compile_commands. 
+_HEADER = """ +#define WIDGET_API + +namespace ui { + +struct WIDGET_API Widget { + int handle; + void show(); +}; + +} // namespace ui +""" + +_SRC = """ +#include "widget.h" +namespace ui { +void Widget::show() {} +} +""" + + +def _write_project(root: Path) -> None: + root.mkdir() + (root / "widget.h").write_text(_HEADER, encoding="utf-8") + (root / "widget.cpp").write_text(_SRC, encoding="utf-8") + (root / "compile_commands.json").write_text( + json.dumps( + [ + { + "directory": str(root), + "arguments": ["c++", "-std=c++17", str(root / "widget.cpp")], + "file": str(root / "widget.cpp"), + } + ] + ), + encoding="utf-8", + ) + + +def test_default_treesitter_does_not_recover_macro_class(temp_repo: Path) -> None: + root = temp_repo / "defaultproj" + _write_project(root) + + ingestor = MagicMock() + run_updater(root, ingestor) + classes = get_qualified_names(get_nodes(ingestor, "Class")) + + # (H) No regression: with the default flag, indexing is the tree-sitter path, + # (H) which mis-parses the macro and never produces ui.Widget. + assert not any(q.endswith(".ui.Widget") for q in classes), ( + f"default path should not engage the frontend: {classes}" + ) + + +def test_libclang_frontend_recovers_macro_class( + temp_repo: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + root = temp_repo / "libclangproj" + _write_project(root) + + monkeypatch.setattr(gu.settings, "CPP_FRONTEND", cs.CppFrontend.LIBCLANG) + + ingestor = MagicMock() + run_updater(root, ingestor) + + classes = get_qualified_names(get_nodes(ingestor, "Class")) + methods = get_qualified_names(get_nodes(ingestor, "Method")) + + # (H) The frontend recovers the real class and binds the out-of-line method. 
+ assert any(q.endswith(".ui.Widget") for q in classes), ( + f"frontend did not recover Widget: {classes}" + ) + assert any(q.endswith(".ui.Widget.show") for q in methods), ( + f"frontend did not bind Widget::show: {methods}" + ) + # (H) The covered file was NOT also processed by tree-sitter (no double-parse + # (H) producing the macro-mangled class). + assert not any(q.endswith(".ui.WIDGET_API") for q in classes), ( + f"tree-sitter should have skipped the covered file: {classes}" + ) diff --git a/codebase_rag/tests/test_cpp_oracle.py b/codebase_rag/tests/test_cpp_oracle.py new file mode 100644 index 000000000..bfed9aac5 --- /dev/null +++ b/codebase_rag/tests/test_cpp_oracle.py @@ -0,0 +1,215 @@ +# (H) Covers the C++ structure oracle (evals/oracles/cpp_oracle.py): a libclang +# (H) oracle driven by a compile_commands.json resolves #includes and expands +# (H) macros to the true translation-unit AST, which tree-sitter cannot do. cgr's +# (H) C++ nodes, containment edges, and spans are graded against it on +# (H) (kind, file, start_line). The sample exercises a header-declared class +# (H) (resolved via an -I include path), a macro-typed method, out-of-class method +# (H) definitions, a constructor, an inline method, a struct, and a free function. 
+from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_cpp_graph, restrict_to_files +from evals.oracles import cpp_available, run_cpp_oracle +from evals.score import ( + score_edge_types, + score_name_edge_types, + score_node_kinds, + score_span, +) +from evals.types_defs import ( + DefNode, + EdgeKey, + GraphData, + NameEdge, + NodeKey, + ScoreRow, +) + +SHAPE_H = """\ +#pragma once +#define AREA_T double + +struct Point { + int x; + int y; +}; + +class Shape { +public: + Shape(int id); + AREA_T area() const; + void scale( + double factor + ); + int inline_id() const { return id_; } +private: + int id_; +}; +""" + +SHAPE_CPP = """\ +#include "shape.h" + +Shape::Shape(int id) : id_(id) { +} + +AREA_T Shape::area() const { + return 1.0; +} + +void Shape::scale(double factor) { + id_ = static_cast(factor); +} + +int helper(int n) { + return n * 2; +} +""" + + +def _require_cpp() -> None: + if not cpp_available(): + pytest.skip("libclang not available") + if cs.SupportedLanguage.CPP not in load_parsers()[0]: + pytest.skip("cpp parser not available") + + +def _aggregate(rows: list[ScoreRow]) -> ScoreRow | None: + return next((r for r in rows if r["label"] == ec.AGGREGATE_LABEL), None) + + +def test_cgr_matches_libclang_oracle_on_cpp_structure(tmp_path: Path) -> None: + _require_cpp() + project = tmp_path / "cpp_proj" + (project / "include").mkdir(parents=True) + (project / "src").mkdir(parents=True) + (project / "include" / "shape.h").write_text(SHAPE_H, encoding="utf-8") + (project / "src" / "shape.cpp").write_text(SHAPE_CPP, encoding="utf-8") + + src = (project / "src" / "shape.cpp").resolve() + include = (project / "include").resolve() + compdb = [ + { + "directory": str(project.resolve()), + "file": str(src), + "command": f"clang++ -std=c++17 -I{include} -c 
{src}", + } + ] + (project / ec.CPP_COMPDB_FILENAME).write_text(json.dumps(compdb), encoding="utf-8") + + cgr = extract_cgr_cpp_graph(project, project.name) + oracle = run_cpp_oracle(project) + + for label, result in ( + ("nodes", score_node_kinds(cgr, oracle, ec.CPP_SCORED_NODE_KINDS)), + ("edges", score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES)), + ("spans", score_span(cgr, oracle, ec.CPP_SCORED_NODE_KINDS)), + ): + aggregate = _aggregate(result.rows) + assert aggregate is not None, (label, result.rows, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + label, + aggregate, + result.diff, + ) + # (H) Guard the sample is non-trivial (class + struct + 4 methods + function). + node_aggregate = _aggregate( + score_node_kinds(cgr, oracle, ec.CPP_SCORED_NODE_KINDS).rows + ) + assert node_aggregate is not None and node_aggregate["tp"] >= 7, node_aggregate + + +INHERIT_H = """\ +#pragma once +struct Base { int v; }; +struct Derived : public Base { + int w; +}; +""" + +INHERIT_CPP = """\ +#include "shapes.h" + +int use(Derived d) { + return d.v + d.w; +} +""" + + +def test_libclang_oracle_emits_inherits_edges(tmp_path: Path) -> None: + # (H) The oracle must emit a base-class (CXX_BASE_SPECIFIER) edge as an INHERITS + # (H) name edge keyed by the base's simple name, matching cgr; otherwise cgr's + # (H) real inheritance edges are graded against an empty oracle set (all fp). 
+ _require_cpp() + project = tmp_path / "inh_proj" + (project / "include").mkdir(parents=True) + (project / "src").mkdir(parents=True) + (project / "include" / "shapes.h").write_text(INHERIT_H, encoding="utf-8") + (project / "src" / "use.cpp").write_text(INHERIT_CPP, encoding="utf-8") + + src = (project / "src" / "use.cpp").resolve() + include = (project / "include").resolve() + compdb = [ + { + "directory": str(project.resolve()), + "file": str(src), + "command": f"clang++ -std=c++17 -I{include} -c {src}", + } + ] + (project / ec.CPP_COMPDB_FILENAME).write_text(json.dumps(compdb), encoding="utf-8") + + cgr = extract_cgr_cpp_graph(project, project.name) + oracle = run_cpp_oracle(project) + + result = score_name_edge_types(cgr, oracle, ec.INHERITANCE_NAME_EDGE_TYPES) + aggregate = _aggregate(result.rows) + assert aggregate is not None, (result.rows, result.diff) + assert aggregate["tp"] >= 1, (aggregate, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + + +def test_restrict_to_files_scopes_graph_to_universe() -> None: + # (H) Scale grading over a compile_commands.json must score cgr only on the + # (H) files the oracle actually compiled; restrict_to_files drops cgr nodes, + # (H) edges, and name edges that touch any out-of-universe file. 
+ keep = "include/a.h" + drop = "test/gtest.h" + mod_keep = NodeKey(cs.NodeLabel.MODULE.value, keep, ec.MODULE_START_LINE) + cls_keep = NodeKey(cs.NodeLabel.CLASS.value, keep, 3) + cls_drop = NodeKey(cs.NodeLabel.CLASS.value, drop, 5) + graph = GraphData( + nodes={ + cls_keep: DefNode(cls_keep, "Keep", 9), + cls_drop: DefNode(cls_drop, "Drop", 11), + }, + edges={ + EdgeKey(cs.RelationshipType.DEFINES.value, mod_keep, cls_keep), + EdgeKey( + cs.RelationshipType.DEFINES.value, + NodeKey(cs.NodeLabel.MODULE.value, drop, ec.MODULE_START_LINE), + cls_drop, + ), + }, + name_edges={ + NameEdge(cs.RelationshipType.INHERITS.value, cls_keep, "Other"), + NameEdge(cs.RelationshipType.INHERITS.value, cls_drop, "Other"), + }, + ) + + scoped = restrict_to_files(graph, {keep}) + + assert set(scoped.nodes) == {cls_keep} + assert all(e.parent.file == keep and e.child.file == keep for e in scoped.edges) + assert len(scoped.edges) == 1 + assert {n.source.file for n in scoped.name_edges} == {keep} + assert len(scoped.name_edges) == 1 diff --git a/codebase_rag/tests/test_cpp_out_of_class_method_calls.py b/codebase_rag/tests/test_cpp_out_of_class_method_calls.py new file mode 100644 index 000000000..27173dc76 --- /dev/null +++ b/codebase_rag/tests/test_cpp_out_of_class_method_calls.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.tests.conftest import ( + get_nodes, + get_qualified_names, + get_relationships, + run_updater, +) + +# (H) An out-of-line C++ method definition (`int Calculator::add(...) {...}` at +# (H) namespace/file scope) calling a free function. cgr's definition pass binds +# (H) the METHOD node to the class (qn `...Calculator.add`), but the call pass +# (H) computed the caller qn as a module-rooted free function (`...calc.add`), +# (H) so the CALLS edge's source dangled (matched no node). 
The caller of a call +# (H) inside an out-of-line method body must be the method's own node qn. +CPP_SOURCE = """ +class Calculator { +public: + int add(int a, int b); +}; + +int helper_fn(int x) { return x + 1; } + +int Calculator::add(int a, int b) { + return helper_fn(a) + b; +} +""" + + +def test_out_of_class_method_call_attributed_to_method_qn( + temp_repo: Path, + mock_ingestor: MagicMock, +) -> None: + project = temp_repo / "cpp_ooc_calls" + project.mkdir() + (project / "calc.cpp").write_text(CPP_SOURCE, encoding="utf-8") + + run_updater(project, mock_ingestor) + + method_qns = get_qualified_names(get_nodes(mock_ingestor, "Method")) + add_qn = next((q for q in method_qns if q.endswith(".Calculator.add")), None) + assert add_qn is not None, f"no Calculator.add Method node: {method_qns}" + + calls = get_relationships(mock_ingestor, "CALLS") + # (H) ensure_relationship_batch(from_spec, rel_type, to_spec): from_spec[2] is + # (H) the caller qn, to_spec[2] the callee qn. + callers_of_helper = { + c.args[0][2] for c in calls if "helper_fn" in str(c.args[2][2]) + } + assert add_qn in callers_of_helper, ( + f"expected CALLS from {add_qn} to helper_fn; " + f"got callers {sorted(callers_of_helper)}" + ) diff --git a/codebase_rag/tests/test_cypher_validation.py b/codebase_rag/tests/test_cypher_validation.py new file mode 100644 index 000000000..8a1c9017e --- /dev/null +++ b/codebase_rag/tests/test_cypher_validation.py @@ -0,0 +1,258 @@ +import re + +import pytest + +from codebase_rag import constants as cs +from codebase_rag import exceptions as ex +from codebase_rag.services.llm import ( + _build_keyword_pattern, + _validate_call_procedures, + _validate_cypher_read_only, + _validate_no_unbounded_paths, +) + + +class TestBuildKeywordPattern: + def test_single_word_uses_word_boundaries(self) -> None: + pattern = _build_keyword_pattern("DELETE") + assert pattern.search("DELETE n") is not None + assert pattern.search("XDELETE") is None + assert pattern.search("DELETEX") 
is None + + def test_multi_word_allows_whitespace_between_parts(self) -> None: + pattern = _build_keyword_pattern("LOAD CSV") + assert pattern.search("LOAD CSV") is not None + assert pattern.search("LOAD CSV") is not None + assert pattern.search("LOAD\nCSV") is not None + assert pattern.search("LOAD\t CSV") is not None + + def test_multi_word_allows_block_comment_between_parts(self) -> None: + pattern = _build_keyword_pattern("LOAD CSV") + assert pattern.search("LOAD/*bypass*/CSV") is not None + assert pattern.search("LOAD /* comment */ CSV") is not None + + def test_multi_word_allows_single_line_comment_between_parts(self) -> None: + pattern = _build_keyword_pattern("LOAD CSV") + assert pattern.search("LOAD //comment\nCSV") is not None + assert pattern.search("LOAD //\nCSV") is not None + + def test_multi_word_respects_word_boundaries(self) -> None: + pattern = _build_keyword_pattern("LOAD CSV") + assert pattern.search("PRELOAD CSV") is None + assert pattern.search("LOAD CSVX") is None + + def test_single_word_is_case_sensitive_on_input(self) -> None: + pattern = _build_keyword_pattern("DELETE") + assert pattern.search("DELETE") is not None + assert pattern.search("delete") is None + + def test_returns_compiled_pattern(self) -> None: + pattern = _build_keyword_pattern("SET") + assert isinstance(pattern, re.Pattern) + + def test_multi_word_has_dotall_flag(self) -> None: + pattern = _build_keyword_pattern("CREATE INDEX") + assert pattern.flags & re.DOTALL + + def test_all_dangerous_keywords_produce_valid_patterns(self) -> None: + for kw in cs.CYPHER_DANGEROUS_KEYWORDS: + pattern = _build_keyword_pattern(kw) + assert pattern.search(kw) is not None + + +class TestValidateCypherReadOnly: + def test_safe_match_query_passes(self) -> None: + _validate_cypher_read_only("MATCH (n) RETURN n;") + + def test_safe_match_with_where_passes(self) -> None: + _validate_cypher_read_only("MATCH (n:Function) WHERE n.name = 'foo' RETURN n;") + + def test_safe_optional_match_passes(self) 
-> None: + _validate_cypher_read_only( + "MATCH (a)-[:CALLS]->(b) OPTIONAL MATCH (b)-[:DEFINES]->(c) RETURN a, b, c;" + ) + + @pytest.mark.parametrize( + "keyword", + sorted(cs.CYPHER_DANGEROUS_KEYWORDS), + ) + def test_rejects_all_dangerous_keywords(self, keyword: str) -> None: + query = f"MATCH (n) {keyword} n;" + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only(query) + + def test_rejects_delete(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="DELETE"): + _validate_cypher_read_only("MATCH (n) DELETE n;") + + def test_rejects_detach_delete(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("MATCH (n) DETACH DELETE n;") + + def test_rejects_drop(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="DROP"): + _validate_cypher_read_only("MATCH (n) DROP INDEX idx;") + + def test_rejects_set(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="SET"): + _validate_cypher_read_only("MATCH (n) SET n.name = 'x';") + + def test_rejects_merge(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="MERGE"): + _validate_cypher_read_only("MERGE (n:Node {id: 1});") + + def test_rejects_create(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CREATE"): + _validate_cypher_read_only("CREATE (n:Node {name: 'test'});") + + def test_rejects_load_csv(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="LOAD CSV"): + _validate_cypher_read_only( + "LOAD CSV FROM 'http://evil.com/data.csv' AS row;" + ) + + def test_rejects_create_index(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CREATE INDEX"): + _validate_cypher_read_only("CREATE INDEX ON :Node(name);") + + def test_case_insensitive(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("match (n) delete n;") + + def test_rejects_block_comment_bypass(self) -> None: + with pytest.raises(ex.LLMGenerationError): + 
_validate_cypher_read_only("LOAD/*bypass*/CSV FROM 'http://evil.com';") + + def test_rejects_single_line_comment_bypass(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("LOAD //bypass\nCSV FROM 'http://evil.com';") + + def test_does_not_flag_substring_matches(self) -> None: + _validate_cypher_read_only("MATCH (n) WHERE n.name = 'DATASET' RETURN n;") + + def test_does_not_flag_reset(self) -> None: + _validate_cypher_read_only("MATCH (n) WHERE n.name = 'RESET' RETURN n;") + + def test_does_not_flag_created_at(self) -> None: + _validate_cypher_read_only("MATCH (n) WHERE n.created_at > 0 RETURN n;") + + def test_error_includes_keyword_and_query(self) -> None: + query = "MATCH (n) DELETE n;" + with pytest.raises(ex.LLMGenerationError, match="DELETE") as exc_info: + _validate_cypher_read_only(query) + assert query in str(exc_info.value) + + def test_rejects_foreach(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="FOREACH"): + _validate_cypher_read_only( + "MATCH p=(a)-[*]->(b) FOREACH (n IN nodes(p) | SET n.marked = true);" + ) + + def test_rejects_remove(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="REMOVE"): + _validate_cypher_read_only("MATCH (n) REMOVE n.prop;") + + def test_call_no_longer_in_keyword_blocklist(self) -> None: + _validate_cypher_read_only("CALL nxalg.strongly_connected_components();") + + def test_rejects_create_constraint(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CREATE CONSTRAINT"): + _validate_cypher_read_only( + "CREATE CONSTRAINT ON (n:Node) ASSERT n.id IS UNIQUE;" + ) + + def test_rejects_multiline_block_comment_bypass(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("LOAD/*\nbypass\n*/CSV FROM 'http://evil.com';") + + +class TestValidateNoUnboundedPaths: + @pytest.mark.parametrize( + "query", + [ + "MATCH (n) RETURN n;", + "MATCH (a)-[:CALLS]->(b) RETURN a, b;", + "MATCH (a)-[:CALLS*5]->(b) RETURN a, b;", + 
"MATCH (a)-[:CALLS*1..6]->(b) RETURN a, b;", + "MATCH (a)-[:CALLS*..6]->(b) RETURN a, b;", + "MATCH (a)-[r:CALLS*1..6]->(b) RETURN r;", + "MATCH (a)-[*1..3]->(b) RETURN a, b;", + "MATCH (a)-[:CALLS*2..2]->(b) RETURN a, b;", + "MATCH (a)-[:CALLS*1..6 {weight: 1}]->(b) RETURN a, b;", + ], + ) + def test_bounded_or_no_varlen_passes(self, query: str) -> None: + _validate_no_unbounded_paths(query) + + @pytest.mark.parametrize( + "query", + [ + "MATCH path = (a)-[:CALLS*]->(b) RETURN path;", + "MATCH (a)-[:CALLS*1..]->(b) RETURN a, b;", + "MATCH (a)-[:CALLS*..]->(b) RETURN a, b;", + "MATCH (a)-[*]->(b) RETURN a, b;", + "MATCH (a)-[r:CALLS*]->(b) RETURN r;", + "MATCH (a)-[:CALLS*10..]->(b) RETURN a, b;", + ], + ) + def test_unbounded_varlen_rejected(self, query: str) -> None: + with pytest.raises(ex.LLMGenerationError, match="unbounded"): + _validate_no_unbounded_paths(query) + + def test_error_includes_query(self) -> None: + query = "MATCH (a)-[:CALLS*]->(b) RETURN a;" + with pytest.raises(ex.LLMGenerationError) as exc_info: + _validate_no_unbounded_paths(query) + assert query in str(exc_info.value) + + +class TestValidateCallProcedures: + @pytest.mark.parametrize( + "query", + [ + "MATCH (n) RETURN n;", + "CALL nxalg.strongly_connected_components() YIELD components RETURN components;", + "CALL nxalg.simple_cycles() YIELD cycles RETURN cycles LIMIT 10;", + "CALL nxalg.topological_sort() YIELD nodes RETURN nodes;", + "CALL pagerank.get() YIELD node, rank RETURN node, rank ORDER BY rank DESC LIMIT 10;", + "CALL betweenness_centrality.get() YIELD node, betweenness_centrality RETURN node;", + "CALL community_detection.get() YIELD node, community_id RETURN node, community_id;", + "CALL leiden_community_detection.get() YIELD node, community_id RETURN node;", + "CALL weakly_connected_components.get() YIELD node, component_id RETURN node;", + "CALL graph_util.ancestors(node) YIELD ancestors RETURN ancestors;", + "CALL path.expand(start, ['CALLS>'], ['Function'], 1, 6) YIELD path 
RETURN path;", + "CALL algo.all_simple_paths(src, tgt, ['CALLS'], 10) YIELD paths RETURN paths;", + "CALL bridges.get() YIELD bridges RETURN bridges;", + "CALL biconnected_components.get() YIELD components RETURN components;", + ], + ) + def test_allowed_procedure_passes(self, query: str) -> None: + _validate_call_procedures(query) + + @pytest.mark.parametrize( + "query", + [ + "CALL db.schema.visualization();", + "CALL refactor.merge_nodes([a, b]) YIELD node RETURN node;", + "CALL create.node(['Foo'], {x: 1}) YIELD node RETURN node;", + "CALL export_util.json('out.json');", + "CALL migrate.postgresql('...') YIELD row RETURN row;", + "CALL mg.load('mod');", + "CALL csv_utils.create_csv_file('a','b');", + "CALL link_prediction.train();", + ], + ) + def test_disallowed_procedure_rejected(self, query: str) -> None: + with pytest.raises(ex.LLMGenerationError, match="outside the read-only"): + _validate_call_procedures(query) + + def test_call_is_case_insensitive(self) -> None: + _validate_call_procedures( + "call nxalg.strongly_connected_components() YIELD components RETURN components;" + ) + + def test_error_includes_procedure_name(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="refactor.merge_nodes"): + _validate_call_procedures( + "CALL refactor.merge_nodes([a, b]) YIELD n RETURN n;" + ) diff --git a/codebase_rag/tests/test_dead_code_command.py b/codebase_rag/tests/test_dead_code_command.py new file mode 100644 index 000000000..aad627ee3 --- /dev/null +++ b/codebase_rag/tests/test_dead_code_command.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag.cli import app +from codebase_rag.types_defs import ResultRow + + +@pytest.fixture +def runner() -> CliRunner: + return CliRunner() + + +@pytest.fixture +def dead_rows() -> list[ResultRow]: + return [ + { + "label": "Function", + "name": 
"orphan_one", + "qualified_name": "myproj.mod.orphan_one", + "start_line": 5, + "end_line": 9, + }, + { + "label": "Method", + "name": "orphan_two", + "qualified_name": "myproj.mod.Thing.orphan_two", + "start_line": 20, + "end_line": 25, + }, + ] + + +def _make_mock_ingestor( + *, projects: list[str], fetch_result: list[ResultRow] +) -> MagicMock: + mock = MagicMock() + mock.list_projects.return_value = projects + mock.fetch_all.return_value = fetch_result + mock.__enter__ = MagicMock(return_value=mock) + mock.__exit__ = MagicMock(return_value=False) + return mock + + +class TestDeadCodeCommand: + def test_lists_orphans_in_table( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code"]) + + assert result.exit_code == 0 + assert "orphan_one" in result.output + assert "orphan_two" in result.output + + def test_json_format_emits_qualified_names( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--format", "json"]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + names = {row["qualified_name"] for row in payload} + assert names == { + "myproj.mod.orphan_one", + "myproj.mod.Thing.orphan_two", + } + + def test_fail_on_found_exits_one_when_dead_code( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--fail-on-found"]) + + assert result.exit_code == 1 + + def 
test_fail_on_found_exits_zero_when_clean(self, runner: CliRunner) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=[]) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--fail-on-found"]) + + assert result.exit_code == 0 + + def test_explicit_project_name_used( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor( + projects=["myproj", "other"], fetch_result=dead_rows + ) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--project-name", "myproj"]) + + assert result.exit_code == 0 + _query, params = mock_ingestor.fetch_all.call_args.args + assert params["project_prefix"] == "myproj." + + def test_errors_when_project_ambiguous(self, runner: CliRunner) -> None: + mock_ingestor = _make_mock_ingestor(projects=["a", "b"], fetch_result=[]) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code"]) + + assert result.exit_code == 1 + mock_ingestor.fetch_all.assert_not_called() + + def test_errors_when_no_projects(self, runner: CliRunner) -> None: + mock_ingestor = _make_mock_ingestor(projects=[], fetch_result=[]) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code"]) + + assert result.exit_code == 1 + + def test_entry_point_forwarded_to_query( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "-e", "main", "-e", "run"]) + + assert result.exit_code == 0 + _query, params = mock_ingestor.fetch_all.call_args.args + assert params["entry_points"] == ["main", "run"] + + def 
test_decorator_root_extends_defaults( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--decorator-root", "myhandler"]) + + assert result.exit_code == 0 + _query, params = mock_ingestor.fetch_all.call_args.args + assert "myhandler" in params["root_decorators"] + assert "task" in params["root_decorators"] + + def test_writes_json_to_output_file( + self, runner: CliRunner, dead_rows: list[ResultRow], tmp_path: Path + ) -> None: + out = tmp_path / "dead.json" + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke( + app, + ["dead-code", "--format", "json", "--output", str(out)], + ) + + assert result.exit_code == 0 + payload = json.loads(out.read_text()) + assert len(payload) == 2 + + def test_writes_table_to_output_file( + self, runner: CliRunner, dead_rows: list[ResultRow], tmp_path: Path + ) -> None: + out = tmp_path / "dead.txt" + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--output", str(out)]) + + assert result.exit_code == 0 + written = out.read_text() + assert "orphan_one" in written + + def test_handles_connection_error(self, runner: CliRunner) -> None: + with patch( + "codebase_rag.cli.connect_memgraph", + side_effect=ConnectionError("Cannot connect"), + ): + result = runner.invoke(app, ["dead-code"]) + + assert result.exit_code == 1 + + def test_include_tests_default_passes_test_patterns( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with 
patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code"]) + + assert result.exit_code == 0 + query, params = mock_ingestor.fetch_all.call_args.args + assert "test_patterns" in params + assert "$test_patterns" in query + + def test_no_include_tests_omits_test_patterns( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--no-include-tests"]) + + assert result.exit_code == 0 + query, params = mock_ingestor.fetch_all.call_args.args + # (H) test_patterns is still passed (it filters test modules out of the + # (H) module-load roots), but test functions themselves are not roots. + assert "test_patterns" in params + assert "n.path CONTAINS" not in query + + def test_classes_flag_includes_class_candidates( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code", "--classes"]) + + assert result.exit_code == 0 + query, _params = mock_ingestor.fetch_all.call_args.args + assert "Function|Method|Class" in query + + def test_classes_off_by_default( + self, runner: CliRunner, dead_rows: list[ResultRow] + ) -> None: + mock_ingestor = _make_mock_ingestor(projects=["myproj"], fetch_result=dead_rows) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["dead-code"]) + + assert result.exit_code == 0 + query, _params = mock_ingestor.fetch_all.call_args.args + assert "Function|Method|Class" not in query diff --git a/codebase_rag/tests/test_decorator_call_edges.py b/codebase_rag/tests/test_decorator_call_edges.py new file mode 100644 index 
000000000..5778b5efa --- /dev/null +++ b/codebase_rag/tests/test_decorator_call_edges.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import run_updater + + +def _calls(mock_ingestor: MagicMock) -> list[tuple[str, str, str]]: + # (H) CALLS edges as (caller_label, caller_qn, callee_qn). + out: list[tuple[str, str, str]] = [] + for c in mock_ingestor.ensure_relationship_batch.call_args_list: + if c.args[1] == cs.RelationshipType.CALLS: + out.append((c.args[0][0], c.args[0][2], c.args[2][2])) + return out + + +class TestDecoratorCallEdges: + def test_bare_decorator_emits_module_call( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) `@task` applies task(handler) at module load -> a module-level call. + (temp_repo / "app.py").write_text( + "def task(fn):\n return fn\n\n\n@task\ndef handler():\n return 1\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + calls = _calls(mock_ingestor) + + assert any( + label == cs.NodeLabel.MODULE + and caller.endswith(".app") + and callee.endswith(".task") + for label, caller, callee in calls + ), f"no module->task decorator edge; calls={sorted(calls)}" + + def test_call_decorator_emits_module_call( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) `@register(...)` also runs at module load. 
+ (temp_repo / "app.py").write_text( + "def register(name):\n" + " def wrap(fn):\n" + " return fn\n" + " return wrap\n" + "\n" + "\n" + '@register("x")\n' + "def handler():\n" + " return 1\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + calls = _calls(mock_ingestor) + + assert any( + label == cs.NodeLabel.MODULE + and caller.endswith(".app") + and callee.endswith(".register") + for label, caller, callee in calls + ), f"no module->register decorator edge; calls={sorted(calls)}" + + def test_class_decorator_emits_module_call( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a bare decorator on a class also runs at module load. + (temp_repo / "app.py").write_text( + "def deco(cls):\n return cls\n\n\n@deco\nclass MyClass:\n pass\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + calls = _calls(mock_ingestor) + + assert any( + label == cs.NodeLabel.MODULE + and caller.endswith(".app") + and callee.endswith(".deco") + for label, caller, callee in calls + ), f"no module->deco class decorator edge; calls={sorted(calls)}" + + def test_alias_decorator_resolves_to_first_party( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) `@alias` where `alias = task` still calls task at module load. 
+ (temp_repo / "app.py").write_text( + "def task(fn):\n" + " return fn\n" + "\n" + "\n" + "alias = task\n" + "\n" + "\n" + "@alias\n" + "def handler():\n" + " return 1\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + calls = _calls(mock_ingestor) + + assert any( + label == cs.NodeLabel.MODULE + and caller.endswith(".app") + and callee.endswith(".task") + for label, caller, callee in calls + ), f"alias decorator not resolved; calls={sorted(calls)}" + + def test_decorator_on_nested_function_not_module_attributed( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a decorator on a function nested in another function runs when the + # (H) outer function is called, not at module load -> no module edge. + (temp_repo / "app.py").write_text( + "def deco(fn):\n" + " return fn\n" + "\n" + "\n" + "def outer():\n" + " @deco\n" + " def inner():\n" + " return 1\n" + "\n" + " return inner\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + module_callees = { + callee.rsplit(cs.SEPARATOR_DOT, 1)[-1] + for label, _caller, callee in _calls(mock_ingestor) + if label == cs.NodeLabel.MODULE + } + + assert "deco" not in module_callees + + def test_undecorated_function_has_no_decorator_edge( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "app.py").write_text( + "def plain():\n return 1\n\n\ndef other():\n return 2\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + module_callees = { + callee.rsplit(cs.SEPARATOR_DOT, 1)[-1] + for label, _caller, callee in _calls(mock_ingestor) + if label == cs.NodeLabel.MODULE + } + + assert "plain" not in module_callees + assert "other" not in module_callees diff --git a/codebase_rag/tests/test_diff_autowrap.py b/codebase_rag/tests/test_diff_autowrap.py new file mode 100644 index 000000000..d5c9c6eb1 --- /dev/null +++ b/codebase_rag/tests/test_diff_autowrap.py @@ 
-0,0 +1,90 @@ +from __future__ import annotations + +from codebase_rag.main import _autowrap_diff_blocks + + +class TestNoDiff: + def test_plain_text_unchanged(self) -> None: + text = "Here is some explanation without any diff." + assert _autowrap_diff_blocks(text) == text + + def test_text_without_diff_marker_unchanged(self) -> None: + text = "Lines starting with - or + but no diff --git header\n- not a diff\n+ also not" + assert _autowrap_diff_blocks(text) == text + + +class TestWrappingUnfencedDiff: + def test_full_git_diff_gets_fenced_as_diff(self) -> None: + text = ( + "diff --git a/file.py b/file.py\n" + "index abc..def 100644\n" + "--- a/file.py\n" + "+++ b/file.py\n" + "@@ -1,3 +1,3 @@\n" + " context\n" + "-old\n" + "+new\n" + ) + out = _autowrap_diff_blocks(text) + assert out.startswith("```diff\n") + assert out.rstrip().endswith("```") + assert "diff --git a/file.py b/file.py" in out + assert "+new" in out + + def test_diff_followed_by_explanation_text(self) -> None: + text = ( + "diff --git a/x b/x\n" + "--- a/x\n" + "+++ b/x\n" + "@@ -1 +1 @@\n" + "-a\n" + "+b\n" + "\n" + "This adds the new feature.\n" + ) + out = _autowrap_diff_blocks(text) + assert "```diff\n" in out + explanation_pos = out.index("This adds the new feature.") + fence_close_pos = out.rindex("```", 0, explanation_pos) + assert fence_close_pos < explanation_pos, ( + "explanation text must appear after the closing fence" + ) + assert "diff --git" in out[:fence_close_pos] + + def test_preamble_before_diff_preserved(self) -> None: + text = ( + "Here are the changes I made:\n" + "diff --git a/foo.py b/foo.py\n" + "--- a/foo.py\n" + "+++ b/foo.py\n" + "@@ -1 +1 @@\n" + "-x\n" + "+y\n" + ) + out = _autowrap_diff_blocks(text) + assert "Here are the changes I made:" in out + assert "```diff" in out + + +class TestAlreadyFenced: + def test_already_fenced_diff_not_double_wrapped(self) -> None: + text = ( + "Here is a diff:\n" + "```diff\n" + "diff --git a/x b/x\n" + "--- a/x\n" + "+++ b/x\n" + "@@ 
-1 +1 @@\n" + "-a\n" + "+b\n" + "```\n" + ) + out = _autowrap_diff_blocks(text) + assert out.count("```diff") == 1 + assert out.count("```") == 2 + + def test_fenced_with_other_language_not_rewrapped(self) -> None: + text = "```bash\ngit diff\ndiff --git a/x b/x\n```\n" + out = _autowrap_diff_blocks(text) + assert "```bash" in out + assert "```diff" not in out diff --git a/codebase_rag/tests/test_directory_lister.py b/codebase_rag/tests/test_directory_lister.py index 9a7f480bc..40759be36 100644 --- a/codebase_rag/tests/test_directory_lister.py +++ b/codebase_rag/tests/test_directory_lister.py @@ -5,6 +5,7 @@ import pytest from pydantic_ai import Tool +from codebase_rag import tool_errors as te from codebase_rag.tools.directory_lister import ( DirectoryLister, create_directory_lister_tool, @@ -113,6 +114,24 @@ def test_list_with_hidden_files( assert ".hidden_file" in result assert "visible_file" in result + def test_list_directory_returns_error_for_path_outside_root( + self, directory_lister: DirectoryLister + ) -> None: + result = directory_lister.list_directory_contents("../../../etc") + expected = te.DIRECTORY_PATH_OUTSIDE_ROOT.format( + path="../../../etc", root=directory_lister.project_root + ) + assert result == expected + + def test_list_directory_returns_error_for_absolute_path_outside_root( + self, directory_lister: DirectoryLister + ) -> None: + result = directory_lister.list_directory_contents("/etc/passwd") + expected = te.DIRECTORY_PATH_OUTSIDE_ROOT.format( + path="/etc/passwd", root=directory_lister.project_root + ) + assert result == expected + class TestGetSafePath: def test_safe_path_with_relative_path( diff --git a/codebase_rag/tests/test_document_analyzer.py b/codebase_rag/tests/test_document_analyzer.py deleted file mode 100644 index 1d88dfe2f..000000000 --- a/codebase_rag/tests/test_document_analyzer.py +++ /dev/null @@ -1,259 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from unittest.mock import MagicMock, patch - 
-import pytest -from pydantic_ai import Tool - -from codebase_rag.constants import Provider -from codebase_rag.tools.document_analyzer import ( - DocumentAnalyzer, - _NotSupportedClient, - create_document_analyzer_tool, -) - - -@pytest.fixture -def temp_project_root(tmp_path: Path) -> Path: - return tmp_path - - -@pytest.fixture -def mock_settings() -> MagicMock: - settings = MagicMock() - settings.active_orchestrator_config.provider = Provider.GOOGLE - settings.active_orchestrator_config.provider_type = "api" - settings.active_orchestrator_config.api_key = "test-api-key" - settings.active_orchestrator_config.model_id = "gemini-1.5-flash" - return settings - - -@pytest.fixture -def mock_genai_client() -> MagicMock: - client = MagicMock() - response = MagicMock() - response.text = "Analysis result" - client.models.generate_content.return_value = response - return client - - -class TestNotSupportedClient: - def test_raises_not_implemented_error(self) -> None: - client = _NotSupportedClient() - with pytest.raises(NotImplementedError): - client.generate_content() - - def test_any_attribute_raises_error(self) -> None: - client = _NotSupportedClient() - with pytest.raises(NotImplementedError): - client.any_method() - - -class TestDocumentAnalyzerInit: - def test_init_resolves_project_root( - self, temp_project_root: Path, mock_settings: MagicMock - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch("codebase_rag.tools.document_analyzer.genai.Client"): - analyzer = DocumentAnalyzer(str(temp_project_root)) - assert analyzer.project_root == temp_project_root.resolve() - - def test_init_with_google_api_provider( - self, temp_project_root: Path, mock_settings: MagicMock - ) -> None: - mock_settings.active_orchestrator_config.provider = Provider.GOOGLE - mock_settings.active_orchestrator_config.provider_type = "api" - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - 
"codebase_rag.tools.document_analyzer.genai.Client" - ) as mock_client: - DocumentAnalyzer(str(temp_project_root)) - mock_client.assert_called_once_with(api_key="test-api-key") - - def test_init_with_non_google_provider( - self, temp_project_root: Path, mock_settings: MagicMock - ) -> None: - mock_settings.active_orchestrator_config.provider = "anthropic" - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - analyzer = DocumentAnalyzer(str(temp_project_root)) - assert isinstance(analyzer.client, _NotSupportedClient) - - -class TestDocumentAnalyzerAnalyze: - def test_analyze_returns_error_for_unsupported_provider( - self, temp_project_root: Path, mock_settings: MagicMock - ) -> None: - mock_settings.active_orchestrator_config.provider = "anthropic" - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = analyzer.analyze("test.pdf", "What is this?") - assert "Error:" in result - assert "not supported" in result.lower() - - def test_analyze_file_not_found( - self, - temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = analyzer.analyze("nonexistent.pdf", "What is this?") - assert "Error:" in result - assert "not found" in result.lower() - - def test_analyze_security_path_traversal( - self, - temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = 
analyzer.analyze("../../../etc/passwd", "What is this?") - assert "security" in result.lower() - - def test_analyze_existing_file_returns_response( - self, - temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - test_file = temp_project_root / "test.txt" - test_file.write_text("Test content", encoding="utf-8") - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = analyzer.analyze("test.txt", "What is this?") - assert result == "Analysis result" - - def test_analyze_with_absolute_path( - self, - temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - test_file = temp_project_root / "test.txt" - test_file.write_text("Test content", encoding="utf-8") - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = analyzer.analyze(str(test_file), "What is this?") - assert result == "Analysis result" - - def test_analyze_handles_no_text_response( - self, - temp_project_root: Path, - mock_settings: MagicMock, - ) -> None: - mock_client = MagicMock() - response = MagicMock() - response.text = None - response.candidates = None - mock_client.models.generate_content.return_value = response - - test_file = temp_project_root / "test.txt" - test_file.write_text("Test content", encoding="utf-8") - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = analyzer.analyze("test.txt", "What is this?") - assert "no" in result.lower() 
and "content" in result.lower() - - def test_analyze_extracts_from_candidates( - self, - temp_project_root: Path, - mock_settings: MagicMock, - ) -> None: - mock_client = MagicMock() - response = MagicMock() - response.text = None - - candidate = MagicMock() - part = MagicMock() - part.text = "Candidate text" - candidate.content.parts = [part] - response.candidates = [candidate] - mock_client.models.generate_content.return_value = response - - test_file = temp_project_root / "test.txt" - test_file.write_text("Test content", encoding="utf-8") - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - result = analyzer.analyze("test.txt", "What is this?") - assert result == "Candidate text" - - -class TestCreateDocumentAnalyzerTool: - def test_creates_tool_instance( - self, - temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - tool = create_document_analyzer_tool(analyzer) - assert isinstance(tool, Tool) - - def test_tool_has_description( - self, - temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - analyzer = DocumentAnalyzer(str(temp_project_root)) - tool = create_document_analyzer_tool(analyzer) - assert tool.description is not None - assert ( - "document" in tool.description.lower() - or "pdf" in tool.description.lower() - ) - - def test_tool_has_correct_name( - self, - 
temp_project_root: Path, - mock_settings: MagicMock, - mock_genai_client: MagicMock, - ) -> None: - with patch("codebase_rag.tools.document_analyzer.settings", mock_settings): - with patch( - "codebase_rag.tools.document_analyzer.genai.Client", - return_value=mock_genai_client, - ): - from codebase_rag.tools.tool_descriptions import AgenticToolName - - analyzer = DocumentAnalyzer(str(temp_project_root)) - tool = create_document_analyzer_tool(analyzer) - assert tool.name == AgenticToolName.ANALYZE_DOCUMENT diff --git a/codebase_rag/tests/test_duplicate_qn_definitions.py b/codebase_rag/tests/test_duplicate_qn_definitions.py new file mode 100644 index 000000000..d3670086c --- /dev/null +++ b/codebase_rag/tests/test_duplicate_qn_definitions.py @@ -0,0 +1,184 @@ +# (H) Regression tests for the duplicate-qualified-name finding surfaced by the +# (H) evals/ harness: the `if has_x(): else: ` import-fallback +# (H) idiom defines one qualified name twice. cgr used to collapse the two into a +# (H) single node (last-writer-wins kept the else-branch stub). Both definitions +# (H) must survive as distinct nodes, and a call must link to BOTH. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "dupproj" + +MODULE_SRC = """import os + + +if os.environ.get("FLAG"): + + def impl() -> str: + return "real" + +else: + + def impl() -> str: + return "stub" + + +def caller() -> str: + return impl() +""" + +_RelTuple = tuple[str, PropertyValue, str, str, PropertyValue] + + +class _Capture: + def __init__(self) -> None: + self.nodes: dict[tuple[str, PropertyValue], PropertyDict] = {} + self.rels: list[_RelTuple] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + uid = properties[cs.NODE_UNIQUE_CONSTRAINTS[label]] + self.nodes[(str(label), uid)] = dict(properties) + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append( + ( + str(from_spec[0]), + from_spec[2], + str(rel_type), + str(to_spec[0]), + to_spec[2], + ) + ) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _build(tmp_path: Path, src: str = MODULE_SRC) -> _Capture: + (tmp_path / "m.py").write_text(src) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return cap + + +class TestDuplicateQualifiedNameDefinitions: + def test_both_branch_definitions_become_distinct_nodes( + self, tmp_path: Path + ) -> None: + cap = _build(tmp_path) + impl_start_lines = sorted( + 
int(props[cs.KEY_START_LINE]) + for (label, _uid), props in cap.nodes.items() + if label == cs.NodeLabel.FUNCTION + and props.get(cs.KEY_NAME) == "impl" + and props.get(cs.KEY_START_LINE) is not None + ) + assert impl_start_lines == [6, 11], impl_start_lines + + def test_call_links_to_both_duplicate_definitions(self, tmp_path: Path) -> None: + cap = _build(tmp_path) + calls_to_impl = [ + target + for (_fl, from_val, rel_type, _tl, target) in cap.rels + if rel_type == cs.RelationshipType.CALLS + and str(from_val).endswith(".caller") + and ".impl" in str(target) + ] + assert len(calls_to_impl) == 2, calls_to_impl + + +CLASS_SRC = """import os + + +if os.environ.get("FLAG"): + + class Widget: + def render(self) -> str: + return "real" + +else: + + class Widget: + def render(self) -> str: + return "stub" +""" + + +class TestDuplicateQualifiedNameClasses: + def test_both_branch_classes_become_distinct_nodes(self, tmp_path: Path) -> None: + cap = _build(tmp_path, CLASS_SRC) + widget_start_lines = sorted( + int(props[cs.KEY_START_LINE]) + for (label, _uid), props in cap.nodes.items() + if label == cs.NodeLabel.CLASS + and props.get(cs.KEY_NAME) == "Widget" + and props.get(cs.KEY_START_LINE) is not None + ) + assert widget_start_lines == [6, 12], widget_start_lines + + def test_methods_of_both_branch_classes_survive(self, tmp_path: Path) -> None: + cap = _build(tmp_path, CLASS_SRC) + render_start_lines = sorted( + int(props[cs.KEY_START_LINE]) + for (label, _uid), props in cap.nodes.items() + if label == cs.NodeLabel.METHOD + and props.get(cs.KEY_NAME) == "render" + and props.get(cs.KEY_START_LINE) is not None + ) + assert render_start_lines == [7, 13], render_start_lines + + +METHOD_DUP_SRC = """import os + + +class Service: + + if os.environ.get("FLAG"): + + def run(self) -> str: + return "real" + + else: + + def run(self) -> str: + return "stub" +""" + + +class TestDuplicateQualifiedNameMethodsInOneClass: + def test_both_branch_methods_in_one_class_survive(self, 
tmp_path: Path) -> None: + cap = _build(tmp_path, METHOD_DUP_SRC) + run_start_lines = sorted( + int(props[cs.KEY_START_LINE]) + for (label, _uid), props in cap.nodes.items() + if label == cs.NodeLabel.METHOD + and props.get(cs.KEY_NAME) == "run" + and props.get(cs.KEY_START_LINE) is not None + ) + assert run_start_lines == [8, 13], run_start_lines diff --git a/codebase_rag/tests/test_embedder.py b/codebase_rag/tests/test_embedder.py index 401044582..6eb009f3e 100644 --- a/codebase_rag/tests/test_embedder.py +++ b/codebase_rag/tests/test_embedder.py @@ -1,10 +1,13 @@ from __future__ import annotations +import tempfile from collections.abc import Generator +from pathlib import Path from unittest.mock import MagicMock, patch import pytest +from codebase_rag.embedder import EmbeddingCache, clear_embedding_cache from codebase_rag.utils.dependencies import has_torch, has_transformers @@ -44,6 +47,13 @@ def reset_model_cache() -> Generator[None, None, None]: get_model.cache_clear() +@pytest.fixture(autouse=True) +def reset_cache() -> Generator[None, None, None]: + clear_embedding_cache() + yield + clear_embedding_cache() + + @pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") def test_embed_code_returns_768_dimensional_vector( mock_unixcoder: MagicMock, reset_model_cache: None @@ -146,6 +156,65 @@ def test_get_model_does_not_use_cuda_when_unavailable(reset_model_cache: None) - mock_instance.cuda.assert_not_called() +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_select_device_prefers_cuda() -> None: + from codebase_rag.embedder import ( + _select_device, # ty: ignore[possibly-missing-import] + ) + + with patch("codebase_rag.embedder.torch.cuda.is_available", return_value=True): + with patch( + "codebase_rag.embedder.torch.backends.mps.is_available", return_value=True + ): + assert _select_device() == "cuda" + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers 
not installed") +def test_select_device_uses_mps_when_cuda_unavailable() -> None: + from codebase_rag.embedder import ( + _select_device, # ty: ignore[possibly-missing-import] + ) + + with patch("codebase_rag.embedder.torch.cuda.is_available", return_value=False): + with patch( + "codebase_rag.embedder.torch.backends.mps.is_available", return_value=True + ): + assert _select_device() == "mps" + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_select_device_falls_back_to_cpu() -> None: + from codebase_rag.embedder import ( + _select_device, # ty: ignore[possibly-missing-import] + ) + + with patch("codebase_rag.embedder.torch.cuda.is_available", return_value=False): + with patch( + "codebase_rag.embedder.torch.backends.mps.is_available", return_value=False + ): + assert _select_device() == "cpu" + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_get_model_moves_to_mps_when_available(reset_model_cache: None) -> None: + from codebase_rag.embedder import get_model # ty: ignore[possibly-missing-import] + + with patch("codebase_rag.embedder.UniXcoder") as mock_unixcoder_class: + mock_instance = MagicMock() + mock_instance.eval.return_value = mock_instance + mock_instance.to.return_value = mock_instance + mock_unixcoder_class.return_value = mock_instance + + with patch("codebase_rag.embedder.torch.cuda.is_available", return_value=False): + with patch( + "codebase_rag.embedder.torch.backends.mps.is_available", + return_value=True, + ): + get_model() + + mock_instance.to.assert_called_once_with("mps") + + @pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") @pytest.mark.slow def test_embed_code_integration(reset_model_cache: None) -> None: @@ -192,3 +261,311 @@ def test_embed_code_raises_without_dependencies() -> None: with pytest.raises(RuntimeError, match="Semantic search requires"): embed_code("x = 1") + + +def 
test_embedding_cache_put_and_get() -> None: + cache = EmbeddingCache() + embedding = [0.1, 0.2, 0.3] + cache.put("def foo(): pass", embedding) + assert cache.get("def foo(): pass") == embedding + + +def test_embedding_cache_miss_returns_none() -> None: + cache = EmbeddingCache() + assert cache.get("unknown code") is None + + +def test_embedding_cache_different_content_different_key() -> None: + cache = EmbeddingCache() + cache.put("code_a", [1.0]) + cache.put("code_b", [2.0]) + assert cache.get("code_a") == [1.0] + assert cache.get("code_b") == [2.0] + + +def test_embedding_cache_overwrite() -> None: + cache = EmbeddingCache() + cache.put("code_a", [1.0]) + cache.put("code_a", [9.9]) + assert cache.get("code_a") == [9.9] + + +def test_embedding_cache_len() -> None: + cache = EmbeddingCache() + assert len(cache) == 0 + cache.put("a", [1.0]) + assert len(cache) == 1 + cache.put("b", [2.0]) + assert len(cache) == 2 + + +def test_embedding_cache_clear() -> None: + cache = EmbeddingCache() + cache.put("a", [1.0]) + cache.put("b", [2.0]) + cache.clear() + assert len(cache) == 0 + assert cache.get("a") is None + + +def test_embedding_cache_get_many() -> None: + cache = EmbeddingCache() + cache.put("a", [1.0]) + cache.put("b", [2.0]) + results = cache.get_many(["a", "c", "b"]) + assert results == {0: [1.0], 2: [2.0]} + + +def test_embedding_cache_put_many() -> None: + cache = EmbeddingCache() + cache.put_many(["x", "y"], [[1.0], [2.0]]) + assert cache.get("x") == [1.0] + assert cache.get("y") == [2.0] + + +def test_embedding_cache_save_and_load() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "test_cache.json" + cache = EmbeddingCache(path=cache_path) + cache.put("hello", [0.5, 0.6]) + cache.save() + + assert cache_path.exists() + + cache2 = EmbeddingCache(path=cache_path) + cache2.load() + assert cache2.get("hello") == [0.5, 0.6] + + +def test_embedding_cache_load_nonexistent_path() -> None: + cache = 
EmbeddingCache(path=Path("/nonexistent/path/cache.json")) + cache.load() + assert len(cache) == 0 + + +def test_embedding_cache_load_corrupt_file() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "corrupt.json" + cache_path.write_text("not valid json data", encoding="utf-8") + cache = EmbeddingCache(path=cache_path) + cache.load() + assert len(cache) == 0 + + +def test_embedding_cache_save_no_path() -> None: + cache = EmbeddingCache(path=None) + cache.put("a", [1.0]) + cache.save() + + +def test_embedding_cache_load_no_path() -> None: + cache = EmbeddingCache(path=None) + cache.load() + assert len(cache) == 0 + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_uses_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code, get_embedding_cache + + mock_embedding = torch.zeros(1, 768) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + cache = get_embedding_cache() + cache.put("cached_code", [0.42] * 768) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + result = embed_code("cached_code") + + assert result == [0.42] * 768 + mock_unixcoder.tokenize.assert_not_called() + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_populates_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code, get_embedding_cache + + mock_embedding = torch.ones(1, 768) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + embed_code("new_code") + + cache = get_embedding_cache() + assert cache.get("new_code") is not None + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def 
test_embed_code_batch_empty_list(reset_model_cache: None) -> None: + from codebase_rag.embedder import embed_code_batch + + assert embed_code_batch([]) == [] + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_returns_correct_count( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch + + snippets = ["def a(): pass", "def b(): pass", "def c(): pass"] + mock_unixcoder.tokenize.return_value = [[1, 2, 3]] * 3 + mock_embedding = torch.zeros(3, 768) + mock_unixcoder.return_value = (torch.zeros(3, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(snippets) + + assert len(results) == 3 + assert all(len(emb) == 768 for emb in results) + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_uses_padding( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch + + snippets = ["short", "longer code here"] + mock_unixcoder.tokenize.return_value = [[1, 2, 3, 0, 0], [1, 2, 3, 4, 5]] + mock_embedding = torch.zeros(2, 768) + mock_unixcoder.return_value = (torch.zeros(2, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + embed_code_batch(snippets) + + mock_unixcoder.tokenize.assert_called_once_with( + snippets, max_length=512, padding=True + ) + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_cache_hit( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + from codebase_rag.embedder import embed_code_batch, get_embedding_cache + + cache = get_embedding_cache() + cache.put("a", [1.0] * 768) + cache.put("b", [2.0] * 768) + + with patch("codebase_rag.embedder.get_model", 
return_value=mock_unixcoder): + results = embed_code_batch(["a", "b"]) + + mock_unixcoder.tokenize.assert_not_called() + assert results == [[1.0] * 768, [2.0] * 768] + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_partial_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch, get_embedding_cache + + cache = get_embedding_cache() + cache.put("a", [1.0] * 768) + + mock_unixcoder.tokenize.return_value = [[1, 2, 3]] + mock_embedding = torch.full((1, 768), 3.0) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(["a", "b"]) + + assert results[0] == [1.0] * 768 + assert results[1] == [3.0] * 768 + mock_unixcoder.tokenize.assert_called_once_with(["b"], max_length=512, padding=True) + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_populates_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch, get_embedding_cache + + mock_unixcoder.tokenize.return_value = [[1, 2, 3]] + mock_embedding = torch.ones(1, 768) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + embed_code_batch(["new_snippet"]) + + cache = get_embedding_cache() + assert cache.get("new_snippet") is not None + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_respects_batch_size( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch + + snippets = [f"def f{i}(): pass" for i in range(5)] + + def side_effect_tokenize(batch: 
list[str], **kwargs: int | bool) -> list[list[int]]: + return [[1, 2, 3]] * len(batch) + + mock_unixcoder.tokenize.side_effect = side_effect_tokenize + + def side_effect_forward(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + n = tensor.shape[0] + return torch.zeros(n, 5, 768), torch.zeros(n, 768) + + mock_unixcoder.side_effect = side_effect_forward + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(snippets, batch_size=2) + + assert len(results) == 5 + assert mock_unixcoder.tokenize.call_count == 3 + + +def test_embed_code_batch_raises_without_dependencies() -> None: + if _has_semantic_deps(): + pytest.skip("Dependencies are installed") + + from codebase_rag.embedder import embed_code_batch + + with pytest.raises(RuntimeError, match="Semantic search requires"): + embed_code_batch(["x = 1"]) + + +def test_embedding_default_batch_size_at_least_64() -> None: + from codebase_rag import constants as cs + + assert cs.EMBEDDING_DEFAULT_BATCH_SIZE >= 64 + + +def test_embedding_cache_persistence_roundtrip() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "subdir" / "cache.json" + + cache1 = EmbeddingCache(path=cache_path) + cache1.put("fn_a", [0.1, 0.2]) + cache1.put("fn_b", [0.3, 0.4]) + cache1.save() + + cache2 = EmbeddingCache(path=cache_path) + cache2.load() + assert cache2.get("fn_a") == [0.1, 0.2] + assert cache2.get("fn_b") == [0.3, 0.4] + assert cache2.get("fn_c") is None + assert len(cache2) == 2 diff --git a/codebase_rag/tests/test_eval_imports_internal_modules.py b/codebase_rag/tests/test_eval_imports_internal_modules.py new file mode 100644 index 000000000..4a30a707b --- /dev/null +++ b/codebase_rag/tests/test_eval_imports_internal_modules.py @@ -0,0 +1,38 @@ +# (H) Covers the L1 eval (evals/cgr_graph.py): cgr emits placeholder MODULE nodes +# (H) for unresolved imports whose path is the dotted import name (e.g. +# (H) "thrift.TTornado"). 
Those must not be treated as internal import targets when +# (H) scoring IMPORTS, or every "from .x import ..." collapses onto them as a +# (H) false positive. Only real in-repo .py modules count as internal. +from __future__ import annotations + +from codebase_rag import constants as cs +from evals.cgr_graph import _CapturingIngestor, _to_graph_data + +_MODULE = cs.NodeLabel.MODULE.value +_IMPORTS = cs.RelationshipType.IMPORTS.value + + +def _module(ingestor: _CapturingIngestor, qn: str, path: str) -> None: + ingestor.ensure_node_batch( + _MODULE, + {cs.KEY_QUALIFIED_NAME: qn, cs.KEY_NAME: qn, cs.KEY_PATH: path}, + ) + + +def test_import_placeholder_module_not_scored_as_internal() -> None: + ingestor = _CapturingIngestor() + _module(ingestor, "proj.src", "src.py") + _module(ingestor, "proj.real", "pkg/real.py") + # (H) Placeholder for an unresolved import: path is the dotted name, not a file. + _module(ingestor, "proj.placeholder", "proj.placeholder") + + for target in ("proj.real", "proj.placeholder"): + ingestor.ensure_relationship_batch( + (_MODULE, cs.KEY_QUALIFIED_NAME, "proj.src"), + _IMPORTS, + (_MODULE, cs.KEY_QUALIFIED_NAME, target), + ) + + graph = _to_graph_data(ingestor, "proj") + import_targets = {e.target_name for e in graph.name_edges if e.rel_type == _IMPORTS} + assert import_targets == {"pkg/real.py"}, import_targets diff --git a/codebase_rag/tests/test_eval_module_calls.py b/codebase_rag/tests/test_eval_module_calls.py new file mode 100644 index 000000000..b63938676 --- /dev/null +++ b/codebase_rag/tests/test_eval_module_calls.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +from pathlib import Path + +from evals.module_calls import ( + cgr_module_calls, + oracle_module_calls, + score_module_calls, +) + +_FIXTURE = """def make_default(): + return 1 + + +def helper(): + return 2 + + +def main(): + helper() + + +def with_default(x=make_default()): + return x + + +CONFIG = make_default() + + +if __name__ == "__main__": + main() +""" + + 
+def _names(edges: set[tuple[str, ...]]) -> set[str]: + return {e.target_name for e in edges} + + +class TestModuleCallEval: + def _write(self, tmp_path: Path) -> Path: + proj = tmp_path / "proj" + proj.mkdir() + (proj / "app.py").write_text(_FIXTURE, encoding="utf-8") + return proj + + def test_oracle_counts_only_definition_time_calls(self, tmp_path: Path) -> None: + proj = self._write(tmp_path) + oracle = oracle_module_calls(proj, "proj") + + # (H) make_default runs at module load (CONFIG = ... and the default arg); + # (H) main runs from the `if __name__` block; helper only runs inside main's + # (H) body, so it is NOT a module-level call. + assert _names(oracle) == {"make_default", "main"} + + def test_cgr_matches_oracle_module_calls(self, tmp_path: Path) -> None: + proj = self._write(tmp_path) + cgr = cgr_module_calls(proj, "proj") + oracle = oracle_module_calls(proj, "proj") + + _tp, fp, fn, precision, recall = score_module_calls(cgr, oracle) + + assert fp == 0, f"spurious module calls: {sorted(_names(cgr - oracle))}" + assert fn == 0, f"missed module calls: {sorted(_names(oracle - cgr))}" + assert precision == 1.0 + assert recall == 1.0 + + def test_nested_call_is_not_module_attributed(self, tmp_path: Path) -> None: + proj = self._write(tmp_path) + cgr = cgr_module_calls(proj, "proj") + + assert "helper" not in _names(cgr) + + def _oracle_for(self, tmp_path: Path, source: str) -> set[str]: + proj = tmp_path / "proj" + proj.mkdir() + (proj / "app.py").write_text(source, encoding="utf-8") + return _names(oracle_module_calls(proj, "proj")) + + def test_lambda_body_call_is_deferred(self, tmp_path: Path) -> None: + # (H) `helper` runs only when `work()` is called, not at import. 
+ names = self._oracle_for( + tmp_path, + "def helper():\n return 1\n\n\nwork = lambda: helper()\n", + ) + assert "helper" not in names + + def test_generator_expression_call_is_deferred(self, tmp_path: Path) -> None: + # (H) a generator is lazy: `helper` runs only when the generator is consumed. + names = self._oracle_for( + tmp_path, + "def helper():\n return 1\n\n\ngen = (helper() for _ in range(2))\n", + ) + assert "helper" not in names + + def test_generator_outermost_iterable_is_eager(self, tmp_path: Path) -> None: + # (H) the first iterable of a generator is evaluated when the generator is + # (H) created (at import), so `load_items` is a module call but the lazy + # (H) body call `helper` is not. + names = self._oracle_for( + tmp_path, + "def helper():\n return 1\n\n\n" + "def load_items():\n return [1]\n\n\n" + "gen = (helper(x) for x in load_items())\n", + ) + assert "load_items" in names + assert "helper" not in names + + def test_list_comprehension_call_is_module_attributed(self, tmp_path: Path) -> None: + # (H) a list comprehension runs eagerly at import, so its call counts. + names = self._oracle_for( + tmp_path, + "def helper():\n return 1\n\n\nout = [helper() for _ in range(2)]\n", + ) + assert "helper" in names + + def test_class_decorator_is_module_attributed(self, tmp_path: Path) -> None: + # (H) a bare class decorator runs at module load -> a module call. + names = self._oracle_for( + tmp_path, + "def deco(cls):\n return cls\n\n\n@deco\nclass Widget:\n pass\n", + ) + assert "deco" in names + + def _cgr_for(self, tmp_path: Path, source: str) -> set[str]: + proj = tmp_path / "proj" + proj.mkdir() + (proj / "app.py").write_text(source, encoding="utf-8") + return _names(cgr_module_calls(proj, "proj")) + + def test_classless_module_construction_credited_via_instantiates( + self, tmp_path: Path + ) -> None: + # (H) a dataclass has no explicit __init__, so cgr emits no CALLS for its + # (H) construction, only INSTANTIATES -> the class. 
The eval must still + # (H) credit the module-scope `Config(1)` so L2 recall stays 1.0. + source = ( + "from dataclasses import dataclass\n\n\n" + "@dataclass\nclass Config:\n n: int\n\n\n" + "CONFIG = Config(1)\n" + ) + assert "Config" in self._cgr_for(tmp_path, source) + + def test_return_annotation_counted_without_future_import( + self, tmp_path: Path + ) -> None: + # (H) without postponed annotations, `Result()` runs at import. + names = self._oracle_for( + tmp_path, + "def Result():\n return 1\n\n\ndef route() -> Result():\n return 1\n", + ) + assert "Result" in names + + def test_annotation_not_counted_with_future_import(self, tmp_path: Path) -> None: + # (H) with postponed annotations, the annotation is a string and never runs. + names = self._oracle_for( + tmp_path, + "from __future__ import annotations\n\n\n" + "def Result():\n return 1\n\n\ndef route() -> Result():\n return 1\n", + ) + assert "Result" not in names diff --git a/codebase_rag/tests/test_eval_score_span.py b/codebase_rag/tests/test_eval_score_span.py new file mode 100644 index 000000000..2b1031915 --- /dev/null +++ b/codebase_rag/tests/test_eval_score_span.py @@ -0,0 +1,54 @@ +# (H) Covers the L1 eval span grading (evals/score.score_span): among nodes both +# (H) cgr and the oracle identify by (kind, file, start), it grades how often cgr's +# (H) end_line agrees with the oracle's. A disagreement must surface as fp+fn (not +# (H) be masked by node identity already being 1.0), and nodes only one side has +# (H) must not be graded at all. +from __future__ import annotations + +from codebase_rag import constants as cs +from evals import constants as ec +from evals.score import score_span +from evals.types_defs import DefNode, GraphData, NodeKey + +_FUNC = cs.NodeLabel.FUNCTION.value +_KINDS = (cs.NodeLabel.FUNCTION,) + + +def _graph(*nodes: tuple[str, int, int]) -> GraphData: + # (H) Each node is (file, start, end) for a Function. 
+ mapping: dict[NodeKey, DefNode] = {} + for file, start, end in nodes: + key = NodeKey(_FUNC, file, start) + mapping[key] = DefNode(key, "f", end) + return GraphData(nodes=mapping, edges=set(), name_edges=set()) + + +def test_span_exact_match_scores_perfect() -> None: + cgr = _graph(("a.rs", 1, 5), ("a.rs", 10, 20)) + oracle = _graph(("a.rs", 1, 5), ("a.rs", 10, 20)) + by_label = {row["label"]: row for row in score_span(cgr, oracle, _KINDS).rows} + row = by_label[_FUNC] + assert row["precision"] == 1.0 and row["recall"] == 1.0 + assert row["tp"] == 2 and row["fp"] == 0 and row["fn"] == 0 + + +def test_span_end_line_mismatch_is_penalized_and_surfaced() -> None: + cgr = _graph(("a.rs", 1, 5), ("a.rs", 10, 99)) + oracle = _graph(("a.rs", 1, 5), ("a.rs", 10, 20)) + result = score_span(cgr, oracle, _KINDS) + by_label = {row["label"]: row for row in result.rows} + row = by_label[_FUNC] + assert row["tp"] == 1 and row["fp"] == 1 and row["fn"] == 1 + assert row["precision"] == 0.5 and row["recall"] == 0.5 + bucket = result.diff[ec.DIFF_SPAN_PREFIX + _FUNC] + assert any("10-20" in line for line in bucket["missing"]), bucket + assert any("10-99" in line for line in bucket["extra"]), bucket + + +def test_span_only_grades_co_identified_nodes() -> None: + # (H) cgr has an extra node (start 30) the oracle lacks; it must not be graded. 
+ cgr = _graph(("a.rs", 1, 5), ("a.rs", 30, 40)) + oracle = _graph(("a.rs", 1, 5)) + by_label = {row["label"]: row for row in score_span(cgr, oracle, _KINDS).rows} + row = by_label[_FUNC] + assert row["tp"] == 1 and row["fp"] == 0 and row["fn"] == 0 diff --git a/codebase_rag/tests/test_external_package_name_collision.py b/codebase_rag/tests/test_external_package_name_collision.py new file mode 100644 index 000000000..f5c6d51d7 --- /dev/null +++ b/codebase_rag/tests/test_external_package_name_collision.py @@ -0,0 +1,87 @@ +# (H) L2 residual from the evals/ harness: when cgr is pointed at a directory that +# (H) is itself a package (has __init__.py), a bare absolute import like +# (H) `from mcp.server import X` is the EXTERNAL top-level package, not the internal +# (H) sibling subpackage `.mcp` (which is reachable only as that dotted name +# (H) or relatively). cgr used to mis-resolve it to the internal package. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _build(tmp_path: Path, 
importer: str, src: str) -> _Capture: + (tmp_path / "__init__.py").touch() + mcp = tmp_path / "mcp" + mcp.mkdir() + mcp.joinpath("__init__.py").touch() + mcp.joinpath("server.py").write_text("Thing = 1\n") + (tmp_path / importer).write_text(src) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return cap + + +def _imports(cap: _Capture) -> set[tuple[PropertyValue, PropertyValue]]: + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.IMPORTS + } + + +class TestExternalPackageNameCollision: + def test_bare_absolute_import_is_external_not_internal( + self, tmp_path: Path + ) -> None: + cap = _build( + tmp_path, "client.py", "from mcp.server import Thing\n\nx = Thing\n" + ) + edges = _imports(cap) + assert ("proj.client", "proj.mcp.server") not in edges, edges + assert ("proj.client", "proj.mcp") not in edges, edges + + def test_relative_import_to_subpackage_still_internal(self, tmp_path: Path) -> None: + cap = _build( + tmp_path, "client.py", "from .mcp.server import Thing\n\nx = Thing\n" + ) + edges = _imports(cap) + assert ("proj.client", "proj.mcp.server") in edges, edges diff --git a/codebase_rag/tests/test_function_ingest.py b/codebase_rag/tests/test_function_ingest.py index 814380ce4..1d7b6e8a6 100644 --- a/codebase_rag/tests/test_function_ingest.py +++ b/codebase_rag/tests/test_function_ingest.py @@ -234,7 +234,7 @@ def inner_func(): lang_config = queries[cs.SupportedLanguage.PYTHON]["config"] result = definition_processor._is_method(inner_func, lang_config) - assert result is True + assert result is False class TestFormatNestedQn: @@ -317,7 +317,7 @@ def test_top_level_function( lang_config = queries[cs.SupportedLanguage.PYTHON]["config"] parent_type, parent_qn = definition_processor._determine_function_parent( - func_node, "proj.module", lang_config + func_node, 
"proj.module.my_function", "proj.module", lang_config ) assert parent_type == "Module" assert parent_qn == "proj.module" @@ -342,7 +342,7 @@ def inner(): lang_config = queries[cs.SupportedLanguage.PYTHON]["config"] parent_type, parent_qn = definition_processor._determine_function_parent( - inner_func, "proj.module", lang_config + inner_func, "proj.module.outer.inner", "proj.module", lang_config ) assert parent_type == "Function" assert parent_qn == "proj.module.outer" @@ -466,7 +466,9 @@ def test_basic_function_props( is_exported=False, ) - result = definition_processor._build_function_props(func_node, resolution) + result = definition_processor._build_function_props( + func_node, resolution, "proj.module" + ) assert result["qualified_name"] == "proj.module.my_function" assert result["name"] == "my_function" @@ -497,7 +499,9 @@ def test_exported_function_props( is_exported=True, ) - result = definition_processor._build_function_props(func_node, resolution) + result = definition_processor._build_function_props( + func_node, resolution, "proj.module" + ) assert result["is_exported"] is True diff --git a/codebase_rag/tests/test_function_local_definitions.py b/codebase_rag/tests/test_function_local_definitions.py new file mode 100644 index 000000000..2bd844626 --- /dev/null +++ b/codebase_rag/tests/test_function_local_definitions.py @@ -0,0 +1,111 @@ +# (H) Finding #3 from the evals/ harness: methods of a class defined inside a +# (H) function body (function-local class) were dropped. They are now captured by +# (H) default (CAPTURE_FUNCTION_LOCAL_DEFINITIONS=True); explicitly disabling the +# (H) flag restores the historical behaviour of skipping them. 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.config import settings +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "localproj" + +MODULE_SRC = """class Holder: + def make(self) -> object: + class Local: + def helper(self) -> str: + return "x" + + return Local() +""" + +_RelTuple = tuple[str, PropertyValue, str, str, PropertyValue] + + +class _Capture: + def __init__(self) -> None: + self.nodes: dict[tuple[str, PropertyValue], PropertyDict] = {} + self.rels: list[_RelTuple] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + uid = properties[cs.NODE_UNIQUE_CONSTRAINTS[label]] + self.nodes[(str(label), uid)] = dict(properties) + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append( + ( + str(from_spec[0]), + from_spec[2], + str(rel_type), + str(to_spec[0]), + to_spec[2], + ) + ) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _build(tmp_path: Path) -> _Capture: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return cap + + +def _local_method_lines(cap: _Capture) -> list[int]: + return sorted( + int(props[cs.KEY_START_LINE]) + for (label, _uid), props in cap.nodes.items() + if label == cs.NodeLabel.METHOD + and 
props.get(cs.KEY_NAME) == "helper" + and props.get(cs.KEY_START_LINE) is not None + ) + + +class TestFunctionLocalDefinitions: + def test_default_captures_local_class_methods(self, tmp_path: Path) -> None: + cap = _build(tmp_path) + assert _local_method_lines(cap) == [4] + + defines_method_to_helper = [ + target + for (_fl, _fv, rel_type, _tl, target) in cap.rels + if rel_type == cs.RelationshipType.DEFINES_METHOD + and str(target).endswith(".Local.helper") + ] + assert len(defines_method_to_helper) == 1, defines_method_to_helper + + def test_flag_off_skips_local_class_methods( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr(settings, "CAPTURE_FUNCTION_LOCAL_DEFINITIONS", False) + cap = _build(tmp_path) + assert _local_method_lines(cap) == [] diff --git a/codebase_rag/tests/test_getattr_dispatch.py b/codebase_rag/tests/test_getattr_dispatch.py new file mode 100644 index 000000000..eab8f8e39 --- /dev/null +++ b/codebase_rag/tests/test_getattr_dispatch.py @@ -0,0 +1,101 @@ +# (H) L3 finding from the evals/ harness: JavaTypeResolverMixin._find_registry_entries_under +# (H) does `finder = getattr(self.function_registry, cs.METHOD_FIND_WITH_PREFIX, None)` then +# (H) calls finder(...). The call dispatches to FunctionRegistryTrie.find_with_prefix at +# (H) runtime. Resolving it needs getattr(recv, name) modelled as recv., where the +# (H) name argument is a string literal or a module constant resolved to its string value. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + "pkg/names.py": 'METHOD_DO = "do"\n', + "pkg/helper.py": ( + "class Helper:\n def do(self, value):\n return value\n" + ), + "pkg/worker.py": ( + "from . 
import names\n" + "from .helper import Helper\n\n\n" + "class Worker:\n" + " def __init__(self) -> None:\n" + " self._helper = Helper()\n\n" + " def via_constant(self, value):\n" + " fn = getattr(self._helper, names.METHOD_DO, None)\n" + " if callable(fn):\n" + " return fn(value)\n" + " return None\n\n" + " def via_literal(self, value):\n" + ' fn = getattr(self._helper, "do", None)\n' + " return fn(value)\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestGetattrDispatch: + def test_getattr_with_constant_name_resolves(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.worker.Worker.via_constant", + "proj.pkg.helper.Helper.do", + ) in calls, calls + + def test_getattr_with_string_literal_resolves(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( 
+ "proj.pkg.worker.Worker.via_literal", + "proj.pkg.helper.Helper.do", + ) in calls, calls diff --git a/codebase_rag/tests/test_github_issues_integration.py b/codebase_rag/tests/test_github_issues_integration.py index 2b6bc081f..423945657 100644 --- a/codebase_rag/tests/test_github_issues_integration.py +++ b/codebase_rag/tests/test_github_issues_integration.py @@ -1,7 +1,10 @@ import os from unittest.mock import patch +import pytest + from codebase_rag.config import AppConfig +from codebase_rag.constants import GoogleProviderType class TestGitHubIssuesIntegration: @@ -142,9 +145,6 @@ def test_openai_compatible_endpoints(self) -> None: assert orchestrator.endpoint == "https://api.together.xyz/v1" def test_vertex_ai_enterprise_scenario(self) -> None: - """ - Test enterprise Vertex AI configuration scenario. - """ env_content = { "ORCHESTRATOR_PROVIDER": "google", "ORCHESTRATOR_MODEL": "gemini-2.5-pro", @@ -162,9 +162,63 @@ def test_vertex_ai_enterprise_scenario(self) -> None: assert orchestrator.model_id == "gemini-2.5-pro" assert orchestrator.project_id == "my-enterprise-project" assert orchestrator.region == "us-central1" - assert orchestrator.provider_type == "vertex" + assert orchestrator.provider_type == GoogleProviderType.VERTEX assert orchestrator.service_account_file == "/path/to/service-account.json" + def test_vertex_ai_skips_api_key_validation(self) -> None: + env_content = { + "ORCHESTRATOR_PROVIDER": "google", + "ORCHESTRATOR_MODEL": "gemini-2.5-pro", + "ORCHESTRATOR_PROJECT_ID": "my-project", + "ORCHESTRATOR_REGION": "us-central1", + "ORCHESTRATOR_PROVIDER_TYPE": "vertex", + "ORCHESTRATOR_SERVICE_ACCOUNT_FILE": "/path/to/sa.json", + "CYPHER_PROVIDER": "google", + "CYPHER_MODEL": "gemini-2.5-flash", + "CYPHER_PROJECT_ID": "my-project", + "CYPHER_REGION": "us-central1", + "CYPHER_PROVIDER_TYPE": "vertex", + "CYPHER_SERVICE_ACCOUNT_FILE": "/path/to/sa.json", + } + + with patch.dict(os.environ, env_content): + config = AppConfig() + + orchestrator = 
config.active_orchestrator_config + orchestrator.validate_api_key("orchestrator") + + cypher = config.active_cypher_config + cypher.validate_api_key("cypher") + + def test_vertex_ai_with_google_api_key_env_does_not_error(self) -> None: + env_content = { + "ORCHESTRATOR_PROVIDER": "google", + "ORCHESTRATOR_MODEL": "gemini-2.5-pro", + "ORCHESTRATOR_PROJECT_ID": "my-project", + "ORCHESTRATOR_PROVIDER_TYPE": "vertex", + "ORCHESTRATOR_SERVICE_ACCOUNT_FILE": "/path/to/sa.json", + "GOOGLE_API_KEY": "stray-key-from-env", + } + + with patch.dict(os.environ, env_content): + config = AppConfig() + orchestrator = config.active_orchestrator_config + orchestrator.validate_api_key("orchestrator") + + def test_google_gla_without_api_key_raises(self) -> None: + env_content = { + "ORCHESTRATOR_PROVIDER": "google", + "ORCHESTRATOR_MODEL": "gemini-2.5-pro", + "ORCHESTRATOR_PROVIDER_TYPE": "gla", + "ORCHESTRATOR_API_KEY": "", + } + + with patch.dict(os.environ, env_content): + config = AppConfig() + orchestrator = config.active_orchestrator_config + with pytest.raises(ValueError, match="API Key Missing"): + orchestrator.validate_api_key("orchestrator") + def test_reasoning_model_thinking_budget(self) -> None: """ Test configuration for reasoning models with thinking budget. diff --git a/codebase_rag/tests/test_go_containment_oracle.py b/codebase_rag/tests/test_go_containment_oracle.py new file mode 100644 index 000000000..f801132ed --- /dev/null +++ b/codebase_rag/tests/test_go_containment_oracle.py @@ -0,0 +1,67 @@ +# (H) Covers Go containment-edge validation: cgr's DEFINES (Module->top-level +# (H) func/type) and DEFINES_METHOD (struct Class->receiver method) edges are +# (H) graded against the independent go/ast oracle (evals/oracles/go_ast.go), +# (H) joined on (kind, file, line) endpoints. The sample exercises a same-file +# (H) method and a cross-file method (receiver type declared in another file). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_go_graph +from evals.oracles import go_available, run_go_oracle +from evals.score import score_edge_types + +GO_TYPES = """\ +package demo + +type Shape interface { Area() float64 } + +type Point struct{ X int } + +func (p Point) Area() float64 { return 1.0 } +""" + +GO_MORE = """\ +package demo + +func Free(a int) int { return a + 1 } + +func (p Point) Scale(k int) int { return p.X * k } +""" + + +def _require_go() -> None: + if not go_available(): + pytest.skip("go toolchain not available") + if cs.SupportedLanguage.GO not in load_parsers()[0]: + pytest.skip("go parser not available") + + +def test_cgr_matches_go_oracle_on_containment_edges(tmp_path: Path) -> None: + _require_go() + project = tmp_path / "go_edge_test" + project.mkdir() + (project / "types.go").write_text(GO_TYPES, encoding="utf-8") + (project / "more.go").write_text(GO_MORE, encoding="utf-8") + + cgr = extract_cgr_go_graph(project, project.name) + oracle = run_go_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.DEFINES.value, + cs.RelationshipType.DEFINES_METHOD.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_go_receiver_methods.py b/codebase_rag/tests/test_go_receiver_methods.py new file mode 100644 index 000000000..d5ebd4cc3 --- /dev/null +++ b/codebase_rag/tests/test_go_receiver_methods.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from 
codebase_rag.constants import ( + KEY_QUALIFIED_NAME, + NodeLabel, + RelationshipType, +) +from codebase_rag.tests.conftest import ( + create_and_run_updater, + get_nodes, + get_relationships, +) + + +@pytest.fixture +def go_methods_project(temp_repo: Path) -> Path: + project_path = temp_repo / "go_methods_test" + project_path.mkdir() + (project_path / "go.mod").write_text( + encoding="utf-8", data="module go_methods_test\n\ngo 1.22\n" + ) + (project_path / "shapes.go").write_text( + encoding="utf-8", + data="""package shapes + +type Point struct { +\tX int +\tY int +} + +type Celsius float64 + +func (p Point) Area() float64 { +\treturn 0.0 +} + +func (p *Point) Scale(f float64) { +\tp.X = p.X * int(f) +} + +func (c Celsius) ToFahrenheit() float64 { +\treturn float64(c)*9/5 + 32 +} + +func NewPoint(x int, y int) Point { +\treturn Point{X: x, Y: y} +} +""", + ) + return project_path + + +def _method_qns(mock_ingestor: MagicMock) -> set[str]: + return { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.METHOD) + } + + +def test_go_value_receiver_method_is_method_node( + go_methods_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_methods_project, mock_ingestor, skip_if_missing="go") + project = go_methods_project.name + assert f"{project}.shapes.Point.Area" in _method_qns(mock_ingestor) + + +def test_go_pointer_receiver_method_is_method_node( + go_methods_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_methods_project, mock_ingestor, skip_if_missing="go") + project = go_methods_project.name + assert f"{project}.shapes.Point.Scale" in _method_qns(mock_ingestor) + + +def test_go_defined_type_receiver_method_is_method_node( + go_methods_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_methods_project, mock_ingestor, skip_if_missing="go") + project = go_methods_project.name + assert f"{project}.shapes.Celsius.ToFahrenheit" in 
_method_qns(mock_ingestor) + + +def test_go_free_function_not_a_method( + go_methods_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_methods_project, mock_ingestor, skip_if_missing="go") + project = go_methods_project.name + function_qns = { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.FUNCTION) + } + assert f"{project}.shapes.NewPoint" in function_qns + # (H) A receiver method must not also be emitted as a plain Function. + assert f"{project}.shapes.Area" not in function_qns + assert f"{project}.shapes.Point.Area" not in function_qns + + +def test_go_method_defined_by_receiver_type( + go_methods_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_methods_project, mock_ingestor, skip_if_missing="go") + project = go_methods_project.name + defines_method = get_relationships( + mock_ingestor, RelationshipType.DEFINES_METHOD.value + ) + pairs = {(call[0][0][2], call[0][2][2]) for call in defines_method} + assert (f"{project}.shapes.Point", f"{project}.shapes.Point.Area") in pairs + assert ( + f"{project}.shapes.Celsius", + f"{project}.shapes.Celsius.ToFahrenheit", + ) in pairs + + +@pytest.fixture +def go_crossfile_project(temp_repo: Path) -> Path: + # (H) Same Go package split across two files: the receiver type lives in + # (H) types.go, a method on it lives in ops.go. A Go package spans every + # (H) file in its directory, so the method must bind to the type's node. 
+ project_path = temp_repo / "go_xfile_test" + project_path.mkdir() + (project_path / "go.mod").write_text( + encoding="utf-8", data="module go_xfile_test\n\ngo 1.22\n" + ) + (project_path / "types.go").write_text( + encoding="utf-8", + data="package shapes\n\ntype Point struct {\n\tX int\n}\n", + ) + (project_path / "ops.go").write_text( + encoding="utf-8", + data="package shapes\n\nfunc (p Point) Scale(k int) int {\n\treturn p.X * k\n}\n", + ) + return project_path + + +def test_go_crossfile_method_binds_to_declaring_type( + go_crossfile_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_crossfile_project, mock_ingestor, skip_if_missing="go") + project = go_crossfile_project.name + # (H) Point is declared in types.go, so its Class node and the method's qn + # (H) are anchored to the types module, not the ops module that holds Scale. + assert f"{project}.types.Point.Scale" in _method_qns(mock_ingestor) + defines_method = get_relationships( + mock_ingestor, RelationshipType.DEFINES_METHOD.value + ) + pairs = {(call[0][0][2], call[0][2][2]) for call in defines_method} + assert (f"{project}.types.Point", f"{project}.types.Point.Scale") in pairs diff --git a/codebase_rag/tests/test_go_span_oracle.py b/codebase_rag/tests/test_go_span_oracle.py new file mode 100644 index 000000000..aafd3a334 --- /dev/null +++ b/codebase_rag/tests/test_go_span_oracle.py @@ -0,0 +1,72 @@ +# (H) Covers Go node SPAN (end_line) validation: cgr's end_line for each node is +# (H) graded against the go/ast oracle (which emits each declaration's last-token +# (H) line), joined on (kind, file, start). Exercises a multi-line struct, a +# (H) grouped `type (...)` block, an interface, and a multi-line method body. 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_go_graph +from evals.oracles import go_available, run_go_oracle +from evals.score import score_span + +GO_SRC = """\ +package demo + +type Shape interface { + Area() float64 + Name() string +} + +type Point struct { + X int + Y int +} + +type ( + Meters int + Label string +) + +func (p Point) Area( + scale float64, +) float64 { + return float64(p.X) * scale +} + +func Free(a int) int { + return a + 1 +} +""" + + +def _require_go() -> None: + if not go_available(): + pytest.skip("go toolchain not available") + if cs.SupportedLanguage.GO not in load_parsers()[0]: + pytest.skip("go parser not available") + + +def test_cgr_matches_go_oracle_on_node_spans(tmp_path: Path) -> None: + _require_go() + project = tmp_path / "go_span_test" + project.mkdir() + (project / "demo.go").write_text(GO_SRC, encoding="utf-8") + + cgr = extract_cgr_go_graph(project, project.name) + oracle = run_go_oracle(project) + + result = score_span(cgr, oracle, ec.GO_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 5, aggregate diff --git a/codebase_rag/tests/test_go_structure_oracle.py b/codebase_rag/tests/test_go_structure_oracle.py new file mode 100644 index 000000000..1035cb497 --- /dev/null +++ b/codebase_rag/tests/test_go_structure_oracle.py @@ -0,0 +1,90 @@ +# (H) Covers the Go structure oracle harness (evals/oracles/go_ast.go + +# (H) evals/go_l1.py): the go/ast oracle is authoritative ground truth, and cgr's +# (H) captured Go nodes are graded against it on (kind, file, start_line). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals.cgr_graph import extract_cgr_go_nodes +from evals.oracles import go_available, run_go_oracle +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +GO_SRC = """package shapes + +type Point struct { +\tX int +\tY int +} + +type Shape interface { +\tArea() float64 +} + +type Celsius float64 + +func NewPoint(x int, y int) Point { +\treturn Point{X: x, Y: y} +} + +func (p Point) Area() float64 { +\treturn 0.0 +} +""" + + +def _require_go() -> None: + if not go_available(): + pytest.skip("go toolchain not available") + if cs.SupportedLanguage.GO not in load_parsers()[0]: + pytest.skip("go parser not available") + + +def _go_project(tmp_path: Path) -> Path: + project = tmp_path / "shapes_mod" + project.mkdir() + (project / "go.mod").write_text("module shapes_mod\n\ngo 1.22\n", encoding="utf-8") + (project / "shapes.go").write_text(GO_SRC, encoding="utf-8") + return project + + +def _names(nodes: dict, kind: cs.NodeLabel) -> set[str]: + return {node.name for key, node in nodes.items() if key.kind == kind.value} + + +def test_oracle_labels_go_declarations(tmp_path: Path) -> None: + _require_go() + oracle = run_go_oracle(_go_project(tmp_path)).nodes + assert _names(oracle, cs.NodeLabel.CLASS) == {"Point"} + assert _names(oracle, cs.NodeLabel.INTERFACE) == {"Shape"} + assert _names(oracle, cs.NodeLabel.TYPE) == {"Celsius"} + assert _names(oracle, cs.NodeLabel.FUNCTION) == {"NewPoint"} + # (H) go/ast knows Area has a receiver, so it is a Method, not a Function. 
+ assert _names(oracle, cs.NodeLabel.METHOD) == {"Area"} + + +def test_cgr_matches_oracle_on_type_declarations(tmp_path: Path) -> None: + _require_go() + project = _go_project(tmp_path) + cgr = GraphData( + nodes=extract_cgr_go_nodes(project, project.name), edges=set(), name_edges=set() + ) + oracle = run_go_oracle(project) + + result = score_node_kinds( + cgr, + oracle, + (cs.NodeLabel.CLASS, cs.NodeLabel.INTERFACE, cs.NodeLabel.TYPE), + ) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.NodeLabel.CLASS.value, + cs.NodeLabel.INTERFACE.value, + cs.NodeLabel.TYPE.value, + ): + assert by_label[label]["recall"] == 1.0, (label, by_label[label]) + assert by_label[label]["precision"] == 1.0, (label, by_label[label]) diff --git a/codebase_rag/tests/test_go_type_declarations.py b/codebase_rag/tests/test_go_type_declarations.py new file mode 100644 index 000000000..ee6894df3 --- /dev/null +++ b/codebase_rag/tests/test_go_type_declarations.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.constants import KEY_NAME, NodeLabel +from codebase_rag.tests.conftest import create_and_run_updater, get_nodes + + +@pytest.fixture +def go_types_project(temp_repo: Path) -> Path: + project_path = temp_repo / "go_types_test" + project_path.mkdir() + (project_path / "go.mod").write_text( + encoding="utf-8", data="module go_types_test\n\ngo 1.22\n" + ) + (project_path / "shapes.go").write_text( + encoding="utf-8", + data="""package shapes + +type Point struct { +\tX int +\tY int +} + +type Shape interface { +\tArea() float64 +} + +type Celsius float64 + +type ( +\tWidget struct { +\t\tID int +\t} +\tDrawable interface { +\t\tDraw() string +\t} +\tFahrenheit float64 +) + +func NewPoint(x int, y int) Point { +\treturn Point{X: x, Y: y} +} +""", + ) + return project_path + + +def _names(mock_ingestor: MagicMock, label: NodeLabel) -> set[str]: + return { + 
str(node[0][1].get(KEY_NAME)) + for node in get_nodes(mock_ingestor, label) + if str(node[0][1].get(KEY_NAME)) + } + + +def test_go_struct_captured_as_class( + go_types_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_types_project, mock_ingestor, skip_if_missing="go") + classes = _names(mock_ingestor, NodeLabel.CLASS) + assert "Point" in classes, f"Go struct Point missing from Class nodes: {classes}" + assert "Widget" in classes, ( + f"Grouped Go struct Widget missing from Class nodes: {classes}" + ) + + +def test_go_interface_captured_as_interface( + go_types_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_types_project, mock_ingestor, skip_if_missing="go") + interfaces = _names(mock_ingestor, NodeLabel.INTERFACE) + assert "Shape" in interfaces, ( + f"Go interface Shape missing from Interface nodes: {interfaces}" + ) + assert "Drawable" in interfaces, ( + f"Grouped Go interface Drawable missing from Interface nodes: {interfaces}" + ) + + +def test_go_type_alias_captured_as_type( + go_types_project: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(go_types_project, mock_ingestor, skip_if_missing="go") + types = _names(mock_ingestor, NodeLabel.TYPE) + assert "Celsius" in types, ( + f"Go defined type Celsius missing from Type nodes: {types}" + ) + assert "Fahrenheit" in types, ( + f"Grouped Go defined type Fahrenheit missing from Type nodes: {types}" + ) diff --git a/codebase_rag/tests/test_graph_service.py b/codebase_rag/tests/test_graph_service.py index c31b30741..76e5a6ed2 100644 --- a/codebase_rag/tests/test_graph_service.py +++ b/codebase_rag/tests/test_graph_service.py @@ -5,7 +5,13 @@ import pytest from codebase_rag.constants import NODE_UNIQUE_CONSTRAINTS -from codebase_rag.cypher_queries import wrap_with_unwind +from codebase_rag.cypher_queries import ( + build_create_node_query, + build_create_relationship_query, + build_merge_node_query, + build_merge_relationship_query, + 
wrap_with_unwind, +) from codebase_rag.services.graph_service import MemgraphIngestor @@ -38,13 +44,63 @@ def test_init_creates_empty_buffers(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) assert ingestor.node_buffer == [] - assert ingestor.relationship_buffer == [] + assert ingestor._rel_count == 0 def test_init_conn_is_none(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) assert ingestor.conn is None + def test_init_stores_auth_credentials(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username="user", password="pass" + ) + + assert ingestor._username == "user" + assert ingestor._password == "pass" + + def test_init_defaults_auth_to_none(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + + assert ingestor._username is None + assert ingestor._password is None + + def test_init_raises_for_username_without_password(self) -> None: + with pytest.raises(ValueError, match="Both username and password"): + MemgraphIngestor(host="localhost", port=7687, username="user") + + def test_init_raises_for_password_without_username(self) -> None: + with pytest.raises(ValueError, match="Both username and password"): + MemgraphIngestor(host="localhost", port=7687, password="pass") + + def test_init_normalizes_empty_strings_to_none(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username="", password="" + ) + + assert ingestor._username is None + assert ingestor._password is None + + def test_init_normalizes_whitespace_only_to_none(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username=" ", password=" " + ) + + assert ingestor._username is None + assert ingestor._password is None + + def test_init_strips_whitespace_from_credentials(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username=" user ", password=" pass " + ) + + assert ingestor._username == "user" + assert ingestor._password == "pass" 
+ + def test_init_raises_for_empty_password_with_valid_username(self) -> None: + with pytest.raises(ValueError, match="Both username and password"): + MemgraphIngestor(host="localhost", port=7687, username="user", password="") + class TestContextManager: def test_enter_connects_to_memgraph(self) -> None: @@ -60,12 +116,36 @@ def test_enter_connects_to_memgraph(self) -> None: assert mock_conn.autocommit is True assert result is ingestor + def test_enter_passes_auth_when_provided(self) -> None: + with patch("codebase_rag.services.graph_service.mgclient") as mock_mgclient: + mock_conn = MagicMock() + mock_mgclient.connect.return_value = mock_conn + + ingestor = MemgraphIngestor( + host="testhost", port=1234, username="user", password="pass" + ) + ingestor.__enter__() + + mock_mgclient.connect.assert_called_once_with( + host="testhost", port=1234, username="user", password="pass" + ) + + def test_enter_omits_auth_when_not_provided(self) -> None: + with patch("codebase_rag.services.graph_service.mgclient") as mock_mgclient: + mock_conn = MagicMock() + mock_mgclient.connect.return_value = mock_conn + + ingestor = MemgraphIngestor(host="testhost", port=1234) + ingestor.__enter__() + + mock_mgclient.connect.assert_called_once_with(host="testhost", port=1234) + def test_exit_flushes_and_closes_connection(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) mock_conn = MagicMock() ingestor.conn = mock_conn - with patch.object(ingestor, "flush_all") as mock_flush: + with patch.object(MemgraphIngestor, "flush_all") as mock_flush: ingestor.__exit__(None, None, None) mock_flush.assert_called_once() @@ -76,7 +156,7 @@ def test_exit_logs_error_on_exception(self) -> None: mock_conn = MagicMock() ingestor.conn = mock_conn - with patch.object(ingestor, "flush_all"): + with patch.object(MemgraphIngestor, "flush_all"): ingestor.__exit__(ValueError, ValueError("test error"), None) mock_conn.close.assert_called_once() @@ -85,7 +165,7 @@ def 
test_exit_handles_none_connection(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) ingestor.conn = None - with patch.object(ingestor, "flush_all"): + with patch.object(MemgraphIngestor, "flush_all"): ingestor.__exit__(None, None, None) @@ -206,19 +286,13 @@ def test_suppresses_already_exists_errors_in_logs(self) -> None: ingestor._execute_query("CREATE CONSTRAINT") -class TestExecuteBatch: - def test_returns_early_when_not_connected(self) -> None: - ingestor = MemgraphIngestor(host="localhost", port=7687) - ingestor.conn = None - - ingestor._execute_batch("MERGE (n:Test)", [{"id": 1}]) - +class TestExecuteBatchOn: def test_returns_early_when_params_empty(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) mock_conn = MagicMock() ingestor.conn = mock_conn - ingestor._execute_batch("MERGE (n:Test)", []) + ingestor._execute_batch_on(mock_conn, "MERGE (n:Test)", []) mock_conn.cursor.assert_not_called() @@ -229,7 +303,9 @@ def test_wraps_query_with_unwind(self) -> None: mock_conn.cursor.return_value = mock_cursor ingestor.conn = mock_conn - ingestor._execute_batch("MERGE (n:Test {id: row.id})", [{"id": 1}, {"id": 2}]) + ingestor._execute_batch_on( + mock_conn, "MERGE (n:Test {id: row.id})", [{"id": 1}, {"id": 2}] + ) call_args = mock_cursor.execute.call_args[0] assert call_args[0] == wrap_with_unwind("MERGE (n:Test {id: row.id})") @@ -242,7 +318,7 @@ def test_closes_cursor_on_success(self) -> None: mock_conn.cursor.return_value = mock_cursor ingestor.conn = mock_conn - ingestor._execute_batch("MERGE (n:Test)", [{"id": 1}]) + ingestor._execute_batch_on(mock_conn, "MERGE (n:Test)", [{"id": 1}]) mock_cursor.close.assert_called_once() @@ -251,7 +327,7 @@ class TestCleanDatabase: def test_executes_delete_query(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) - with patch.object(ingestor, "_execute_query") as mock_execute: + with patch.object(MemgraphIngestor, "_execute_query") as mock_execute: 
ingestor.clean_database() mock_execute.assert_called_once_with("MATCH (n) DETACH DELETE n;") @@ -265,7 +341,9 @@ def test_creates_constraint_for_each_node_type(self) -> None: def capture_query(query: str) -> None: executed_queries.append(query) - with patch.object(ingestor, "_execute_query", side_effect=capture_query): + with patch.object( + MemgraphIngestor, "_execute_query", side_effect=capture_query + ): ingestor.ensure_constraints() for label, prop in NODE_UNIQUE_CONSTRAINTS.items(): @@ -282,7 +360,9 @@ def fail_then_succeed(query: str) -> None: if call_count == 1: raise RuntimeError("Constraint already exists") - with patch.object(ingestor, "_execute_query", side_effect=fail_then_succeed): + with patch.object( + MemgraphIngestor, "_execute_query", side_effect=fail_then_succeed + ): ingestor.ensure_constraints() expected_queries = len(NODE_UNIQUE_CONSTRAINTS) * 2 @@ -384,7 +464,7 @@ def mock_fetch_all(query: str, params: dict | None = None) -> list[dict]: return [{"node_id": 1}, {"node_id": 2}, {"node_id": 3}] return [{"from_id": 1, "to_id": 2}] - with patch.object(ingestor, "fetch_all", side_effect=mock_fetch_all): + with patch.object(MemgraphIngestor, "fetch_all", side_effect=mock_fetch_all): result = ingestor.export_graph_to_dict() assert result["metadata"]["total_nodes"] == 3 @@ -396,8 +476,8 @@ def test_calls_flush_nodes_and_flush_relationships(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) with ( - patch.object(ingestor, "flush_nodes") as mock_nodes, - patch.object(ingestor, "flush_relationships") as mock_rels, + patch.object(MemgraphIngestor, "flush_nodes") as mock_nodes, + patch.object(MemgraphIngestor, "flush_relationships") as mock_rels, ): ingestor.flush_all() @@ -407,20 +487,36 @@ def test_calls_flush_nodes_and_flush_relationships(self) -> None: class TestFetchAllAndExecuteWrite: def test_fetch_all_delegates_to_execute_query(self) -> None: + from codebase_rag.config import settings + ingestor = 
MemgraphIngestor(host="localhost", port=7687) with patch.object( - ingestor, "_execute_query", return_value=[{"n": "result"}] + MemgraphIngestor, "_execute_query", return_value=[{"n": "result"}] ) as mock_exec: result = ingestor.fetch_all("MATCH (n) RETURN n", {"limit": 10}) - mock_exec.assert_called_once_with("MATCH (n) RETURN n", {"limit": 10}) + expected_query = ( + f"MATCH (n) RETURN n QUERY MEMORY LIMIT " + f"{settings.QUERY_MEMORY_LIMIT_MB} MB;" + ) + mock_exec.assert_called_once_with(expected_query, {"limit": 10}) assert result == [{"n": "result"}] + def test_fetch_all_preserves_existing_memory_limit(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + query_with_hint = "MATCH (n) RETURN n QUERY MEMORY LIMIT 512 MB;" + + with patch.object( + MemgraphIngestor, "_execute_query", return_value=[] + ) as mock_exec: + ingestor.fetch_all(query_with_hint) + mock_exec.assert_called_once_with(query_with_hint, None) + def test_execute_write_delegates_to_execute_query(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) - with patch.object(ingestor, "_execute_query") as mock_exec: + with patch.object(MemgraphIngestor, "_execute_query") as mock_exec: ingestor.execute_write("CREATE (n:Test)", {"name": "test"}) mock_exec.assert_called_once_with("CREATE (n:Test)", {"name": "test"}) @@ -434,3 +530,187 @@ def test_returns_iso_format_timestamp(self) -> None: assert "T" in result assert len(result) > 10 + + +class TestCreateMode: + def test_default_use_merge_is_true(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + assert ingestor._use_merge is True + + def test_use_merge_false(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687, use_merge=False) + assert ingestor._use_merge is False + + def test_flush_nodes_uses_merge_query_by_default(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687, batch_size=10) + mock_conn = MagicMock() + mock_cursor = MagicMock() + 
mock_conn.cursor.return_value = mock_cursor + ingestor.conn = mock_conn + + ingestor.node_buffer.append(("File", {"path": "/test.py", "name": "test"})) + ingestor.flush_nodes() + + call_args = mock_cursor.execute.call_args[0][0] + assert "MERGE" in call_args + assert "CREATE" not in call_args.split("MERGE")[0] + + def test_flush_nodes_uses_create_query_when_merge_disabled(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, batch_size=10, use_merge=False + ) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + ingestor.conn = mock_conn + + ingestor.node_buffer.append(("File", {"path": "/test.py", "name": "test"})) + ingestor.flush_nodes() + + call_args = mock_cursor.execute.call_args[0][0] + assert "CREATE" in call_args + assert "MERGE" not in call_args + + def test_flush_relationships_uses_merge_query_by_default(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687, batch_size=10) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_cursor.description = [MagicMock(name="created")] + mock_cursor.description[0].name = "created" + mock_cursor.fetchall.return_value = [(1,)] + ingestor.conn = mock_conn + + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.flush_relationships() + + call_args = mock_cursor.execute.call_args[0][0] + assert "MERGE" in call_args + + def test_flush_relationships_uses_create_query_when_merge_disabled(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, batch_size=10, use_merge=False + ) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_cursor.description = [MagicMock(name="created")] + mock_cursor.description[0].name = "created" + mock_cursor.fetchall.return_value = [(1,)] + ingestor.conn = mock_conn + + ingestor.ensure_relationship_batch( + ("File", 
"path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.flush_relationships() + + call_args = mock_cursor.execute.call_args[0][0] + assert "CREATE" in call_args + assert "MERGE" not in call_args + + +class TestPreGroupedRelBuffer: + def test_rel_groups_populated_on_ensure(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + assert len(ingestor._rel_groups) == 1 + + def test_rel_groups_groups_by_pattern(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/c.py") + ) + ingestor.ensure_relationship_batch( + ("Module", "qualified_name", "mod_a"), + "DEFINES", + ("Function", "qualified_name", "func_b"), + ) + assert len(ingestor._rel_groups) == 2 + pattern = ("File", "path", "IMPORTS", "File", "path") + assert len(ingestor._rel_groups[pattern]) == 2 + + def test_rel_groups_cleared_after_flush(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_cursor.description = [MagicMock(name="created")] + mock_cursor.description[0].name = "created" + mock_cursor.fetchall.return_value = [(1,)] + ingestor.conn = mock_conn + + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.flush_relationships() + + assert len(ingestor._rel_groups) == 0 + + def test_rel_groups_empty_on_init(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + assert len(ingestor._rel_groups) == 0 + + def test_rel_groups_correct_batch_row_values(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + 
ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), + "IMPORTS", + ("File", "path", "/b.py"), + {"weight": 1}, + ) + pattern = ("File", "path", "IMPORTS", "File", "path") + rows = ingestor._rel_groups[pattern] + assert len(rows) == 1 + assert rows[0]["from_val"] == "/a.py" + assert rows[0]["to_val"] == "/b.py" + assert rows[0]["props"] == {"weight": 1} + + +class TestSlots: + def test_has_slots(self) -> None: + assert hasattr(MemgraphIngestor, "__slots__") + + def test_no_dict(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + assert not hasattr(ingestor, "__dict__") + + +class TestCypherCreateQueries: + def test_build_create_node_query(self) -> None: + query = build_create_node_query("File", "path") + assert "CREATE" in query + assert "MERGE" not in query + assert "path: row.id" in query + + def test_build_create_relationship_query(self) -> None: + query = build_create_relationship_query( + "File", "path", "IMPORTS", "File", "path" + ) + assert "CREATE (a)-[r:IMPORTS]->(b)" in query + assert "MERGE" not in query + + def test_build_create_relationship_query_with_props(self) -> None: + query = build_create_relationship_query( + "File", "path", "IMPORTS", "File", "path", has_props=True + ) + assert "SET r += row.props" in query + assert "CREATE (a)-[r:IMPORTS]->(b)" in query + + def test_build_merge_node_query_unchanged(self) -> None: + query = build_merge_node_query("File", "path") + assert "MERGE" in query + assert "CREATE" not in query + + def test_build_merge_relationship_query_unchanged(self) -> None: + query = build_merge_relationship_query( + "File", "path", "IMPORTS", "File", "path" + ) + assert "MERGE" in query + assert "CREATE" not in query.replace("MERGE", "") diff --git a/codebase_rag/tests/test_graph_service_calls_failure_logging.py b/codebase_rag/tests/test_graph_service_calls_failure_logging.py index 2af717f06..6bb8f2e99 100644 --- a/codebase_rag/tests/test_graph_service_calls_failure_logging.py +++ 
b/codebase_rag/tests/test_graph_service_calls_failure_logging.py @@ -56,8 +56,8 @@ def test_calls_failure_logging_single_batch( ) with patch.object( - graph_service, - "_execute_batch_with_return", + MemgraphIngestor, + "_execute_batch_with_return_on", return_value=[{"created": 1}, {"created": 0}, {"created": 0}], ): graph_service.flush_relationships() @@ -72,13 +72,6 @@ def test_calls_failure_logging_single_batch( def test_calls_failure_logging_multiple_batches( graph_service: MemgraphIngestor, log_messages: list[str] ) -> None: - """Test that CALLS failures are logged correctly across multiple batches. - - This is the critical test case that validates the bug fix: - - Previously, the code used cumulative totals (total_attempted - total_successful) - - This would incorrectly report failures for batches after the first one - - Now it correctly uses batch-specific counts (len(params_list) - batch_successful) - """ graph_service.ensure_relationship_batch( ("Method", "qualified_name", "project.module.ClassA.methodA()"), "CALLS", @@ -104,14 +97,16 @@ def test_calls_failure_logging_multiple_batches( call_count = 0 def mock_execute_batch( - query: str, params_list: list[dict[str, Any]] + conn: Any, query: str, params_list: list[dict[str, Any]] ) -> list[dict[str, int]]: nonlocal call_count call_count += 1 return [{"created": 1}, {"created": 0}] with patch.object( - graph_service, "_execute_batch_with_return", side_effect=mock_execute_batch + MemgraphIngestor, + "_execute_batch_with_return_on", + side_effect=mock_execute_batch, ): graph_service.flush_relationships() @@ -127,7 +122,6 @@ def mock_execute_batch( def test_calls_success_no_failure_logging( graph_service: MemgraphIngestor, log_messages: list[str] ) -> None: - """Test that successful CALLS don't trigger failure warnings.""" graph_service.ensure_relationship_batch( ("Method", "qualified_name", "project.module.ClassA.methodA()"), "CALLS", @@ -140,8 +134,8 @@ def test_calls_success_no_failure_logging( ) with 
patch.object( - graph_service, - "_execute_batch_with_return", + MemgraphIngestor, + "_execute_batch_with_return_on", return_value=[{"created": 1}, {"created": 1}], ): graph_service.flush_relationships() @@ -154,7 +148,6 @@ def test_calls_success_no_failure_logging( def test_non_calls_relationships_no_failure_logging( graph_service: MemgraphIngestor, log_messages: list[str] ) -> None: - """Test that failures in non-CALLS relationships don't trigger CALLS-specific logging.""" graph_service.ensure_relationship_batch( ("Module", "qualified_name", "project.moduleA"), "IMPORTS", @@ -167,8 +160,8 @@ def test_non_calls_relationships_no_failure_logging( ) with patch.object( - graph_service, - "_execute_batch_with_return", + MemgraphIngestor, + "_execute_batch_with_return_on", return_value=[{"created": 1}, {"created": 0}], ): graph_service.flush_relationships() diff --git a/codebase_rag/tests/test_graph_updater_embeddings.py b/codebase_rag/tests/test_graph_updater_embeddings.py new file mode 100644 index 000000000..c88fc54ac --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_embeddings.py @@ -0,0 +1,335 @@ +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.services.graph_service import MemgraphIngestor +from codebase_rag.types_defs import ResultRow + +MOCK_EMBEDDING = [0.1] * 768 + + +def _fake_embed_batch(snippets: list[str], **_kwargs: object) -> list[list[float]]: + return [MOCK_EMBEDDING for _ in snippets] + + +_PATCH_DEPS = patch( + "codebase_rag.graph_updater.has_semantic_dependencies", return_value=True +) +_PATCH_EMBED_BATCH = patch( + "codebase_rag.embedder.embed_code_batch", side_effect=_fake_embed_batch +) +_PATCH_STORE_BATCH = patch( + "codebase_rag.vector_store.store_embedding_batch", side_effect=lambda pts: len(pts) +) +_PATCH_RECONCILE = patch( + 
"codebase_rag.vector_store.verify_stored_ids", side_effect=lambda ids: ids +) + + +@pytest.fixture +def query_ingestor() -> MagicMock: + mock = MagicMock(spec=MemgraphIngestor) + mock.fetch_all = MagicMock(return_value=[]) + mock.execute_write = MagicMock() + return mock + + +@pytest.fixture +def updater_with_query(temp_repo: Path, query_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=query_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +class TestCypherQueryEmbeddingsStructure: + def test_contains_starts_with_project_name(self) -> None: + assert "STARTS WITH" in cs.CYPHER_QUERY_EMBEDDINGS + assert "$project_name" in cs.CYPHER_QUERY_EMBEDDINGS + + def test_returns_required_columns(self) -> None: + query = cs.CYPHER_QUERY_EMBEDDINGS.upper() + for col in ["NODE_ID", "QUALIFIED_NAME", "START_LINE", "END_LINE", "PATH"]: + assert col in query + + def test_dot_concatenation_is_parenthesized(self) -> None: + assert "($project_name + '.')" in cs.CYPHER_QUERY_EMBEDDINGS + + def test_no_bare_starts_with_plus(self) -> None: + for line in cs.CYPHER_QUERY_EMBEDDINGS.splitlines(): + stripped = line.strip() + if "STARTS WITH" in stripped and "$project_name" in stripped: + assert "($project_name" in stripped, ( + f"$project_name + '.' 
must be parenthesized in: {stripped!r}" + ) + + +class TestGenerateSemanticEmbeddings: + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_passes_project_name_without_trailing_dot( + self, + _mock_reconcile: MagicMock, + _mock_store_batch: MagicMock, + _mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + query_ingestor.fetch_all.return_value = [] + updater_with_query._generate_semantic_embeddings() + + params = query_ingestor.fetch_all.call_args[0][1] + project_name_param = params["project_name"] + assert not project_name_param.endswith("."), ( + f"project_name should not have trailing dot, got: {project_name_param!r}" + ) + + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_uses_cypher_query_embeddings_constant( + self, + _mock_reconcile: MagicMock, + _mock_store_batch: MagicMock, + _mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + query_ingestor.fetch_all.return_value = [] + updater_with_query._generate_semantic_embeddings() + + query_arg = query_ingestor.fetch_all.call_args[0][0] + assert query_arg == cs.CYPHER_QUERY_EMBEDDINGS + + @patch("codebase_rag.graph_updater.has_semantic_dependencies", return_value=False) + def test_skips_when_no_semantic_dependencies( + self, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + updater_with_query._generate_semantic_embeddings() + query_ingestor.fetch_all.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_returns_early_on_empty_results( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + _mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + 
query_ingestor.fetch_all.return_value = [] + updater_with_query._generate_semantic_embeddings() + mock_store_batch.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_embeds_valid_function_with_source( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "module.py").write_text("def hello():\n return 42\n") + row: ResultRow = { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "myproject.module.hello", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "module.py", + } + query_ingestor.fetch_all.return_value = [row] + + updater_with_query._generate_semantic_embeddings() + + mock_embed_batch.assert_called_once() + snippets_arg = mock_embed_batch.call_args[0][0] + assert len(snippets_arg) == 1 + assert "def hello()" in snippets_arg[0] + mock_store_batch.assert_called_once() + batch_arg = mock_store_batch.call_args[0][0] + assert len(batch_arg) == 1 + assert batch_arg[0] == (1, MOCK_EMBEDDING, "myproject.module.hello") + + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_skips_row_with_missing_source_info( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + row: ResultRow = { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "myproject.module.hello", + } + query_ingestor.fetch_all.return_value = [row] + + updater_with_query._generate_semantic_embeddings() + + mock_embed_batch.assert_not_called() + mock_store_batch.assert_not_called() + + @patch("codebase_rag.graph_updater.has_semantic_dependencies", return_value=True) + @patch( + "codebase_rag.embedder.embed_code_batch", + side_effect=RuntimeError("model error"), + ) + @_PATCH_STORE_BATCH 
+ @_PATCH_RECONCILE + def test_handles_embed_failure_gracefully( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + _mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "module.py").write_text("def hello():\n return 42\n") + row: ResultRow = { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "myproject.module.hello", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "module.py", + } + query_ingestor.fetch_all.return_value = [row] + + updater_with_query._generate_semantic_embeddings() + + mock_store_batch.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_skips_unparseable_rows( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + bad_row: ResultRow = { + cs.KEY_NODE_ID: "not_an_int", + cs.KEY_QUALIFIED_NAME: "pkg.func", + } + query_ingestor.fetch_all.return_value = [bad_row] + + updater_with_query._generate_semantic_embeddings() + + mock_embed_batch.assert_not_called() + mock_store_batch.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_counts_embedded_functions( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "a.py").write_text("def f1():\n pass\n") + (temp_repo / "b.py").write_text("def f2():\n pass\n") + rows: list[ResultRow] = [ + { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "proj.a.f1", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "a.py", + }, + { + cs.KEY_NODE_ID: 2, + cs.KEY_QUALIFIED_NAME: "proj.b.f2", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 
2, + cs.KEY_PATH: "b.py", + }, + ] + query_ingestor.fetch_all.return_value = rows + + updater_with_query._generate_semantic_embeddings() + + mock_embed_batch.assert_called_once() + snippets_arg = mock_embed_batch.call_args[0][0] + assert len(snippets_arg) == 2 + mock_store_batch.assert_called_once() + batch_arg = mock_store_batch.call_args[0][0] + assert len(batch_arg) == 2 + + +class TestBatchedEmbeddingDispatch: + @_PATCH_DEPS + @_PATCH_EMBED_BATCH + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_dispatches_single_batch_call_for_multiple_snippets( + self, + _mock_reconcile: MagicMock, + _mock_store_batch: MagicMock, + mock_embed_batch: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "a.py").write_text("def f1():\n return 1\n") + (temp_repo / "b.py").write_text("def f2():\n return 2\n") + (temp_repo / "c.py").write_text("def f3():\n return 3\n") + rows: list[ResultRow] = [ + { + cs.KEY_NODE_ID: i + 1, + cs.KEY_QUALIFIED_NAME: f"proj.{name}.f{i + 1}", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: f"{name}.py", + } + for i, name in enumerate(("a", "b", "c")) + ] + query_ingestor.fetch_all.return_value = rows + + updater_with_query._generate_semantic_embeddings() + + assert mock_embed_batch.call_count == 1 + snippets_arg = mock_embed_batch.call_args[0][0] + assert len(snippets_arg) == 3 diff --git a/codebase_rag/tests/test_graph_updater_incremental.py b/codebase_rag/tests/test_graph_updater_incremental.py new file mode 100644 index 000000000..788e15358 --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_incremental.py @@ -0,0 +1,457 @@ +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import ( + BoundedASTCache, + FunctionRegistryTrie, + GraphUpdater, + _hash_file, + _hash_file_with_bytes, + _load_hash_cache, + 
_save_hash_cache, +) +from codebase_rag.parser_loader import load_parsers + + +@pytest.fixture +def updater(temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def py_project(temp_repo: Path) -> Path: + (temp_repo / "__init__.py").touch() + (temp_repo / "module_a.py").write_text("def func_a():\n pass\n") + (temp_repo / "module_b.py").write_text("def func_b():\n pass\n") + return temp_repo + + +class TestHashFile: + def test_hash_returns_hex_string(self, temp_repo: Path) -> None: + f = temp_repo / "test.py" + f.write_text("hello") + result = _hash_file(f) + assert isinstance(result, str) + assert len(result) == 32 + + def test_same_content_same_hash(self, temp_repo: Path) -> None: + f1 = temp_repo / "a.py" + f2 = temp_repo / "b.py" + f1.write_text("same content") + f2.write_text("same content") + assert _hash_file(f1) == _hash_file(f2) + + def test_different_content_different_hash(self, temp_repo: Path) -> None: + f1 = temp_repo / "a.py" + f2 = temp_repo / "b.py" + f1.write_text("content one") + f2.write_text("content two") + assert _hash_file(f1) != _hash_file(f2) + + def test_hash_with_bytes_returns_none_for_broken_symlink( + self, temp_repo: Path + ) -> None: + link = temp_repo / "result" + link.symlink_to(temp_repo / "missing-target") + assert _hash_file_with_bytes(link) is None + + def test_hash_with_bytes_returns_none_for_missing_file( + self, temp_repo: Path + ) -> None: + assert _hash_file_with_bytes(temp_repo / "does-not-exist") is None + + +class TestHashCacheIO: + def test_save_and_load_cache(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + data = {"module_a.py": "abc123", "module_b.py": "def456"} + _save_hash_cache(cache_path, data) + + assert cache_path.is_file() + loaded = _load_hash_cache(cache_path) + assert loaded == data + + def 
test_load_nonexistent_returns_empty(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + assert _load_hash_cache(cache_path) == {} + + def test_load_corrupted_returns_empty(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + cache_path.write_text("not valid json {{{") + assert _load_hash_cache(cache_path) == {} + + def test_save_creates_parent_dirs(self, temp_repo: Path) -> None: + cache_path = temp_repo / "subdir" / "nested" / cs.HASH_CACHE_FILENAME + _save_hash_cache(cache_path, {"a.py": "hash1"}) + assert cache_path.is_file() + + def test_cache_file_is_valid_json(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + data = {"file.py": "sha256hash"} + _save_hash_cache(cache_path, data) + with cache_path.open() as f: + parsed = json.load(f) + assert parsed == data + + +class TestIncrementalUpdates: + def test_unchanged_file_is_skipped( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + mock_ingestor.reset_mock() + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run() + assert spy.call_count == 0 + + def test_changed_file_is_reparsed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_a.py").write_text("def func_a_updated():\n pass\n") + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, 
"_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run() + processed_paths = [call.args[0] for call in spy.call_args_list] + assert py_project / "module_a.py" in processed_paths + + def test_deleted_file_removed_from_state( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_b.py").unlink() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "remove_file_from_state", wraps=updater2.remove_file_from_state + ) as spy: + updater2.run() + removed_paths = [call.args[0] for call in spy.call_args_list] + assert py_project / "module_b.py" in removed_paths + + def test_force_bypasses_cache( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run(force=True) + assert spy.call_count > 0 + + def test_new_file_is_processed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_c.py").write_text("def func_c():\n pass\n") + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file 
+ ) as spy: + updater2.run() + processed_paths = [call.args[0] for call in spy.call_args_list] + assert py_project / "module_c.py" in processed_paths + + def test_hash_cache_file_created_after_run( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + cache_path = py_project / cs.HASH_CACHE_FILENAME + assert not cache_path.exists() + + updater.run() + + assert cache_path.is_file() + with cache_path.open() as f: + data = json.load(f) + assert isinstance(data, dict) + assert len(data) > 0 + + def test_broken_symlink_does_not_crash_indexing( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + broken = py_project / "result" + broken.symlink_to(py_project / "missing-nix-store-path") + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run() + + cache_path = py_project / cs.HASH_CACHE_FILENAME + assert cache_path.is_file() + with cache_path.open() as f: + data = json.load(f) + assert "result" not in data + assert "module_a.py" in data + + def test_deleted_file_removed_from_hash_cache( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + cache_path = py_project / cs.HASH_CACHE_FILENAME + with cache_path.open() as f: + old_data = json.load(f) + assert "module_b.py" in old_data + + (py_project / "module_b.py").unlink() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater2.run() + + with cache_path.open() as f: + new_data = json.load(f) + assert "module_b.py" not in new_data + + +class TestFastPathInSync: + def 
test_second_run_skips_all_passes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + assert updater2._is_already_in_sync() is True + with ( + patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy_files, + patch.object(updater2, "_process_function_calls") as spy_calls, + ): + updater2.run() + assert spy_files.call_count == 0 + assert spy_calls.call_count == 0 + + def test_changed_file_disables_fast_path( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_a.py").write_text("def func_a():\n return 1\n") + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + assert updater2._is_already_in_sync() is False + + def test_new_file_disables_fast_path( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_c.py").write_text("def func_c():\n pass\n") + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + assert updater2._is_already_in_sync() is False + + def test_deleted_file_disables_fast_path( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + 
queries=queries, + ) + updater.run() + + (py_project / "module_a.py").unlink() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + assert updater2._is_already_in_sync() is False + + def test_no_hash_cache_disables_fast_path( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + assert updater._is_already_in_sync() is False + + def test_force_bypasses_fast_path( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object(updater2, "_process_function_calls") as spy_calls: + updater2.run(force=True) + spy_calls.assert_called_once() + + +class TestSlots: + def test_function_registry_trie_has_slots(self) -> None: + assert hasattr(FunctionRegistryTrie, "__slots__") + trie = FunctionRegistryTrie() + with pytest.raises(AttributeError): + trie.nonexistent_attr = "value" # type: ignore[attr-defined] + + def test_bounded_ast_cache_has_slots(self) -> None: + assert hasattr(BoundedASTCache, "__slots__") + cache = BoundedASTCache() + with pytest.raises(AttributeError): + cache.nonexistent_attr = "value" # type: ignore[attr-defined] diff --git a/codebase_rag/tests/test_graph_updater_incremental_rename.py b/codebase_rag/tests/test_graph_updater_incremental_rename.py new file mode 100644 index 000000000..ae6fc786b --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_incremental_rename.py @@ -0,0 +1,190 @@ +# (H) Regression tests for Codeberg issue #1: incremental rebuild used to leave +# (H) stale Function/DEFINES/IMPORTS/CALLS 
entities when a symbol was renamed +# (H) across files, because the incremental path was additive-only. After the +# (H) fix, an incremental rebuild after a rename must yield exactly the same +# (H) graph as a fresh full rebuild of the renamed tree. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT_NAME = "testproj" + +NodeId = tuple[str, PropertyValue] +RelTuple = tuple[str, str, PropertyValue, str, str, str, PropertyValue] + +_DEFINES_EDGES = (cs.RelationshipType.DEFINES, cs.RelationshipType.DEFINES_METHOD) + + +class InMemoryGraph: + """Minimal in-memory ingestor that applies the exact node/relationship + writes and the DETACH-DELETE queries the updater issues, so final graph + state can be compared between incremental and full rebuilds.""" + + def __init__(self) -> None: + self.nodes: dict[NodeId, PropertyDict] = {} + self.rels: set[RelTuple] = set() + + # (H) IngestorProtocol + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + uid = properties[NODE_UNIQUE_KEYS[label]] + self.nodes[(str(label), uid)] = dict(properties) + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + fl, fk, fv = from_spec + tl, tk, tv = to_spec + self.rels.add((str(fl), str(fk), fv, str(rel_type), str(tl), str(tk), tv)) + + def flush_all(self) -> None: + return None + + # (H) QueryProtocol + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + params = params or {} + path = params.get(cs.KEY_PATH) + match query: + 
case cs.CYPHER_DELETE_MODULE: + self._delete_module_subtree(path) + case cs.CYPHER_DELETE_FILE: + self._delete_node_by_path(cs.NodeLabel.FILE, path) + case cs.CYPHER_DELETE_FOLDER: + self._delete_node_by_path(cs.NodeLabel.FOLDER, path) + case _: + return None + + # (H) delete helpers + def _find_nodes(self, label: str, key: str, val: PropertyValue) -> list[NodeId]: + return [ + nid + for nid, props in self.nodes.items() + if nid[0] == label and props.get(key) == val + ] + + def _delete_module_subtree(self, path: PropertyValue) -> None: + seeds = [ + nid + for nid, props in self.nodes.items() + if nid[0] == cs.NodeLabel.MODULE and props.get(cs.KEY_PATH) == path + ] + to_delete: set[NodeId] = set() + stack = list(seeds) + while stack: + nid = stack.pop() + if nid in to_delete: + continue + to_delete.add(nid) + props = self.nodes[nid] + for fl, fk, fv, rt, tl, tk, tv in self.rels: + if rt in _DEFINES_EDGES and fl == nid[0] and props.get(fk) == fv: + for child in self._find_nodes(tl, tk, tv): + if child not in to_delete: + stack.append(child) + self._purge_nodes(to_delete) + + def _delete_node_by_path(self, label: str, path: PropertyValue) -> None: + self._purge_nodes(set(self._find_nodes(label, cs.KEY_PATH, path))) + + def _purge_nodes(self, to_delete: set[NodeId]) -> None: + deleted_props = {nid: self.nodes[nid] for nid in to_delete} + for nid in to_delete: + self.nodes.pop(nid, None) + + def touches(label: str, key: str, val: PropertyValue) -> bool: + return any( + nid[0] == label and props.get(key) == val + for nid, props in deleted_props.items() + ) + + self.rels = { + (fl, fk, fv, rt, tl, tk, tv) + for (fl, fk, fv, rt, tl, tk, tv) in self.rels + if not touches(fl, fk, fv) and not touches(tl, tk, tv) + } + + # (H) comparison + def snapshot(self) -> tuple[frozenset[NodeId], frozenset[RelTuple]]: + return frozenset(self.nodes.keys()), frozenset(self.rels) + + +NODE_UNIQUE_KEYS = cs.NODE_UNIQUE_CONSTRAINTS + + +def _write_tree(root: Path, new_name: str) -> None: + 
(root / "__init__.py").touch() + (root / "a.py").write_text(f"def {new_name}():\n return 1\n") + (root / "b.py").write_text( + f"from .a import {new_name}\n\n\ndef caller():\n return {new_name}()\n" + ) + + +def _make_updater(root: Path, ingestor: InMemoryGraph) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=ingestor, + repo_path=root, + parsers=parsers, + queries=queries, + project_name=PROJECT_NAME, + ) + + +class TestIncrementalRenameStaleEntities: + def test_incremental_rename_matches_full_rebuild(self, tmp_path: Path) -> None: + # (H) Golden: a fresh full rebuild of the already-renamed tree. + golden_root = tmp_path / "golden" + golden_root.mkdir() + _write_tree(golden_root, "new_name") + golden_graph = InMemoryGraph() + _make_updater(golden_root, golden_graph).run(force=True) + + # (H) Sanity: golden truly contains the renamed symbol and not the old one. + golden_funcs = { + uid for (label, uid) in golden_graph.nodes if label == cs.NodeLabel.FUNCTION + } + assert any(str(qn).endswith(".new_name") for qn in golden_funcs) + assert not any(str(qn).endswith(".old_name") for qn in golden_funcs) + + # (H) Incremental: build original tree, then rename across both files + # (H) and rebuild incrementally (force=False). + incr_root = tmp_path / "incr" + incr_root.mkdir() + _write_tree(incr_root, "old_name") + incr_graph = InMemoryGraph() + _make_updater(incr_root, incr_graph).run(force=True) + + _write_tree(incr_root, "new_name") + _make_updater(incr_root, incr_graph).run(force=False) + + # (H) The stale old_name Function and its edges must be gone. 
+ incr_nodes, incr_rels = incr_graph.snapshot() + golden_nodes, golden_rels = golden_graph.snapshot() + + assert incr_nodes == golden_nodes, { + "stale_extra_nodes": sorted(map(str, incr_nodes - golden_nodes)), + "missing_nodes": sorted(map(str, golden_nodes - incr_nodes)), + } + assert incr_rels == golden_rels, { + "stale_extra_rels": sorted(map(str, incr_rels - golden_rels)), + "missing_rels": sorted(map(str, golden_rels - incr_rels)), + } + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/codebase_rag/tests/test_graph_updater_pruning.py b/codebase_rag/tests/test_graph_updater_pruning.py new file mode 100644 index 000000000..a8d5419cc --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_pruning.py @@ -0,0 +1,369 @@ +# (H) Tests for orphan node pruning in GraphUpdater._prune_orphan_nodes +# (H) and Cypher deletion in _process_files for hash-cache-detected deletions. +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers + + +@pytest.fixture +def updater(temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def py_project(temp_repo: Path) -> Path: + (temp_repo / "__init__.py").touch() + (temp_repo / "module_a.py").write_text("def func_a():\n pass\n") + (temp_repo / "module_b.py").write_text("def func_b():\n pass\n") + sub = temp_repo / "subpkg" + sub.mkdir() + (sub / "__init__.py").touch() + (sub / "inner.py").write_text("def inner_func():\n pass\n") + return temp_repo + + +class TestPruneOrphanNodes: + def test_prune_removes_orphan_module_nodes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + 
ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + project_name = py_project.resolve().name + + mock_ingestor.fetch_all.side_effect = [ + [], + [ + { + "path": "old_project/main.py", + "qualified_name": f"{project_name}.old_project.main", + }, + { + "path": "module_a.py", + "qualified_name": f"{project_name}.module_a", + }, + ], + [], + ] + updater._prune_orphan_nodes() + + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + assert len(delete_calls) == 1 + assert delete_calls[0].args[1] == {cs.KEY_PATH: "old_project/main.py"} + + def test_prune_removes_orphan_external_module_nodes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [[], [], []] + updater._prune_orphan_nodes() + + external_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_ORPHAN_EXTERNAL_MODULES + ] + assert len(external_calls) == 1 + + def test_prune_skips_other_projects( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [{"path": "app.py", "absolute_path": "/other/project/app.py"}], + [{"path": "app.py", "qualified_name": "other_project.app"}], + [{"path": "data", "absolute_path": "/other/project/data"}], + ] + updater._prune_orphan_nodes() + + path_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] + in (cs.CYPHER_DELETE_FILE, cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FOLDER) + ] + assert path_deletes == [] + + def test_prune_no_orphans_skips_deletes( + self, py_project: Path, 
mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + repo_abs = py_project.resolve().as_posix() + mock_ingestor.fetch_all.side_effect = [ + [{"path": "module_a.py", "absolute_path": f"{repo_abs}/module_a.py"}], + [{"path": "module_a.py", "qualified_name": f"{project_name}.module_a"}], + [{"path": "subpkg", "absolute_path": f"{repo_abs}/subpkg"}], + ] + updater._prune_orphan_nodes() + + path_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] + in (cs.CYPHER_DELETE_FILE, cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FOLDER) + ] + assert path_deletes == [] + + def test_prune_handles_empty_graph( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [[], [], []] + updater._prune_orphan_nodes() + + path_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] + in (cs.CYPHER_DELETE_FILE, cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FOLDER) + ] + assert path_deletes == [] + + def test_prune_handles_none_path_gracefully( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + mock_ingestor.fetch_all.side_effect = [ + [{"path": None, "absolute_path": None}], + [ + {"path": None, "qualified_name": f"{project_name}.something"}, + {"path": "module_a.py", "qualified_name": f"{project_name}.module_a"}, + ], + [], + ] + updater._prune_orphan_nodes() + + path_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if 
c.args[0] + in (cs.CYPHER_DELETE_FILE, cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FOLDER) + ] + assert path_deletes == [] + + def test_prune_multiple_orphans_across_types( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + repo_abs = py_project.resolve().as_posix() + mock_ingestor.fetch_all.side_effect = [ + [ + {"path": "gone.py", "absolute_path": f"{repo_abs}/gone.py"}, + {"path": "module_a.py", "absolute_path": f"{repo_abs}/module_a.py"}, + ], + [ + { + "path": "deleted.py", + "qualified_name": f"{project_name}.deleted", + }, + { + "path": "module_a.py", + "qualified_name": f"{project_name}.module_a", + }, + ], + [ + {"path": "old_dir", "absolute_path": f"{repo_abs}/old_dir"}, + {"path": "subpkg", "absolute_path": f"{repo_abs}/subpkg"}, + ], + ] + updater._prune_orphan_nodes() + + path_deletes = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] + in (cs.CYPHER_DELETE_FILE, cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FOLDER) + ] + assert len(path_deletes) == 3 + + def test_prune_skips_inline_module_synthetic_paths( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + inline_path_tests = f"{cs.INLINE_MODULE_PATH_PREFIX}tests" + inline_path_macos = f"{cs.INLINE_MODULE_PATH_PREFIX}macos" + mock_ingestor.fetch_all.side_effect = [ + [], + [ + { + "path": inline_path_tests, + "qualified_name": f"{project_name}.src.app.tests", + }, + { + "path": inline_path_tests, + "qualified_name": f"{project_name}.src.cli.tests", + }, + { + "path": inline_path_macos, + "qualified_name": f"{project_name}.src.clipboard.macos", + }, + ], + [], + ] 
+ updater._prune_orphan_nodes() + + delete_module_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + assert delete_module_calls == [] + + +class TestCypherDeleteModuleQuery: + def test_query_does_not_traverse_calls_edges(self) -> None: + query = cs.CYPHER_DELETE_MODULE + assert "-[*0..]->" not in query + assert "-[*]->" not in query + + def test_query_constrains_traversal_to_containment_edges(self) -> None: + query = cs.CYPHER_DELETE_MODULE + assert "DEFINES" in query + assert "CALLS" not in query + assert "IMPORTS" not in query + assert "INHERITS" not in query + + +class TestDeletedFileInProcessFiles: + def test_deleted_file_triggers_cypher_delete( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run(force=True) + mock_ingestor.execute_write.reset_mock() + + (py_project / "module_b.py").unlink() + updater.run(force=False) + + delete_module_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + delete_file_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + ] + assert len(delete_module_calls) >= 1 + assert len(delete_file_calls) >= 1 + + def test_no_deletes_when_no_files_removed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run(force=True) + mock_ingestor.execute_write.reset_mock() + + updater.run(force=False) + + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] in (cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FILE) + ] + assert len(delete_calls) == 0 + + 
@patch("codebase_rag.graph_updater.GraphUpdater._prune_orphan_nodes") + def test_run_calls_prune( + self, + mock_prune: MagicMock, + py_project: Path, + mock_ingestor: MagicMock, + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run(force=True) + mock_prune.assert_called_once() diff --git a/codebase_rag/tests/test_handler_registry.py b/codebase_rag/tests/test_handler_registry.py index 2a9215755..6b7259f18 100644 --- a/codebase_rag/tests/test_handler_registry.py +++ b/codebase_rag/tests/test_handler_registry.py @@ -9,6 +9,7 @@ from codebase_rag.parsers.handlers.java import JavaHandler from codebase_rag.parsers.handlers.js_ts import JsTsHandler from codebase_rag.parsers.handlers.lua import LuaHandler +from codebase_rag.parsers.handlers.php import PhpHandler from codebase_rag.parsers.handlers.python import PythonHandler from codebase_rag.parsers.handlers.rust import RustHandler @@ -47,8 +48,12 @@ def test_returns_base_handler_for_go(self) -> None: assert isinstance(handler, BaseLanguageHandler) assert type(handler) is BaseLanguageHandler - def test_returns_base_handler_for_php(self) -> None: + def test_returns_php_handler_for_php(self) -> None: handler = get_handler(SupportedLanguage.PHP) + assert isinstance(handler, PhpHandler) + + def test_returns_base_handler_for_c(self) -> None: + handler = get_handler(SupportedLanguage.C) assert isinstance(handler, BaseLanguageHandler) assert type(handler) is BaseLanguageHandler @@ -84,6 +89,7 @@ class TestHandlerProtocol: SupportedLanguage.PYTHON, SupportedLanguage.GO, SupportedLanguage.PHP, + SupportedLanguage.C, ], ) def test_handler_has_all_protocol_methods( @@ -114,6 +120,8 @@ def test_handler_has_all_protocol_methods( SupportedLanguage.JAVA, SupportedLanguage.LUA, SupportedLanguage.PYTHON, + SupportedLanguage.PHP, + SupportedLanguage.C, ], ) def test_handler_methods_are_callable(self, language: 
SupportedLanguage) -> None: @@ -151,3 +159,6 @@ def test_lua_handler_extends_base(self) -> None: def test_python_handler_extends_base(self) -> None: assert issubclass(PythonHandler, BaseLanguageHandler) + + def test_php_handler_extends_base(self) -> None: + assert issubclass(PhpHandler, BaseLanguageHandler) diff --git a/codebase_rag/tests/test_handlers_unit.py b/codebase_rag/tests/test_handlers_unit.py index a9391ecde..f34d42d86 100644 --- a/codebase_rag/tests/test_handlers_unit.py +++ b/codebase_rag/tests/test_handlers_unit.py @@ -13,6 +13,7 @@ from codebase_rag.parsers.handlers.java import JavaHandler from codebase_rag.parsers.handlers.js_ts import JsTsHandler from codebase_rag.parsers.handlers.lua import LuaHandler +from codebase_rag.parsers.handlers.php import PhpHandler from codebase_rag.parsers.handlers.python import PythonHandler from codebase_rag.parsers.handlers.rust import RustHandler from codebase_rag.tests.conftest import create_mock_node @@ -62,6 +63,13 @@ except ImportError: LUA_AVAILABLE = False +try: + import tree_sitter_php as tsphp + + PHP_AVAILABLE = True +except ImportError: + PHP_AVAILABLE = False + @pytest.fixture def js_parser() -> Parser | None: @@ -111,6 +119,14 @@ def lua_parser() -> Parser | None: return Parser(language) +@pytest.fixture +def php_parser() -> Parser | None: + if not PHP_AVAILABLE: + return None + language = Language(tsphp.language_php()) + return Parser(language) + + class TestBaseLanguageHandler: def test_is_inside_method_with_object_literals_returns_false(self) -> None: handler = BaseLanguageHandler() @@ -1105,3 +1121,168 @@ def test_extract_decorators_dataclass_with_options( result = handler.extract_decorators(class_node) assert result == ["@dataclass(frozen=True, slots=True)"] + + +def _find_php_node(root: ASTNode, node_type: str) -> ASTNode | None: + if root.type == node_type: + return root + for child in root.children: + if result := _find_php_node(child, node_type): + return result + return None + + 
+@pytest.mark.skipif(not PHP_AVAILABLE, reason="tree-sitter-php not available") +class TestPhpHandler: + def test_extract_function_name_from_function_definition( + self, php_parser: Parser + ) -> None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" 2;" + tree = php_parser.parse(code) + arrow_node = _find_php_node(tree.root_node, cs.TS_PHP_ARROW_FUNCTION) + assert arrow_node is not None + + result = handler.extract_function_name(arrow_node) + assert result is not None + assert result.startswith("arrow_") + + def test_is_class_method_inside_class(self, php_parser: Parser) -> None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b' None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b' int: + return 1 + + def keyfn(self) -> int: + return 2 +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> 
set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestHigherOrderCalls: + def test_callable_parameter_resolves_to_argument_at_call_site( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.apply_cb", "proj.m.helper") in calls, calls + + def test_callback_attributed_to_invoking_callee_not_caller( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + # (H) driver passes helper but never invokes it; apply_cb does. + assert ("proj.m.driver", "proj.m.helper") not in calls, calls + + def test_callable_parameter_prefers_module_function_over_sibling_method( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.apply_cb", "proj.m.Other.helper") not in calls, calls + + def test_sorted_key_attributed_to_enclosing_function(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.do_sort", "proj.m.keyfn") in calls, calls + + def test_normal_call_edge_to_callee_still_present(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.driver", "proj.m.apply_cb") in calls, calls diff --git a/codebase_rag/tests/test_image_paths.py b/codebase_rag/tests/test_image_paths.py deleted file mode 100644 index 8daeba0db..000000000 --- a/codebase_rag/tests/test_image_paths.py +++ /dev/null @@ -1,160 +0,0 @@ -from pathlib import Path - -import pytest - -from codebase_rag.main import ( - _find_image_paths, - _get_path_variants, - _handle_chat_images, - _replace_path_in_question, -) - - -class TestFindImagePaths: - def test_finds_png_path(self) -> None: - question = "What is in this image /home/user/screenshot.png please analyze" - result = _find_image_paths(question) 
- assert result == [Path("/home/user/screenshot.png")] - - def test_finds_jpg_path(self) -> None: - question = "Look at /tmp/photo.jpg" - result = _find_image_paths(question) - assert result == [Path("/tmp/photo.jpg")] - - def test_finds_jpeg_path(self) -> None: - question = "Check /var/images/pic.jpeg" - result = _find_image_paths(question) - assert result == [Path("/var/images/pic.jpeg")] - - def test_finds_gif_path(self) -> None: - question = "Analyze /home/user/animation.gif" - result = _find_image_paths(question) - assert result == [Path("/home/user/animation.gif")] - - def test_finds_multiple_images(self) -> None: - question = "Compare /img/a.png and /img/b.jpg" - result = _find_image_paths(question) - assert result == [Path("/img/a.png"), Path("/img/b.jpg")] - - def test_case_insensitive_extension(self) -> None: - question = "Look at /path/IMAGE.PNG and /path/photo.JPG" - result = _find_image_paths(question) - assert len(result) == 2 - assert Path("/path/IMAGE.PNG") in result - assert Path("/path/photo.JPG") in result - - def test_ignores_relative_paths(self) -> None: - question = "Check images/photo.png and ./local/pic.jpg" - result = _find_image_paths(question) - assert result == [] - - def test_ignores_non_image_extensions(self) -> None: - question = "Look at /path/document.pdf and /path/code.py" - result = _find_image_paths(question) - assert result == [] - - def test_empty_question(self) -> None: - result = _find_image_paths("") - assert result == [] - - def test_no_paths(self) -> None: - question = "What is the meaning of life?" 
- result = _find_image_paths(question) - assert result == [] - - def test_handles_quoted_paths(self) -> None: - question = 'Look at "/path/with spaces/image.png"' - result = _find_image_paths(question) - assert result == [Path("/path/with spaces/image.png")] - - -class TestGetPathVariants: - def test_returns_four_variants(self) -> None: - result = _get_path_variants("/path/to/file.png") - assert len(result) == 4 - - def test_includes_escaped_spaces(self) -> None: - result = _get_path_variants("/path/with spaces/file.png") - assert r"/path/with\ spaces/file.png" in result - - def test_includes_single_quoted(self) -> None: - result = _get_path_variants("/path/to/file.png") - assert "'/path/to/file.png'" in result - - def test_includes_double_quoted(self) -> None: - result = _get_path_variants("/path/to/file.png") - assert '"/path/to/file.png"' in result - - def test_includes_original(self) -> None: - path = "/path/to/file.png" - result = _get_path_variants(path) - assert path in result - - -class TestReplacePathInQuestion: - def test_replaces_simple_path(self) -> None: - question = "Look at /old/path.png please" - result = _replace_path_in_question(question, "/old/path.png", "/new/path.png") - assert result == "Look at /new/path.png please" - - def test_replaces_quoted_path(self) -> None: - question = "Look at '/old/path.png' please" - result = _replace_path_in_question(question, "/old/path.png", "/new/path.png") - assert result == "Look at '/new/path.png' please" - - def test_replaces_double_quoted_path(self) -> None: - question = 'Look at "/old/path.png" please' - result = _replace_path_in_question(question, "/old/path.png", "/new/path.png") - assert result == 'Look at "/new/path.png" please' - - def test_returns_original_if_not_found(self) -> None: - question = "No path here" - result = _replace_path_in_question(question, "/missing.png", "/new.png") - assert result == question - - -class TestHandleChatImages: - @pytest.fixture - def temp_project(self, tmp_path: 
Path) -> Path: - return tmp_path - - @pytest.fixture - def temp_image(self, tmp_path: Path) -> Path: - img_path = tmp_path / "test_image.png" - img_path.write_bytes(b"fake png content") - return img_path - - def test_no_images_returns_unchanged(self, temp_project: Path) -> None: - question = "What is 2 + 2?" - result = _handle_chat_images(question, temp_project) - assert result == question - - def test_copies_image_to_tmp(self, temp_project: Path, temp_image: Path) -> None: - question = f"Look at {temp_image}" - result = _handle_chat_images(question, temp_project) - - assert ".tmp" in result - assert "test_image.png" in result - - tmp_dir = temp_project / ".tmp" - assert tmp_dir.exists() - copied_files = list(tmp_dir.glob("*test_image.png")) - assert len(copied_files) == 1 - - def test_handles_nonexistent_image(self, temp_project: Path) -> None: - question = "Look at /nonexistent/image.png" - result = _handle_chat_images(question, temp_project) - assert result == question - - def test_handles_multiple_images(self, temp_project: Path) -> None: - img1 = temp_project / "img1.png" - img2 = temp_project / "img2.jpg" - img1.write_bytes(b"png1") - img2.write_bytes(b"jpg2") - - question = f"Compare {img1} and {img2}" - result = _handle_chat_images(question, temp_project) - - assert ".tmp" in result - assert "img1.png" in result - assert "img2.jpg" in result diff --git a/codebase_rag/tests/test_import_parsing.py b/codebase_rag/tests/test_import_parsing.py index 318b146e3..2091d4195 100644 --- a/codebase_rag/tests/test_import_parsing.py +++ b/codebase_rag/tests/test_import_parsing.py @@ -475,3 +475,103 @@ def test_internal_import_matched_with_dot_separator( assert result == "myapp.utils.Helper" assert len(mock_ingestor.nodes_created) == 0 + + +class TestIsLocalModuleCache: + def test_is_local_module_cache_returns_correct_result(self, tmp_path: Path) -> None: + (tmp_path / "utils").mkdir() + (tmp_path / "utils" / "__init__.py").touch() + + processor = ImportProcessor( + 
repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + assert processor._is_local_module("utils") is True + assert processor._is_local_module("nonexistent") is False + + def test_is_local_module_cache_hits_on_repeated_calls(self, tmp_path: Path) -> None: + (tmp_path / "models").mkdir() + (tmp_path / "models" / "__init__.py").touch() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + processor._is_local_module("models") + processor._is_local_module("models") + processor._is_local_module("models") + + info = processor._is_local_module_cached.cache_info() + assert info.hits >= 2 + assert info.misses == 1 + + def test_is_local_module_detects_py_file(self, tmp_path: Path) -> None: + (tmp_path / "helpers.py").touch() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + assert processor._is_local_module("helpers") is True + + def test_is_local_module_detects_directory(self, tmp_path: Path) -> None: + (tmp_path / "services").mkdir() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + assert processor._is_local_module("services") is True + + def test_is_local_java_import_cache_hits(self, tmp_path: Path) -> None: + (tmp_path / "com").mkdir() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + processor._is_local_java_import("com.example.Service") + processor._is_local_java_import("com.example.Service") + processor._is_local_java_import("com.example.Service") + + info = processor._is_local_java_import_cached.cache_info() + assert info.hits >= 2 + assert info.misses == 1 + + def test_separate_instances_have_independent_caches(self, tmp_path: Path) -> None: + (tmp_path / "shared").mkdir() + + p1 = 
ImportProcessor( + repo_path=tmp_path, + project_name="project1", + ingestor=None, + function_registry=None, + ) + p2 = ImportProcessor( + repo_path=tmp_path, + project_name="project2", + ingestor=None, + function_registry=None, + ) + + p1._is_local_module("shared") + p1._is_local_module("shared") + + info2 = p2._is_local_module_cached.cache_info() + assert info2.hits == 0 + assert info2.misses == 0 diff --git a/codebase_rag/tests/test_inherits_attribute_base.py b/codebase_rag/tests/test_inherits_attribute_base.py new file mode 100644 index 000000000..f057758cf --- /dev/null +++ b/codebase_rag/tests/test_inherits_attribute_base.py @@ -0,0 +1,85 @@ +# (H) L2 finding from the evals/ harness: cgr captured INHERITS for direct-name +# (H) bases (class C(Base)) but dropped attribute-style bases (class C(mod.Base), +# (H) e.g. class UniXcoder(nn.Module)). Those inheritance edges must be captured. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "inhproj" + +MODULE_SRC = """from collections import abc + + +class C(abc.Mapping): + pass +""" + +_RelTuple = tuple[str, PropertyValue, str, str, PropertyValue] + + +class _Capture: + def __init__(self) -> None: + self.nodes: dict[tuple[str, PropertyValue], PropertyDict] = {} + self.rels: list[_RelTuple] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + uid = properties[cs.NODE_UNIQUE_CONSTRAINTS[label]] + self.nodes[(str(label), uid)] = dict(properties) + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append( + ( + str(from_spec[0]), + from_spec[2], + str(rel_type), + 
str(to_spec[0]), + to_spec[2], + ) + ) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _build(tmp_path: Path) -> _Capture: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return cap + + +class TestInheritsAttributeBase: + def test_attribute_base_class_creates_inherits_edge(self, tmp_path: Path) -> None: + cap = _build(tmp_path) + targets = [ + str(target).rsplit(cs.SEPARATOR_DOT, 1)[-1] + for (_fl, from_val, rel_type, _tl, target) in cap.rels + if rel_type == cs.RelationshipType.INHERITS and str(from_val).endswith(".C") + ] + assert targets == ["Mapping"], targets diff --git a/codebase_rag/tests/test_instance_attr_type_inference.py b/codebase_rag/tests/test_instance_attr_type_inference.py new file mode 100644 index 000000000..d28e30319 --- /dev/null +++ b/codebase_rag/tests/test_instance_attr_type_inference.py @@ -0,0 +1,111 @@ +# (H) L3 finding from the evals/ harness: a method calls self.attr.method(), but the +# (H) type of self.attr is only knowable from the __init__ assignment in the same +# (H) class. cgr scanned only the calling method for self-assignments, so the type +# (H) was unknown and an ambiguous bare name resolved to the wrong global. Instance +# (H) attributes assigned in __init__ must be visible to every method of the class. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +MODULE_SRC = """def run() -> str: + return "global" + + +def status() -> str: + return "globalprop" + + +class Helper: + def run(self) -> str: + return "real" + + @property + def status(self) -> str: + return "ok" + + +class App: + def __init__(self) -> None: + self.helper = Helper() + + def go(self) -> str: + return self.helper.run() + + def check(self) -> str: + return self.helper.status +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestInstanceAttrTypeInference: + def test_method_call_resolves_via_init_attribute_type(self, tmp_path: Path) -> None: + calls = 
_calls(tmp_path) + assert ("proj.m.App.go", "proj.m.Helper.run") in calls, calls + + def test_ambiguous_method_does_not_resolve_to_module_function( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.App.go", "proj.m.run") not in calls, calls + + def test_property_access_resolves_via_init_attribute_type( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.App.check", "proj.m.Helper.status") in calls, calls + + def test_property_access_not_resolved_to_module_function( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.App.check", "proj.m.status") not in calls, calls diff --git a/codebase_rag/tests/test_interprocedural_callback_flow.py b/codebase_rag/tests/test_interprocedural_callback_flow.py new file mode 100644 index 000000000..6369ad22a --- /dev/null +++ b/codebase_rag/tests/test_interprocedural_callback_flow.py @@ -0,0 +1,94 @@ +# (H) L3 finding from the evals/ harness: extract_java_interface_names invokes a +# (H) resolve_to_qn callback that is threaded through extract_implemented_interfaces from +# (H) a caller that passes self._resolve_to_qn. The concrete callable is bound at the +# (H) outer call site and flows through pass-through parameters to where it is finally +# (H) invoked, so resolving the edge needs inter-procedural callback propagation. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + # (H) extract_names invokes the callback; extract_interfaces only passes it through. 
+ "pkg/extract.py": ( + "def extract_names(node, out, scope, resolve_to_qn):\n" + ' out.append(resolve_to_qn("x", scope))\n\n\n' + "def extract_interfaces(node, scope, resolve_to_qn):\n" + " out = []\n" + " extract_names(node, out, scope, resolve_to_qn)\n" + " return out\n" + ), + "pkg/driver.py": ( + "from .extract import extract_interfaces\n\n\n" + "class Driver:\n" + " def resolve(self, name, scope):\n" + " return name\n\n" + " def run(self, node):\n" + ' return extract_interfaces(node, "s", self.resolve)\n' + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestInterproceduralCallbackFlow: + def test_callback_propagates_through_passthrough_param( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.extract.extract_names", + "proj.pkg.driver.Driver.resolve", + ) in 
calls, calls diff --git a/codebase_rag/tests/test_java_containment_oracle.py b/codebase_rag/tests/test_java_containment_oracle.py new file mode 100644 index 000000000..297e7ffea --- /dev/null +++ b/codebase_rag/tests/test_java_containment_oracle.py @@ -0,0 +1,70 @@ +# (H) Covers Java containment-edge validation: cgr's DEFINES (file module -> +# (H) every named type, including nested) and DEFINES_METHOD (class/interface/ +# (H) enum -> method) edges are graded against the independent JDK Compiler Tree +# (H) API oracle, joined on (kind, file, line). Exercises an interface method, an +# (H) enum method, and a nested class (cgr keeps type containment flat). +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_java_graph +from evals.oracles import java_available, run_java_oracle +from evals.score import score_edge_types + +JAVA_SRC = """\ +package demo; + +public interface Shape { + double area(); +} + +public enum Color { + RED, GREEN; + public int rank() { return 1; } +} + +public class Point implements Shape { + private int x; + public double area() { return 1.0; } + + public static class Inner { + public void helper() {} + } +} +""" + + +def _require_java() -> None: + if not java_available(): + pytest.skip("java toolchain not available") + if cs.SupportedLanguage.JAVA not in load_parsers()[0]: + pytest.skip("java parser not available") + + +def test_cgr_matches_jdk_oracle_on_containment_edges(tmp_path: Path) -> None: + _require_java() + project = tmp_path / "java_edge_test" + project.mkdir() + (project / "Demo.java").write_text(JAVA_SRC, encoding="utf-8") + + cgr = extract_cgr_java_graph(project, project.name) + oracle = run_java_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + 
for label in ( + cs.RelationshipType.DEFINES.value, + cs.RelationshipType.DEFINES_METHOD.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_java_field_access_chains.py b/codebase_rag/tests/test_java_field_access_chains.py new file mode 100644 index 000000000..55a1d0791 --- /dev/null +++ b/codebase_rag/tests/test_java_field_access_chains.py @@ -0,0 +1,389 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import tree_sitter_java as tsjava +from tree_sitter import Language, Node, Parser + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.parsers.java.utils import extract_class_info +from codebase_rag.tests.conftest import get_relationships + + +def _call_targets(mock_ingestor: MagicMock) -> set[str]: + return {c.args[2][2] for c in get_relationships(mock_ingestor, "CALLS")} + + +def _class_node(java_source: str) -> Node: + tree = Parser(Language(tsjava.language())).parse(java_source.encode()) + + def walk(node: Node) -> Node | None: + if node.type == "class_declaration": + return node + for child in node.children: + if found := walk(child): + return found + return None + + found = walk(tree.root_node) + assert found is not None + return found + + +def _run(project_path: Path, mock_ingestor: MagicMock) -> None: + parsers, queries = load_parsers() + GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ).run() + + +def test_mixed_field_access_then_method_resolves( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class Engine { public void start() { System.out.println("started"); } } 
+class Car { public Engine engine = new Engine(); } +public class Main { + public static void main(String[] args) { + Car obj = new Car(); + obj.engine.start(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".Engine.start()") for t in targets), ( + f"obj.engine.start() should resolve to Engine.start(); got {sorted(targets)}" + ) + + +def test_multilevel_field_access_then_method_resolves( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class User { public Address address = new Address(); } +public class Main { + public static void main(String[] args) { + User obj = new User(); + obj.address.city.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"obj.address.city.ping() should resolve to City.ping(); got {sorted(targets)}" + ) + + +def test_nested_field_access_type_inference_via_var( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class User { public Address address = new Address(); } +public class Main { + public static void main(String[] args) { + User obj = new User(); + var c = obj.address.city; + c.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"var c = obj.address.city; c.ping() should resolve to City.ping(); " + f"got 
{sorted(targets)}" + ) + + +def test_this_rooted_nested_field_access_via_var( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +public class Container { + public Address address = new Address(); + public void run() { + var c = this.address.city; + c.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"var c = this.address.city; c.ping() should resolve to City.ping(); " + f"got {sorted(targets)}" + ) + + +def test_super_rooted_nested_field_access_via_var( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class Base { public Address address = new Address(); } +public class Derived extends Base { + public void run() { + var c = super.address.city; + c.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"var c = super.address.city; c.ping() should resolve to City.ping(); " + f"got {sorted(targets)}" + ) + + +def test_inherited_field_chain_via_this( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class Base { public Address address = new Address(); } +public class Derived extends Base 
{ + public void run() { + var c = this.address.city; + c.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"this.address.city (address inherited from Base) should resolve to " + f"City.ping(); got {sorted(targets)}" + ) + + +def test_inherited_field_chain_via_object( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class Base { public Address address = new Address(); } +class Derived extends Base {} +public class Main { + public static void main(String[] args) { + Derived obj = new Derived(); + obj.address.city.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"obj.address.city (address inherited from Base) should resolve to " + f"City.ping(); got {sorted(targets)}" + ) + + +def test_direct_this_field_chain_method_call_multiclass( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class Aardvark { public void unused() {} } +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +public class Container { + public Address address = new Address(); + public void run() { + this.address.city.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"direct this.address.city.ping() in a multi-class file should resolve to " + f"City.ping(); got 
{sorted(targets)}" + ) + + +def test_direct_super_field_chain_method_call_multiclass( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class Aardvark { public void unused() {} } +class Other {} +class Wrong extends Other { public void unused() {} } +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class Base { public Address address = new Address(); } +public class Derived extends Base { + public void run() { + super.address.city.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"direct super.address.city.ping() in a multi-class file should resolve to " + f"City.ping(); got {sorted(targets)}" + ) + + +def test_scoped_superclass_extraction_keeps_actual_class() -> None: + nested = extract_class_info(_class_node("class Child extends Outer.Base {}")) + assert nested.get("superclass") == "Outer.Base", ( + f"scoped superclass should keep the full name, not the outer/package " + f"segment; got {nested.get('superclass')!r}" + ) + + qualified = extract_class_info(_class_node("class Child extends pkg.Base {}")) + assert qualified.get("superclass") == "pkg.Base", ( + f"package-qualified superclass should keep the full name; " + f"got {qualified.get('superclass')!r}" + ) + + simple = extract_class_info(_class_node("class Child extends Base {}")) + assert simple.get("superclass") == "Base" + + +def test_inherited_field_chain_via_nested_superclass( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class Outer { 
+ static class Base { public Address address = new Address(); } +} +public class Child extends Outer.Base { + public void run() { + this.address.city.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"this.address.city with a same-file nested superclass (Outer.Base) should " + f"resolve to City.ping(); got {sorted(targets)}" + ) + + +def test_super_rooted_chain_with_nested_superclass( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "proj" + (project / "src").mkdir(parents=True) + (project / "src" / "Main.java").write_text( + """ +class City { public void ping() { System.out.println("ping"); } } +class Address { public City city = new City(); } +class Outer { + static class Base { public Address address = new Address(); } +} +public class Child extends Outer.Base { + public void run() { + var c = super.address.city; + c.ping(); + } +} +""", + encoding="utf-8", + ) + _run(project, mock_ingestor) + + targets = _call_targets(mock_ingestor) + assert any(t.endswith(".City.ping()") for t in targets), ( + f"super.address.city with a nested superclass (Outer.Base) should resolve to " + f"City.ping(); got {sorted(targets)}" + ) + + +def test_generic_scoped_superclass_extraction() -> None: + generic_scoped = extract_class_info( + _class_node("class Child extends Outer.Base {}") + ) + assert generic_scoped.get("superclass") == "Outer.Base", ( + f"generic scoped superclass should extract the base name; " + f"got {generic_scoped.get('superclass')!r}" + ) + + generic_simple = extract_class_info( + _class_node("class Child extends Box {}") + ) + assert generic_simple.get("superclass") == "Box", ( + f"generic superclass should extract the base name; " + f"got {generic_simple.get('superclass')!r}" + ) diff --git a/codebase_rag/tests/test_java_inheritance_edges.py b/codebase_rag/tests/test_java_inheritance_edges.py new 
file mode 100644 index 000000000..9c293833f --- /dev/null +++ b/codebase_rag/tests/test_java_inheritance_edges.py @@ -0,0 +1,59 @@ +# (H) Java inheritance edges. cgr captured a class's `extends`/`implements` but +# (H) missed two cases: an interface's `extends` superinterfaces (-> INHERITS) +# (H) and an enum's `implements` interfaces (-> IMPLEMENTS). Both clauses carry a +# (H) type_list of interface names that were never extracted. +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import RelationshipType +from codebase_rag.tests.conftest import create_and_run_updater, get_relationships + +_JAVA = """\ +package demo; + +public interface A {} +public interface B {} +public interface Big extends A, B {} + +abstract class Base {} +enum Color implements A { RED } + +class Circle extends Base implements A, B {} + +class Holder extends Box implements Comparable {} +""" + + +def _pairs(mock_ingestor: MagicMock, rel: str) -> set[tuple[str, str]]: + # (H) (source_qn, target_qn) for the given relationship. + return { + (call[0][0][2], call[0][2][2]) for call in get_relationships(mock_ingestor, rel) + } + + +def test_java_inheritance_and_implements_edges( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "java_inh" + project.mkdir() + (project / "Demo.java").write_text(_JAVA, encoding="utf-8") + create_and_run_updater(project, mock_ingestor, skip_if_missing="java") + + inherits = _pairs(mock_ingestor, RelationshipType.INHERITS.value) + implements = _pairs(mock_ingestor, RelationshipType.IMPLEMENTS.value) + base = "java_inh.Demo" + + # (H) Interface extends -> INHERITS to each superinterface. + assert (f"{base}.Big", f"{base}.A") in inherits, inherits + assert (f"{base}.Big", f"{base}.B") in inherits, inherits + # (H) Enum implements -> IMPLEMENTS. 
+ assert (f"{base}.Color", f"{base}.A") in implements, implements + # (H) Class extends/implements (already worked) stay intact. + assert (f"{base}.Circle", f"{base}.Base") in inherits, inherits + assert (f"{base}.Circle", f"{base}.A") in implements, implements + assert (f"{base}.Circle", f"{base}.B") in implements, implements + # (H) Generic (parameterized) bases must be captured by their base type. + assert (f"{base}.Holder", f"{base}.Box") in inherits, inherits + assert (f"{base}.Holder", f"{base}.Comparable") in implements, implements diff --git a/codebase_rag/tests/test_java_inheritance_oracle.py b/codebase_rag/tests/test_java_inheritance_oracle.py new file mode 100644 index 000000000..65b8c2f42 --- /dev/null +++ b/codebase_rag/tests/test_java_inheritance_oracle.py @@ -0,0 +1,59 @@ +# (H) Covers Java inheritance-edge validation: cgr's INHERITS (class/interface +# (H) extends) and IMPLEMENTS (class/enum implements) edges are graded against the +# (H) JDK Compiler Tree API oracle, by (source node, base SIMPLE NAME). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_java_graph +from evals.oracles import java_available, run_java_oracle +from evals.score import score_name_edge_types + +JAVA_SRC = """\ +package demo; + +public interface A {} +public interface B {} +public interface Big extends A, B {} + +abstract class Base {} +enum Color implements A { RED } + +class Circle extends Base implements A, B {} +""" + + +def _require_java() -> None: + if not java_available(): + pytest.skip("java toolchain not available") + if cs.SupportedLanguage.JAVA not in load_parsers()[0]: + pytest.skip("java parser not available") + + +def test_cgr_matches_jdk_oracle_on_inheritance_edges(tmp_path: Path) -> None: + _require_java() + project = tmp_path / "java_inh_edge" + project.mkdir() + (project / "Demo.java").write_text(JAVA_SRC, encoding="utf-8") + + cgr = extract_cgr_java_graph(project, project.name) + oracle = run_java_oracle(project) + + result = score_name_edge_types(cgr, oracle, ec.INHERITANCE_NAME_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.INHERITS.value, + cs.RelationshipType.IMPLEMENTS.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_java_label_name_collision.py b/codebase_rag/tests/test_java_label_name_collision.py new file mode 100644 index 000000000..c43702119 --- /dev/null +++ b/codebase_rag/tests/test_java_label_name_collision.py @@ -0,0 +1,314 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.constants import NODE_UNIQUE_CONSTRAINTS, NodeLabel +from codebase_rag.tests.conftest import ( + 
get_node_names, + get_nodes, + get_qualified_names, + get_relationships, + run_updater, +) +from codebase_rag.types_defs import NodeType + + +@pytest.fixture +def java_label_collision_project(temp_repo: Path) -> Path: + project_path = temp_repo / "java_label_collision" + project_path.mkdir() + src = project_path / "src" / "main" / "java" / "com" / "example" + src.mkdir(parents=True) + return project_path + + +def _src_dir(project: Path) -> Path: + return project / "src" / "main" / "java" / "com" / "example" + + +def _has_qn_ending(qns: set[str], suffix: str) -> bool: + return any(qn.endswith(suffix) for qn in qns) + + +def test_interface_named_interface_ingested_as_interface_node( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Interface.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public interface Interface { + void doSomething(); +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + interface_nodes = get_nodes(mock_ingestor, NodeType.INTERFACE) + interface_qns = get_qualified_names(interface_nodes) + + assert _has_qn_ending(interface_qns, ".Interface"), ( + f"Interface named 'Interface' not found in Interface nodes. Got: {interface_qns}" + ) + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + interface_in_class = [qn for qn in class_qns if qn.endswith(".Interface")] + assert not interface_in_class, ( + f"Interface named 'Interface' should not appear as a Class node. 
Got: {interface_in_class}" + ) + + +def test_enum_named_enum_ingested_as_enum_node( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Enum.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public enum Enum { + VALUE_A, + VALUE_B, + VALUE_C +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + enum_nodes = get_nodes(mock_ingestor, NodeType.ENUM) + enum_qns = get_qualified_names(enum_nodes) + + assert _has_qn_ending(enum_qns, ".Enum"), ( + f"Enum named 'Enum' not found in Enum nodes. Got: {enum_qns}" + ) + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + enum_in_class = [qn for qn in class_qns if qn.endswith(".Enum")] + assert not enum_in_class, ( + f"Enum named 'Enum' should not appear as a Class node. Got: {enum_in_class}" + ) + + +def test_class_named_class_ingested_as_class_node( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Class.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Class { + public void run() {} +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + class_nodes = get_nodes(mock_ingestor, NodeType.CLASS) + class_qns = get_qualified_names(class_nodes) + + assert _has_qn_ending(class_qns, ".Class"), ( + f"Class named 'Class' not found in Class nodes. 
Got: {class_qns}" + ) + + +def test_interface_and_enum_labels_have_constraints() -> None: + assert NodeLabel.INTERFACE in NODE_UNIQUE_CONSTRAINTS, ( + "Interface label missing from NODE_UNIQUE_CONSTRAINTS" + ) + assert NodeLabel.ENUM in NODE_UNIQUE_CONSTRAINTS, ( + "Enum label missing from NODE_UNIQUE_CONSTRAINTS" + ) + assert NODE_UNIQUE_CONSTRAINTS[NodeLabel.INTERFACE] == "qualified_name" + assert NODE_UNIQUE_CONSTRAINTS[NodeLabel.ENUM] == "qualified_name" + + +def test_all_node_labels_have_constraints() -> None: + for label in NodeLabel: + assert label.value in NODE_UNIQUE_CONSTRAINTS, ( + f"NodeLabel.{label.name} ('{label.value}') missing from NODE_UNIQUE_CONSTRAINTS" + ) + + +def test_interface_named_interface_has_defines_relationship( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Interface.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public interface Interface { + void doSomething(); +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + defines_rels = get_relationships(mock_ingestor, "DEFINES") + found_defines = False + for rel in defines_rels: + if len(rel.args) >= 3: + to_spec = rel.args[2] + if isinstance(to_spec, tuple) and len(to_spec) >= 3: + to_label = to_spec[0] + to_qn = str(to_spec[2]) + if to_qn.endswith(".Interface"): + assert to_label == NodeType.INTERFACE, ( + f"DEFINES target label should be 'Interface', got '{to_label}'" + ) + found_defines = True + + assert found_defines, ( + "No DEFINES relationship found for Interface named 'Interface'" + ) + + +def test_enum_named_enum_has_defines_relationship( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Enum.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public enum Enum { + VALUE_A, + VALUE_B +} +""", + ) + 
run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + defines_rels = get_relationships(mock_ingestor, "DEFINES") + found_defines = False + for rel in defines_rels: + if len(rel.args) >= 3: + to_spec = rel.args[2] + if isinstance(to_spec, tuple) and len(to_spec) >= 3: + to_label = to_spec[0] + to_qn = str(to_spec[2]) + if to_qn.endswith(".Enum"): + assert to_label == NodeType.ENUM, ( + f"DEFINES target label should be 'Enum', got '{to_label}'" + ) + found_defines = True + + assert found_defines, "No DEFINES relationship found for Enum named 'Enum'" + + +def test_class_implementing_interface_named_interface( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Interface.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public interface Interface { + void doSomething(); +} +""", + ) + (src / "Implementor.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Implementor implements Interface { + public void doSomething() { + System.out.println("done"); + } +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + interface_qns = get_node_names(mock_ingestor, NodeType.INTERFACE) + assert _has_qn_ending(interface_qns, ".Interface") + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + assert _has_qn_ending(class_qns, ".Implementor") + + implements_rels = get_relationships(mock_ingestor, "IMPLEMENTS") + found_implements = False + for rel in implements_rels: + if len(rel.args) >= 3: + from_spec = rel.args[0] + if isinstance(from_spec, tuple) and len(from_spec) >= 3: + from_qn = str(from_spec[2]) + if from_qn.endswith(".Implementor"): + found_implements = True + + assert found_implements, ( + "No IMPLEMENTS relationship found for Implementor -> Interface" + ) + + +def test_multiple_label_colliding_names( + java_label_collision_project: Path, + 
mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Function.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Function { + public void execute() {} +} +""", + ) + (src / "Method.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Method { + public void invoke() {} +} +""", + ) + (src / "Module.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Module { + public void load() {} +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + assert _has_qn_ending(class_qns, ".Function") + assert _has_qn_ending(class_qns, ".Method") + assert _has_qn_ending(class_qns, ".Module") + + function_qns = get_node_names(mock_ingestor, NodeType.FUNCTION) + method_qns = get_node_names(mock_ingestor, NodeType.METHOD) + non_class_qns = function_qns | method_qns + collisions = [ + qn + for qn in non_class_qns + if qn.endswith(".Function") or qn.endswith(".Method") or qn.endswith(".Module") + ] + assert not collisions, ( + f"Class names colliding with node labels should not appear as wrong node types: {collisions}" + ) diff --git a/codebase_rag/tests/test_java_span_oracle.py b/codebase_rag/tests/test_java_span_oracle.py new file mode 100644 index 000000000..8ecff7bbb --- /dev/null +++ b/codebase_rag/tests/test_java_span_oracle.py @@ -0,0 +1,74 @@ +# (H) Covers Java node SPAN (end_line) validation: cgr's end_line for each node is +# (H) graded against the JDK Compiler Tree API oracle (which emits each node's +# (H) source end position), joined on (kind, file, start). Exercises a class with a +# (H) multi-line method signature, an interface, an enum, and a nested class so +# (H) spans are not trivially single line. 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_java_graph +from evals.oracles import java_available, run_java_oracle +from evals.score import score_span + +JAVA_SRC = """\ +package demo; + +public class Widget implements Shape { + private int size; + + public int area( + int scale + ) { + return this.size * scale; + } + + static class Inner { + int value() { + return 1; + } + } +} + +interface Shape { + int area(int scale); +} + +enum Color { + RED, + GREEN, + BLUE +} +""" + + +def _require_java() -> None: + if not java_available(): + pytest.skip("jdk (javac/java) not available") + if cs.SupportedLanguage.JAVA not in load_parsers()[0]: + pytest.skip("java parser not available") + + +def test_cgr_matches_jdk_oracle_on_node_spans(tmp_path: Path) -> None: + _require_java() + project = tmp_path / "java_span_test" + (project / "demo").mkdir(parents=True) + (project / "demo" / "Widget.java").write_text(JAVA_SRC, encoding="utf-8") + + cgr = extract_cgr_java_graph(project, project.name) + oracle = run_java_oracle(project) + + result = score_span(cgr, oracle, ec.JAVA_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 5, aggregate diff --git a/codebase_rag/tests/test_java_structure_oracle.py b/codebase_rag/tests/test_java_structure_oracle.py new file mode 100644 index 000000000..2c1db2004 --- /dev/null +++ b/codebase_rag/tests/test_java_structure_oracle.py @@ -0,0 +1,72 @@ +# (H) Covers the Java structure oracle harness (evals/oracles/java_oracle + +# (H) evals/java_l1.py): the JDK Compiler Tree API oracle is 
authoritative ground +# (H) truth, and cgr's captured Java nodes are graded against it on +# (H) (kind, file, start_line). Includes an anonymous class, whose methods cgr +# (H) models as standalone Functions (like JS object-literal methods). +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_java_nodes +from evals.oracles import java_available, run_java_oracle +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +JAVA_SRC = """\ +package demo; + +public class Sample { + private int x; + public Sample(int x) { this.x = x; } + public int area() { return x; } + public static Sample make(int x) { return new Sample(x); } + + interface Shape { double area(); } + enum Color { RED, GREEN } + static class Inner { void helper() {} } + + Runnable callback() { + return new Runnable() { + public void run() { helper2(); } + void helper2() {} + }; + } +} + +interface Drawable { void draw(); } + +enum Direction { NORTH, SOUTH } +""" + + +def _require_java() -> None: + if not java_available(): + pytest.skip("javac/java toolchain not available") + if cs.SupportedLanguage.JAVA not in load_parsers()[0]: + pytest.skip("java parser not available") + + +def test_cgr_matches_jdk_oracle_on_java_structure(tmp_path: Path) -> None: + _require_java() + project = tmp_path / "java_oracle_test" + project.mkdir() + (project / "Sample.java").write_text(JAVA_SRC, encoding="utf-8") + + cgr = GraphData( + nodes=extract_cgr_java_nodes(project, project.name), + edges=set(), + name_edges=set(), + ) + oracle = run_java_oracle(project) + + result = score_node_kinds(cgr, oracle, ec.JAVA_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + for label in ("Class", "Interface", "Enum", "Method", "Function"): + row = by_label.get(label) + assert row is not 
None, (label, by_label) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (label, row) diff --git a/codebase_rag/tests/test_javascript_containment_oracle.py b/codebase_rag/tests/test_javascript_containment_oracle.py new file mode 100644 index 000000000..bc197d92b --- /dev/null +++ b/codebase_rag/tests/test_javascript_containment_oracle.py @@ -0,0 +1,56 @@ +# (H) Covers JavaScript containment-edge validation: cgr's DEFINES (file module +# (H) -> class / top-level function) and DEFINES_METHOD (class -> method) edges +# (H) are graded against the TypeScript-compiler-API oracle run over .js, joined +# (H) on (kind, file, line). +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_js_graph +from evals.oracles import run_javascript_oracle, typescript_available +from evals.score import score_edge_types + +JS_SRC = """\ +export class Point { + constructor() { this.x = 0; } + area() { return 1.0; } +} + +export function free() { return 1; } +""" + + +def _require_js() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.JS not in load_parsers()[0]: + pytest.skip("javascript parser not available") + + +def test_cgr_matches_tsc_oracle_on_js_containment_edges(tmp_path: Path) -> None: + _require_js() + project = tmp_path / "js_edge" + project.mkdir() + (project / "lib.js").write_text(JS_SRC, encoding="utf-8") + + cgr = extract_cgr_js_graph(project, project.name) + oracle = run_javascript_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.DEFINES.value, + cs.RelationshipType.DEFINES_METHOD.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + 
assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_javascript_span_oracle.py b/codebase_rag/tests/test_javascript_span_oracle.py new file mode 100644 index 000000000..49a64333b --- /dev/null +++ b/codebase_rag/tests/test_javascript_span_oracle.py @@ -0,0 +1,65 @@ +# (H) Covers JavaScript node SPAN (end_line) validation: cgr's end_line for each +# (H) node is graded against the TS-compiler-API oracle run over .js (which emits +# (H) each node's full-span end line), joined on (kind, file, start). Exercises a +# (H) class with a multi-line method signature, a multi-line arrow assigned to a +# (H) const, and a nested arrow so spans are not trivially single line. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_js_graph +from evals.oracles import run_javascript_oracle, typescript_available +from evals.score import score_span + +JS_SRC = """\ +class Widget { + area( + scale, + ) { + return scale; + } +} + +function standalone() { + const cb = (v) => { + return v + 1; + }; + return cb(2); +} + +const arrow = (x) => { + return x * 2; +}; +""" + + +def _require_js() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.JS not in load_parsers()[0]: + pytest.skip("javascript parser not available") + + +def test_cgr_matches_tsc_oracle_on_javascript_node_spans(tmp_path: Path) -> None: + _require_js() + project = tmp_path / "js_span_test" + project.mkdir() + (project / "main.js").write_text(JS_SRC, encoding="utf-8") + + cgr = extract_cgr_js_graph(project, project.name) + oracle = run_javascript_oracle(project) + + result = score_span(cgr, oracle, ec.JS_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + 
aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 4, aggregate diff --git a/codebase_rag/tests/test_javascript_structure_oracle.py b/codebase_rag/tests/test_javascript_structure_oracle.py new file mode 100644 index 000000000..508326d0d --- /dev/null +++ b/codebase_rag/tests/test_javascript_structure_oracle.py @@ -0,0 +1,57 @@ +# (H) Covers the JavaScript structure oracle harness (evals/oracles/ts_oracle run +# (H) over .js/.jsx + evals/js_l1.py): the TS-compiler-API oracle is authoritative +# (H) ground truth, and cgr's captured JavaScript nodes are graded against it on +# (H) (kind, file, start_line). +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_js_nodes +from evals.oracles import run_javascript_oracle, typescript_available +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +JS_SRC = """\ +class Point { + constructor(x) { this.x = x; } + area() { return this.x; } +} + +function freeFn(a) { return a + 1; } +const arrow = (b) => b * 2; +const obj = { method() { return 1; } }; +[1, 2].forEach((n) => freeFn(n)); +""" + + +def _require_js() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.JS not in load_parsers()[0]: + pytest.skip("javascript parser not available") + + +def test_cgr_matches_tsc_oracle_on_javascript_structure(tmp_path: Path) -> None: + _require_js() + project = tmp_path / "js_oracle_test" + project.mkdir() + (project / "app.js").write_text(JS_SRC, encoding="utf-8") + + cgr = GraphData( + nodes=extract_cgr_js_nodes(project, project.name), + edges=set(), + 
name_edges=set(), + ) + oracle = run_javascript_oracle(project) + + result = score_node_kinds(cgr, oracle, ec.JS_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + for label in ("Class", "Function", "Method"): + row = by_label.get(label) + assert row is not None, (label, by_label) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (label, row) diff --git a/codebase_rag/tests/test_js_ts_utils_integration.py b/codebase_rag/tests/test_js_ts_utils_integration.py index d83ccf4ae..bc50fb53b 100644 --- a/codebase_rag/tests/test_js_ts_utils_integration.py +++ b/codebase_rag/tests/test_js_ts_utils_integration.py @@ -647,6 +647,78 @@ def test_deeply_nested_qn(self) -> None: assert result == "a.b.c.d.e" +@pytest.mark.skipif(not JS_AVAILABLE, reason="tree-sitter-javascript not available") +class TestFindMethodInAstCacheOwnerTracking: + def test_cache_invalidates_on_new_root_node( + self, js_parser: Parser, sample_js_project: Path + ) -> None: + from codebase_rag.parsers.js_ts import utils as js_utils + + tree1 = parse_file(js_parser, sample_js_project / "singleton.js") + root1 = tree1.root_node + result1 = find_method_in_ast(root1, "DatabaseConnection", "getInstance") + assert result1 is not None + owner_after_first = js_utils._CLASS_BODY_CACHE_OWNER + + tree2 = parse_file(js_parser, sample_js_project / "factory.js") + root2 = tree2.root_node + result2 = find_method_in_ast(root2, "Dog", "speak") + assert result2 is not None + owner_after_second = js_utils._CLASS_BODY_CACHE_OWNER + + assert owner_after_first != owner_after_second + + def test_cache_hit_returns_correct_result( + self, js_parser: Parser, sample_js_project: Path + ) -> None: + tree = parse_file(js_parser, sample_js_project / "factory.js") + root = tree.root_node + + result1 = find_method_in_ast(root, "Dog", "speak") + assert result1 is not None + + result2 = find_method_in_ast(root, "Dog", "fetch") + assert result2 is not None + + def test_cache_miss_returns_none( + self, 
js_parser: Parser, sample_js_project: Path + ) -> None: + tree = parse_file(js_parser, sample_js_project / "factory.js") + root = tree.root_node + + result = find_method_in_ast(root, "NonExistent", "method") + assert result is None + + result2 = find_method_in_ast(root, "NonExistent", "other") + assert result2 is None + + +@pytest.mark.skipif(not JS_AVAILABLE, reason="tree-sitter-javascript not available") +class TestFindReturnStatementsWithLanguageObj: + def test_with_language_obj( + self, js_parser: Parser, sample_js_project: Path + ) -> None: + tree = parse_file(js_parser, sample_js_project / "complex_returns.js") + set_name = find_method_in_ast(tree.root_node, "Builder", "setName") + assert set_name is not None + + language = Language(tsjs.language()) + return_nodes: list = [] + find_return_statements(set_name, return_nodes, language) + assert len(return_nodes) == 1 + + def test_fallback_without_language_obj( + self, js_parser: Parser, sample_js_project: Path + ) -> None: + tree = parse_file(js_parser, sample_js_project / "complex_returns.js") + set_name = find_method_in_ast(tree.root_node, "Builder", "setName") + assert set_name is not None + + return_nodes: list = [] + find_return_statements(set_name, return_nodes, None) + assert len(return_nodes) == 1 + + @pytest.mark.skipif(not TS_AVAILABLE, reason="tree-sitter-typescript not available") class TestTypeScriptIntegration: def test_find_generic_class_methods( diff --git a/codebase_rag/tests/test_js_type_inference_unit.py b/codebase_rag/tests/test_js_type_inference_unit.py index 21e008522..279ac8b7f 100644 --- a/codebase_rag/tests/test_js_type_inference_unit.py +++ b/codebase_rag/tests/test_js_type_inference_unit.py @@ -428,3 +428,89 @@ def test_variable_with_uninferrable_value_is_skipped( ) assert result == {} + + +class TestGetDeclaratorsViaQueryException: + def test_returns_none_when_queries_is_none( + self, + mock_import_processor: MagicMock, + mock_function_registry: MagicMock, + mock_find_method_ast_node: 
MagicMock, + ) -> None: + engine = JsTypeInferenceEngine( + import_processor=mock_import_processor, + function_registry=mock_function_registry, + project_name="test_project", + find_method_ast_node_func=mock_find_method_ast_node, + queries=None, + ) + root_node = create_mock_node("program", children=[]) + result = engine._get_declarators_via_query( + root_node, # ty: ignore[invalid-argument-type] # (H) MockNode not Node + ) + assert result is None + + def test_exception_in_query_continues_to_next_language( + self, + mock_import_processor: MagicMock, + mock_function_registry: MagicMock, + mock_find_method_ast_node: MagicMock, + ) -> None: + bad_language_obj = MagicMock() + bad_language_obj.side_effect = Exception("bad query") + + queries = { + cs.SupportedLanguage.JS: {"language": bad_language_obj}, + cs.SupportedLanguage.TS: {"language": bad_language_obj}, + } + + engine = JsTypeInferenceEngine( + import_processor=mock_import_processor, + function_registry=mock_function_registry, + project_name="test_project", + find_method_ast_node_func=mock_find_method_ast_node, + queries=queries, + ) + root_node = create_mock_node("program", children=[]) + result = engine._get_declarators_via_query( + root_node, # ty: ignore[invalid-argument-type] # (H) MockNode not Node + ) + assert result is None + + +class TestGetLanguageObj: + def test_returns_none_when_queries_is_none( + self, + mock_import_processor: MagicMock, + mock_function_registry: MagicMock, + mock_find_method_ast_node: MagicMock, + ) -> None: + engine = JsTypeInferenceEngine( + import_processor=mock_import_processor, + function_registry=mock_function_registry, + project_name="test_project", + find_method_ast_node_func=mock_find_method_ast_node, + queries=None, + ) + result = engine._get_language_obj() + assert result is None + + def test_returns_language_when_available( + self, + mock_import_processor: MagicMock, + mock_function_registry: MagicMock, + mock_find_method_ast_node: MagicMock, + ) -> None: + lang_obj = 
MagicMock() + queries = { + cs.SupportedLanguage.JS: {"language": lang_obj}, + } + engine = JsTypeInferenceEngine( + import_processor=mock_import_processor, + function_registry=mock_function_registry, + project_name="test_project", + find_method_ast_node_func=mock_find_method_ast_node, + queries=queries, + ) + result = engine._get_language_obj() + assert result is lang_obj diff --git a/codebase_rag/tests/test_l3_decorator_normalization.py b/codebase_rag/tests/test_l3_decorator_normalization.py new file mode 100644 index 000000000..a2e105398 --- /dev/null +++ b/codebase_rag/tests/test_l3_decorator_normalization.py @@ -0,0 +1,77 @@ +# (H) Covers the L3 eval harness (evals/calls_trace.py): a call to a functools.wraps +# (H) decorated function dispatches through the decorator's generic wrapper at runtime, +# (H) but cgr's static graph resolves the call to the function itself. The trace must +# (H) attribute the wrapper frame to the wrapped function so the two agree. +from __future__ import annotations + +import importlib.util +import textwrap +from pathlib import Path + +from evals.calls_trace import trace_calls + +MOD_SRC = textwrap.dedent( + """ + from functools import wraps + + + def guard(fn): + @wraps(fn) + def wrapper(*args, **kwargs): + return fn(*args, **kwargs) + + return wrapper + + + def helper(): + return 1 + + + @guard + def target_fn(): + return helper() + + + def caller(): + return target_fn() + """ +) + + +def _load_module(mod_path: Path): + spec = importlib.util.spec_from_file_location("evaltest_decorator_mod", mod_path) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _trace(tmp_path: Path) -> set[tuple[str, str]]: + pkg = tmp_path / "pkgx" + pkg.mkdir() + (pkg / "__init__.py").write_text("") + mod_path = pkg / "mod.py" + mod_path.write_text(MOD_SRC) + module = _load_module(mod_path) + return trace_calls(module.caller, pkg, "pkgx") + + 
+class TestDecoratorWrapperNormalization: + def test_call_attributed_to_wrapped_function_not_wrapper( + self, tmp_path: Path + ) -> None: + edges = _trace(tmp_path) + assert ("pkgx.mod.caller", "pkgx.mod.target_fn") in edges, edges + + def test_no_generic_wrapper_node_appears(self, tmp_path: Path) -> None: + edges = _trace(tmp_path) + wrapper_edges = [ + (frm, to) + for frm, to in edges + if frm.endswith("wrapper") or to.endswith("wrapper") + ] + assert wrapper_edges == [], wrapper_edges + + def test_wrapped_function_body_calls_are_preserved(self, tmp_path: Path) -> None: + edges = _trace(tmp_path) + assert ("pkgx.mod.target_fn", "pkgx.mod.helper") in edges, edges diff --git a/codebase_rag/tests/test_language_node_coverage.py b/codebase_rag/tests/test_language_node_coverage.py index 74648125f..7ee255693 100644 --- a/codebase_rag/tests/test_language_node_coverage.py +++ b/codebase_rag/tests/test_language_node_coverage.py @@ -3,8 +3,8 @@ import pytest from codebase_rag.constants import ( + C_EXTENSIONS, CPP_EXTENSIONS, - CS_EXTENSIONS, GO_EXTENSIONS, JAVA_EXTENSIONS, JS_EXTENSIONS, @@ -60,8 +60,8 @@ def test_each_language_has_file_extensions(self, lang: SupportedLanguage) -> Non (SupportedLanguage.GO, GO_EXTENSIONS), (SupportedLanguage.SCALA, SCALA_EXTENSIONS), (SupportedLanguage.JAVA, JAVA_EXTENSIONS), + (SupportedLanguage.C, C_EXTENSIONS), (SupportedLanguage.CPP, CPP_EXTENSIONS), - (SupportedLanguage.CSHARP, CS_EXTENSIONS), (SupportedLanguage.PHP, PHP_EXTENSIONS), (SupportedLanguage.LUA, LUA_EXTENSIONS), ] @@ -87,11 +87,11 @@ def test_language_spec_has_correct_extensions( (".go", SupportedLanguage.GO), (".scala", SupportedLanguage.SCALA), (".java", SupportedLanguage.JAVA), + (".c", SupportedLanguage.C), (".cpp", SupportedLanguage.CPP), (".h", SupportedLanguage.CPP), (".hpp", SupportedLanguage.CPP), (".cc", SupportedLanguage.CPP), - (".cs", SupportedLanguage.CSHARP), (".php", SupportedLanguage.PHP), (".lua", SupportedLanguage.LUA), ] diff --git 
a/codebase_rag/tests/test_llm_service_unit.py b/codebase_rag/tests/test_llm_service_unit.py index 74127c7f5..4fc69287d 100644 --- a/codebase_rag/tests/test_llm_service_unit.py +++ b/codebase_rag/tests/test_llm_service_unit.py @@ -231,12 +231,13 @@ def test_creates_agent_with_tools( mock_agent.return_value = MagicMock() tools = [MagicMock(), MagicMock()] - result = create_rag_orchestrator(tools) + agent, system_prompt = create_rag_orchestrator(tools) mock_agent.assert_called_once() call_kwargs = mock_agent.call_args.kwargs assert call_kwargs["tools"] == tools - assert result is not None + assert agent is not None + assert system_prompt == "System prompt" @patch("codebase_rag.services.llm.settings") @patch("codebase_rag.services.llm.get_provider_from_config") diff --git a/codebase_rag/tests/test_local_alias_calls.py b/codebase_rag/tests/test_local_alias_calls.py new file mode 100644 index 000000000..017638524 --- /dev/null +++ b/codebase_rag/tests/test_local_alias_calls.py @@ -0,0 +1,90 @@ +# (H) L3 finding from the evals/ harness: a function bound to a local variable and +# (H) then called through that alias (g = self._method; g()) runs the aliased +# (H) callable at runtime, but cgr saw a bare-name call that resolved to nothing. +# (H) A call through a local alias must produce a CALLS edge to the aliased target. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +MODULE_SRC = """class Engine: + def run(self) -> str: + do = self._start + return do() + + def _start(self) -> str: + return helper() + + +def helper() -> str: + return "x" + + +def top() -> str: + fn = helper + return fn() +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestLocalAliasCalls: + def test_alias_to_self_method_is_a_call(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.Engine.run", "proj.m.Engine._start") in calls, calls + + def test_alias_to_module_function_is_a_call(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert 
("proj.m.top", "proj.m.helper") in calls, calls + + def test_direct_call_unaffected(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.Engine._start", "proj.m.helper") in calls, calls diff --git a/codebase_rag/tests/test_local_alias_chain_resolution.py b/codebase_rag/tests/test_local_alias_chain_resolution.py new file mode 100644 index 000000000..a2c964507 --- /dev/null +++ b/codebase_rag/tests/test_local_alias_chain_resolution.py @@ -0,0 +1,96 @@ +# (H) L3 finding from the evals/ harness: CallProcessor._ingest_function_calls does +# (H) `registry = resolver.function_registry` (resolver = self._resolver) then +# (H) `qn in registry`, dispatching to FunctionRegistryTrie.__contains__. Resolving it +# (H) needs local-variable aliasing (local = self.attr) plus cross-class attribute-chain +# (H) typing (local2 = local.attr) so the operand's concrete type is known. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + "pkg/registry.py": ( + "class Registry:\n def __contains__(self, key):\n return True\n" + ), + "pkg/resolver.py": ( + "from .registry import Registry\n\n\n" + "class Resolver:\n" + " def __init__(self) -> None:\n" + " self.registry = Registry()\n" + ), + "pkg/proc.py": ( + "from .resolver import Resolver\n\n\n" + "class Proc:\n" + " def __init__(self) -> None:\n" + " self._resolver = Resolver()\n\n" + " def run(self, qn):\n" + " resolver = self._resolver\n" + " registry = resolver.registry\n" + " return qn in registry\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def 
ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestLocalAliasChainResolution: + def test_local_alias_attribute_chain_dispatches_to_dunder( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.proc.Proc.run", + "proj.pkg.registry.Registry.__contains__", + ) in calls, calls diff --git a/codebase_rag/tests/test_lua_containment_oracle.py b/codebase_rag/tests/test_lua_containment_oracle.py new file mode 100644 index 000000000..1d517b8ba --- /dev/null +++ b/codebase_rag/tests/test_lua_containment_oracle.py @@ -0,0 +1,56 @@ +# (H) Covers Lua containment-edge validation. Lua has no classes/methods, so the +# (H) only containment edge is DEFINES: the file module DEFINES top-level +# (H) functions, and a function DEFINES the functions nested in its body. Graded +# (H) against the independent luaparse oracle, joined on (kind, file, line). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_lua_graph +from evals.oracles import lua_oracle_available, run_lua_oracle +from evals.score import score_edge_types + +LUA_SRC = """\ +local function freeFn(a) + return a + 1 +end + +function globalFn() + local function nested() + return 1 + end + return nested +end + +local cb = function(x) return x end +""" + + +def _require_lua() -> None: + if not lua_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.LUA not in load_parsers()[0]: + pytest.skip("lua parser not available") + + +def test_cgr_matches_luaparse_oracle_on_containment_edges(tmp_path: Path) -> None: + _require_lua() + project = tmp_path / "lua_edge" + project.mkdir() + (project / "lib.lua").write_text(LUA_SRC, encoding="utf-8") + + cgr = extract_cgr_lua_graph(project, project.name) + oracle = run_lua_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + # (H) Lua only has DEFINES (no methods, so no DEFINES_METHOD row at all). 
+ row = by_label.get(cs.RelationshipType.DEFINES.value) + assert row is not None, (by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (row, result.diff) + assert cs.RelationshipType.DEFINES_METHOD.value not in by_label, by_label diff --git a/codebase_rag/tests/test_lua_modern_features.py b/codebase_rag/tests/test_lua_modern_features.py index 0cf6003a4..a9e84265f 100644 --- a/codebase_rag/tests/test_lua_modern_features.py +++ b/codebase_rag/tests/test_lua_modern_features.py @@ -621,9 +621,23 @@ def test_lua_54_enhanced_stdlib(temp_repo: Path, mock_ingestor: MagicMock) -> No assert expected_fn in fn_qns, f"Missing function: {expected_fn}" calls_rels = get_relationships(mock_ingestor, "CALLS") - - assert len(calls_rels) >= 10, ( - f"Expected at least 10 CALLS, got {len(calls_rels)}" + call_edges = {(c.args[0][2], c.args[2][2]) for c in calls_rels} + + # (H) stdlib calls (math.*, string.*, table.*, io.*, os.*) are not + # (H) first-party, so the only CALLS edges are between StdLib methods: + # (H) run_all_tests fans out to the six test_* methods, and the + # (H) top-level `StdLib.run_all_tests()` in main.lua is attributed to + # (H) the main module (not duplicated onto every nested call site). 
+ run_all = f"{stdlib_qn}.StdLib.run_all_tests" + main_qn = f"{project.name}.main" + for method in expected_functions: + if method == run_all: + continue + assert (run_all, method) in call_edges, ( + f"Missing CALLS edge {run_all} -> {method}" + ) + assert (main_qn, run_all) in call_edges, ( + f"Missing module-level CALLS edge {main_qn} -> {run_all}" ) print("✅ Lua 5.4 enhanced standard library test PASSED") diff --git a/codebase_rag/tests/test_lua_span_oracle.py b/codebase_rag/tests/test_lua_span_oracle.py new file mode 100644 index 000000000..9f70f4641 --- /dev/null +++ b/codebase_rag/tests/test_lua_span_oracle.py @@ -0,0 +1,58 @@ +# (H) Covers Lua node SPAN (end_line) validation: cgr's end_line for each Function +# (H) is graded against the luaparse oracle (which emits node.loc.end.line), joined +# (H) on (kind, file, start). Exercises a global function, a nested function, and a +# (H) multi-line anonymous function expression so spans are not single line. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_lua_graph +from evals.oracles import lua_oracle_available, run_lua_oracle +from evals.score import score_span + +LUA_SRC = """\ +function outer(a, b) + local function inner(x) + return x + 1 + end + return inner(a) + b +end + +local handler = function(v) + return v * 2 +end + +return outer(handler(1), 2) +""" + + +def _require_lua() -> None: + if not lua_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.LUA not in load_parsers()[0]: + pytest.skip("lua parser not available") + + +def test_cgr_matches_luaparse_oracle_on_node_spans(tmp_path: Path) -> None: + _require_lua() + project = tmp_path / "lua_span_test" + project.mkdir() + (project / "lib.lua").write_text(LUA_SRC, encoding="utf-8") + + cgr = 
extract_cgr_lua_graph(project, project.name) + oracle = run_lua_oracle(project) + + result = score_span(cgr, oracle, ec.LUA_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 3, aggregate diff --git a/codebase_rag/tests/test_lua_structure_oracle.py b/codebase_rag/tests/test_lua_structure_oracle.py new file mode 100644 index 000000000..c30b49f9e --- /dev/null +++ b/codebase_rag/tests/test_lua_structure_oracle.py @@ -0,0 +1,54 @@ +# (H) Covers the Lua structure oracle harness (evals/oracles/lua_oracle + +# (H) evals/lua_l1.py): the luaparse oracle is authoritative ground truth, and +# (H) cgr's captured Lua nodes are graded against it on (kind, file, start_line). +# (H) Lua has no classes, so every function is a Function. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_lua_nodes +from evals.oracles import lua_oracle_available, run_lua_oracle +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +LUA_SRC = """\ +local M = {} +function freeFn(a) return a + 1 end +local function localFn(b) return b end +function M.tableFn(c) return c end +function M:methodFn(d) return d end +local arrow = function(e) return e end +return M +""" + + +def _require_lua() -> None: + if not lua_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.LUA not in load_parsers()[0]: + pytest.skip("lua parser not available") + + +def test_cgr_matches_luaparse_oracle_on_lua_structure(tmp_path: Path) -> None: + _require_lua() + project = tmp_path / "lua_oracle_test" + 
project.mkdir() + (project / "m.lua").write_text(LUA_SRC, encoding="utf-8") + + cgr = GraphData( + nodes=extract_cgr_lua_nodes(project, project.name), + edges=set(), + name_edges=set(), + ) + oracle = run_lua_oracle(project) + + result = score_node_kinds(cgr, oracle, ec.LUA_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + row = by_label.get(cs.NodeLabel.FUNCTION.value) + assert row is not None, by_label + assert row["precision"] == 1.0 and row["recall"] == 1.0, row diff --git a/codebase_rag/tests/test_mcp_query_and_index.py b/codebase_rag/tests/test_mcp_query_and_index.py index ce9a5ffcd..89cbdc267 100644 --- a/codebase_rag/tests/test_mcp_query_and_index.py +++ b/codebase_rag/tests/test_mcp_query_and_index.py @@ -364,6 +364,77 @@ async def test_sequential_index_only_clears_own_project_data( assert mock_ingestor.delete_project.call_count == 2 +class TestIndexRepositoryConstraintsAndFlush: + """Regression tests for issue #2: MCP indexing produced an incomplete graph. + + The MCP path diverged from the CLI path: it never called + ``ensure_constraints()`` and never defensively flushed the long-lived + ingestor before/after ``GraphUpdater.run()``, so stale buffered state could + leak across calls and missing constraints/indexes corrupted node creation. + + NOTE: A full assertion that ``Class`` and ``Method`` nodes are persisted + requires a live Memgraph backend (the in-repo ``_MockIngestor`` does not + persist a real graph, and ``GraphUpdater`` emits those node batches + regardless of the orchestration bug). These tests instead pin the + orchestration that the CLI path performs and the MCP path was missing. 
+ """ + + @staticmethod + def _ordered_calls(manager: MagicMock) -> list[str]: + tracked = { + "ingestor.ensure_constraints", + "ingestor.flush_all", + "updater.run", + } + return [name for name, _, _ in manager.mock_calls if name in tracked] + + async def test_index_ensures_constraints_and_flushes_around_run( + self, temp_project_root: Path + ) -> None: + manager = MagicMock() + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=manager.ingestor, + cypher_gen=MagicMock(), + ) + + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_class: + mock_updater_class.return_value = manager.updater + manager.updater.run.return_value = None + + await registry.index_repository() + + assert self._ordered_calls(manager) == [ + "ingestor.ensure_constraints", + "ingestor.flush_all", + "updater.run", + "ingestor.flush_all", + ] + + async def test_update_ensures_constraints_and_flushes_around_run( + self, temp_project_root: Path + ) -> None: + manager = MagicMock() + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=manager.ingestor, + cypher_gen=MagicMock(), + ) + + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_class: + mock_updater_class.return_value = manager.updater + manager.updater.run.return_value = None + + await registry.update_repository() + + assert self._ordered_calls(manager) == [ + "ingestor.ensure_constraints", + "ingestor.flush_all", + "updater.run", + "ingestor.flush_all", + ] + + class TestQueryAndIndexIntegration: """Test integration between querying and indexing.""" diff --git a/codebase_rag/tests/test_mcp_server.py b/codebase_rag/tests/test_mcp_server.py index 6d621e76d..c84901bf6 100644 --- a/codebase_rag/tests/test_mcp_server.py +++ b/codebase_rag/tests/test_mcp_server.py @@ -1,10 +1,13 @@ +import contextlib import os +from collections.abc import AsyncIterator from pathlib import Path from typing import Any -from unittest.mock import patch +from unittest.mock import 
AsyncMock, MagicMock, patch import pytest +from codebase_rag.mcp import server as srv from codebase_rag.mcp.server import get_project_root @@ -173,3 +176,51 @@ def test_works_with_actual_cwd(self) -> None: assert result == actual_cwd.resolve() assert result.exists() assert result.is_dir() + + +class TestServiceLifecycle: + """Tests that the MCP server lifecycle releases the Qdrant client.""" + + def test_service_lifecycle_closes_qdrant_on_exit(self) -> None: + mock_ingestor = MagicMock() + + with patch.object(srv, "close_qdrant_client") as mock_close: + with srv._service_lifecycle(mock_ingestor): + mock_ingestor.__enter__.assert_called_once() + mock_close.assert_not_called() + mock_close.assert_called_once_with() + mock_ingestor.__exit__.assert_called_once() + + def test_service_lifecycle_closes_qdrant_on_exception(self) -> None: + mock_ingestor = MagicMock() + + with patch.object(srv, "close_qdrant_client") as mock_close: + with pytest.raises(RuntimeError): + with srv._service_lifecycle(mock_ingestor): + raise RuntimeError("boom") + mock_close.assert_called_once_with() + mock_ingestor.__exit__.assert_called_once() + + +class TestServeStdioShutdown: + """Tests that serve_stdio releases the Qdrant lock on shutdown.""" + + async def test_serve_stdio_closes_qdrant_client_on_shutdown(self) -> None: + mock_ingestor = MagicMock() + mock_server = MagicMock() + mock_server.run = AsyncMock() + mock_server.create_initialization_options = MagicMock(return_value=MagicMock()) + + @contextlib.asynccontextmanager + async def fake_stdio() -> AsyncIterator[tuple[MagicMock, MagicMock]]: + yield (MagicMock(), MagicMock()) + + with patch.object( + srv, "create_server", return_value=(mock_server, mock_ingestor) + ): + with patch.object(srv, "stdio_server", fake_stdio): + with patch.object(srv, "close_qdrant_client") as mock_close: + await srv.serve_stdio() + + mock_close.assert_called_once_with() + mock_server.run.assert_awaited_once() diff --git 
a/codebase_rag/tests/test_mcp_tools_helpers.py b/codebase_rag/tests/test_mcp_tools_helpers.py new file mode 100644 index 000000000..7804c9fa0 --- /dev/null +++ b/codebase_rag/tests/test_mcp_tools_helpers.py @@ -0,0 +1,98 @@ +from unittest.mock import MagicMock, patch + +from codebase_rag import constants as cs + +_PATCH_DELETE = "codebase_rag.mcp.tools.delete_project_embeddings" + + +def _make_registry(mock_ingestor: MagicMock) -> MagicMock: + from codebase_rag.mcp.tools import MCPToolsRegistry + + registry = MagicMock(spec=MCPToolsRegistry) + registry.ingestor = mock_ingestor + registry._get_project_node_ids = MCPToolsRegistry._get_project_node_ids.__get__( + registry + ) + registry._cleanup_project_embeddings = ( + MCPToolsRegistry._cleanup_project_embeddings.__get__(registry) + ) + return registry + + +class TestGetProjectNodeIds: + def test_returns_integer_ids(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 1}, + {cs.KEY_NODE_ID: 2}, + {cs.KEY_NODE_ID: 3}, + ] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("myproject") + + assert result == [1, 2, 3] + mock_ingestor.fetch_all.assert_called_once_with( + cs.CYPHER_QUERY_PROJECT_NODE_IDS, + {cs.KEY_PROJECT_NAME: "myproject"}, + ) + + def test_filters_non_integer_ids(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 1}, + {cs.KEY_NODE_ID: "not_an_int"}, + {cs.KEY_NODE_ID: None}, + {cs.KEY_NODE_ID: 4}, + ] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("proj") + + assert result == [1, 4] + + def test_returns_empty_when_no_rows(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("empty") + + assert result == [] + + def test_skips_rows_missing_key(self) -> None: + mock_ingestor = MagicMock() + 
mock_ingestor.fetch_all.return_value = [ + {"other_key": 99}, + {cs.KEY_NODE_ID: 5}, + ] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("proj") + + assert result == [5] + + +class TestCleanupProjectEmbeddings: + def test_calls_delete_with_node_ids(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 10}, + {cs.KEY_NODE_ID: 20}, + ] + registry = _make_registry(mock_ingestor) + + with patch(_PATCH_DELETE) as mock_delete: + registry._cleanup_project_embeddings("myproject") + + mock_delete.assert_called_once_with("myproject", [10, 20]) + + def test_calls_delete_with_empty_list_when_no_nodes(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [] + registry = _make_registry(mock_ingestor) + + with patch(_PATCH_DELETE) as mock_delete: + registry._cleanup_project_embeddings("empty_proj") + + mock_delete.assert_called_once_with("empty_proj", []) diff --git a/codebase_rag/tests/test_mcp_update_and_search.py b/codebase_rag/tests/test_mcp_update_and_search.py new file mode 100644 index 000000000..b01128931 --- /dev/null +++ b/codebase_rag/tests/test_mcp_update_and_search.py @@ -0,0 +1,496 @@ +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.mcp.client import query_mcp_server +from codebase_rag.mcp.tools import MCPToolsRegistry + +pytestmark = [pytest.mark.anyio] + + +@pytest.fixture(params=["asyncio"]) +def anyio_backend(request: pytest.FixtureRequest) -> str: + return str(request.param) + + +@pytest.fixture +def temp_project_root(tmp_path: Path) -> Path: + sample_file = tmp_path / "app.py" + sample_file.write_text("def main(): pass\n", encoding="utf-8") + return tmp_path + + +@pytest.fixture +def mcp_registry(temp_project_root: Path) -> MCPToolsRegistry: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + registry = 
MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + return registry + + +class TestUpdateRepository: + async def test_update_repository_success( + self, mcp_registry: MCPToolsRegistry + ) -> None: + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_cls: + mock_updater = MagicMock() + mock_updater_cls.return_value = mock_updater + + result = await mcp_registry.update_repository() + + mock_updater_cls.assert_called_once() + mock_updater.run.assert_called_once() + assert mcp_registry.project_root in result + + async def test_update_repository_error( + self, mcp_registry: MCPToolsRegistry + ) -> None: + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_cls: + mock_updater_cls.side_effect = RuntimeError("parse error") + + result = await mcp_registry.update_repository() + + assert "Error" in result + + async def test_update_repository_registered( + self, mcp_registry: MCPToolsRegistry + ) -> None: + assert cs.MCPToolName.UPDATE_REPOSITORY in mcp_registry._tools + + async def test_update_repository_no_wipe( + self, mcp_registry: MCPToolsRegistry + ) -> None: + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_cls: + mock_updater = MagicMock() + mock_updater_cls.return_value = mock_updater + + await mcp_registry.update_repository() + + mcp_registry.ingestor.delete_project.assert_not_called() + mcp_registry.ingestor.clean_database.assert_not_called() + + +class TestSemanticSearchRegistration: + def test_semantic_search_not_registered_without_deps( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + assert cs.MCPToolName.SEMANTIC_SEARCH not in registry._tools + assert 
registry._semantic_search_available is False + + def test_semantic_search_registered_with_deps( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with ( + patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=True, + ), + patch( + "codebase_rag.tools.semantic_search.create_semantic_search_tool" + ) as mock_create, + ): + mock_tool = MagicMock() + mock_create.return_value = mock_tool + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + assert cs.MCPToolName.SEMANTIC_SEARCH in registry._tools + assert registry._semantic_search_available is True + + async def test_semantic_search_calls_tool(self, temp_project_root: Path) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with ( + patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=True, + ), + patch( + "codebase_rag.tools.semantic_search.create_semantic_search_tool" + ) as mock_create, + ): + mock_tool = MagicMock() + mock_tool.function = AsyncMock(return_value="result1, result2") + mock_create.return_value = mock_tool + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + result = await registry.semantic_search("find auth functions", top_k=3) + + mock_tool.function.assert_called_once_with( + query="find auth functions", top_k=3 + ) + assert "result1" in result + + +class TestAskAgent: + async def test_ask_agent_registered(self, mcp_registry: MCPToolsRegistry) -> None: + assert cs.MCPToolName.ASK_AGENT in mcp_registry._tools + + async def test_ask_agent_success(self, mcp_registry: MCPToolsRegistry) -> None: + mock_agent = MagicMock() + mock_response = MagicMock() + mock_response.output = "The auth module uses JWT tokens." 
+ mock_agent.run = AsyncMock(return_value=mock_response) + mcp_registry.rag_agent = mock_agent + + result = await mcp_registry.ask_agent("How is auth implemented?") + + assert result["output"] == "The auth module uses JWT tokens." + mock_agent.run.assert_called_once_with( + "How is auth implemented?", message_history=[] + ) + + async def test_ask_agent_error(self, mcp_registry: MCPToolsRegistry) -> None: + mock_agent = MagicMock() + mock_agent.run = AsyncMock(side_effect=RuntimeError("LLM unavailable")) + mcp_registry.rag_agent = mock_agent + + result = await mcp_registry.ask_agent("What does main do?") + + assert "error" in result + + +class TestToolDescriptions: + def test_update_repository_in_tool_map(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_TOOLS + + assert cs.MCPToolName.UPDATE_REPOSITORY in MCP_TOOLS + + def test_semantic_search_in_tool_map(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_TOOLS + + assert cs.MCPToolName.SEMANTIC_SEARCH in MCP_TOOLS + + def test_ask_agent_in_tool_map(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_TOOLS + + assert cs.MCPToolName.ASK_AGENT in MCP_TOOLS + + def test_index_repository_warns_about_project_clear(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_INDEX_REPOSITORY + + assert "current project" in MCP_INDEX_REPOSITORY + assert "entire database" not in MCP_INDEX_REPOSITORY + + +class TestRagAgentProperty: + def test_rag_agent_setter_allows_mock(self, mcp_registry: MCPToolsRegistry) -> None: + mock_agent = MagicMock() + mcp_registry.rag_agent = mock_agent + assert mcp_registry.rag_agent is mock_agent + + def test_rag_agent_lazy_init(self, temp_project_root: Path) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + 
cypher_gen=mock_cypher_gen, + ) + + assert registry._rag_agent is None + + with patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create: + mock_agent = MagicMock() + mock_create.return_value = (mock_agent, "system prompt") + + agent = registry.rag_agent + + mock_create.assert_called_once() + assert agent is mock_agent + + def test_rag_agent_includes_function_source_tool( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + with ( + patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create, + patch( + "codebase_rag.tools.semantic_search.create_get_function_source_tool" + ) as mock_fst, + ): + mock_tool = MagicMock() + mock_fst.return_value = mock_tool + mock_create.return_value = (MagicMock(), "system prompt") + + registry.rag_agent + + tools_arg = mock_create.call_args[1]["tools"] + assert mock_tool in tools_arg + + def test_rag_agent_includes_semantic_search_when_available( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with ( + patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=True, + ), + patch( + "codebase_rag.tools.semantic_search.create_semantic_search_tool" + ) as mock_ss, + ): + mock_ss_tool = MagicMock() + mock_ss.return_value = mock_ss_tool + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + with ( + patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create, + patch("codebase_rag.tools.semantic_search.create_get_function_source_tool"), + ): + mock_create.return_value = (MagicMock(), "system prompt") + registry.rag_agent + + tools_arg = 
mock_create.call_args[1]["tools"] + assert mock_ss_tool in tools_arg + + def test_rag_agent_caches_after_first_access(self, temp_project_root: Path) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + with ( + patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create, + patch("codebase_rag.tools.semantic_search.create_get_function_source_tool"), + ): + mock_create.return_value = (MagicMock(), "system prompt") + + agent1 = registry.rag_agent + agent2 = registry.rag_agent + + mock_create.assert_called_once() + assert agent1 is agent2 + + +class TestMainSingleQuery: + def test_main_single_query_prints_output( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + from codebase_rag.main import main_single_query + + mock_response = MagicMock() + mock_response.output = "The answer is 42." + + with ( + patch("codebase_rag.main.connect_memgraph") as mock_conn, + patch("codebase_rag.main._initialize_services_and_agent") as mock_init, + patch("codebase_rag.main.asyncio") as mock_asyncio, + patch("codebase_rag.main._setup_common_initialization"), + ): + mock_agent = MagicMock() + mock_init.return_value = (mock_agent, [], "system prompt") + mock_asyncio.run.return_value = mock_response + mock_conn.return_value.__enter__ = MagicMock(return_value=MagicMock()) + mock_conn.return_value.__exit__ = MagicMock(return_value=False) + + main_single_query(str(tmp_path), 1000, "What is the answer?") + + captured = capsys.readouterr() + assert "The answer is 42." 
in captured.out + + def test_main_single_query_routes_logs_to_stderr(self, tmp_path: Path) -> None: + from codebase_rag.main import main_single_query + + mock_response = MagicMock() + mock_response.output = "result" + + with ( + patch("codebase_rag.main.connect_memgraph") as mock_conn, + patch("codebase_rag.main._initialize_services_and_agent") as mock_init, + patch("codebase_rag.main.asyncio") as mock_asyncio, + patch("codebase_rag.main._setup_common_initialization"), + patch("codebase_rag.main.logger") as mock_logger, + ): + mock_agent = MagicMock() + mock_init.return_value = (mock_agent, [], "system prompt") + mock_asyncio.run.return_value = mock_response + mock_conn.return_value.__enter__ = MagicMock(return_value=MagicMock()) + mock_conn.return_value.__exit__ = MagicMock(return_value=False) + + main_single_query(str(tmp_path), 1000, "test") + + mock_logger.remove.assert_called_once() + mock_logger.add.assert_called_once() + add_args = mock_logger.add.call_args + import sys + + assert add_args[0][0] is sys.stderr + + +class TestMCPClient: + def test_query_mcp_server_is_callable(self) -> None: + assert callable(query_mcp_server) + + def test_client_uses_constants(self) -> None: + import inspect + + from codebase_rag.mcp import client + + source = inspect.getsource(client) + assert "MCPToolName.ASK_AGENT" in source + assert "MCPParamName.QUESTION" in source + + def test_query_with_errlog_is_async(self) -> None: + import asyncio + + from codebase_rag.mcp.client import _query_with_errlog + + assert asyncio.iscoroutinefunction(_query_with_errlog) + + async def test_query_with_errlog_json_response(self) -> None: + import io + + from codebase_rag.mcp.client import _query_with_errlog + + mock_content = MagicMock() + mock_content.text = '{"output": "test answer"}' + mock_result = MagicMock() + mock_result.content = [mock_content] + + mock_session = AsyncMock() + mock_session.initialize = AsyncMock() + mock_session.call_tool = AsyncMock(return_value=mock_result) + 
mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + mock_transport = AsyncMock() + mock_transport.__aenter__ = AsyncMock(return_value=(MagicMock(), MagicMock())) + mock_transport.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("codebase_rag.mcp.client.stdio_client", return_value=mock_transport), + patch("codebase_rag.mcp.client.ClientSession", return_value=mock_session), + ): + result = await _query_with_errlog("test question", io.StringIO()) + + assert result == {"output": "test answer"} + + async def test_query_with_errlog_non_json_response(self) -> None: + import io + + from codebase_rag.mcp.client import _query_with_errlog + + mock_content = MagicMock() + mock_content.text = "plain text response" + mock_result = MagicMock() + mock_result.content = [mock_content] + + mock_session = AsyncMock() + mock_session.initialize = AsyncMock() + mock_session.call_tool = AsyncMock(return_value=mock_result) + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + mock_transport = AsyncMock() + mock_transport.__aenter__ = AsyncMock(return_value=(MagicMock(), MagicMock())) + mock_transport.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("codebase_rag.mcp.client.stdio_client", return_value=mock_transport), + patch("codebase_rag.mcp.client.ClientSession", return_value=mock_session), + ): + result = await _query_with_errlog("test", io.StringIO()) + + assert result == {"output": "plain text response"} + + async def test_query_with_errlog_empty_response(self) -> None: + import io + + from codebase_rag.mcp.client import _query_with_errlog + + mock_result = MagicMock() + mock_result.content = [] + + mock_session = AsyncMock() + mock_session.initialize = AsyncMock() + mock_session.call_tool = AsyncMock(return_value=mock_result) + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = 
AsyncMock(return_value=False) + + mock_transport = AsyncMock() + mock_transport.__aenter__ = AsyncMock(return_value=(MagicMock(), MagicMock())) + mock_transport.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("codebase_rag.mcp.client.stdio_client", return_value=mock_transport), + patch("codebase_rag.mcp.client.ClientSession", return_value=mock_session), + ): + result = await _query_with_errlog("test", io.StringIO()) + + assert result == {"output": "No response from server"} + + def test_query_mcp_server_opens_devnull(self) -> None: + with ( + patch("codebase_rag.mcp.client.asyncio") as mock_asyncio, + patch("builtins.open", MagicMock()) as mock_open, + ): + mock_asyncio.run.return_value = {"output": "result"} + query_mcp_server("test") + mock_open.assert_called_once() diff --git a/codebase_rag/tests/test_mcp_write_file.py b/codebase_rag/tests/test_mcp_write_file.py index 6c214c12a..dd222e9c6 100644 --- a/codebase_rag/tests/test_mcp_write_file.py +++ b/codebase_rag/tests/test_mcp_write_file.py @@ -199,6 +199,10 @@ class TestWriteFileErrorHandling: @pytest.mark.skipif( os.name == "nt", reason="chmod 0o444 does not prevent file creation on Windows" ) + @pytest.mark.skipif( + hasattr(os, "getuid") and os.getuid() == 0, + reason="root bypasses filesystem permissions", + ) async def test_write_to_readonly_directory( self, mcp_registry: MCPToolsRegistry, temp_project_root: Path ) -> None: diff --git a/codebase_rag/tests/test_memgraph_batching.py b/codebase_rag/tests/test_memgraph_batching.py index a3297e819..81c068b66 100644 --- a/codebase_rag/tests/test_memgraph_batching.py +++ b/codebase_rag/tests/test_memgraph_batching.py @@ -64,15 +64,20 @@ def test_node_batch_preserves_per_row_properties() -> None: def test_relationship_batch_flushes_after_threshold_and_respects_node_flush() -> None: ingestor, cursor_mock = _create_ingestor_with_mocked_connection() + col = MagicMock() + col.name = "created" + cursor_mock.description = [col] + 
cursor_mock.fetchall.return_value = [(1,), (1,)] + with patch.object( - ingestor, "flush_nodes", wraps=ingestor.flush_nodes + MemgraphIngestor, "flush_nodes", wraps=ingestor.flush_nodes ) as flush_nodes_spy: ingestor.ensure_relationship_batch( ("Module", "qualified_name", "proj.module1"), "CONTAINS_FILE", ("File", "path", "file1"), ) - assert len(ingestor.relationship_buffer) == 1 + assert ingestor._rel_count == 1 cursor_mock.execute.assert_not_called() ingestor.ensure_relationship_batch( @@ -83,7 +88,7 @@ def test_relationship_batch_flushes_after_threshold_and_respects_node_flush() -> assert flush_nodes_spy.call_count == 1 - assert len(ingestor.relationship_buffer) == 0 + assert ingestor._rel_count == 0 cursor_mock.execute.assert_called_once() executed_query = cursor_mock.execute.call_args[0][0] assert "UNWIND $batch" in executed_query diff --git a/codebase_rag/tests/test_memory_limit.py b/codebase_rag/tests/test_memory_limit.py new file mode 100644 index 000000000..8fed07bad --- /dev/null +++ b/codebase_rag/tests/test_memory_limit.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import pytest + +from codebase_rag.services.graph_service import _apply_memory_limit + + +class TestApplyMemoryLimit: + def test_appends_hint_to_simple_query(self) -> None: + result = _apply_memory_limit("MATCH (n) RETURN n;", 4096) + assert result == "MATCH (n) RETURN n QUERY MEMORY LIMIT 4096 MB;" + + def test_appends_hint_when_no_trailing_semicolon(self) -> None: + result = _apply_memory_limit("MATCH (n) RETURN n", 256) + assert result == "MATCH (n) RETURN n QUERY MEMORY LIMIT 256 MB;" + + def test_preserves_existing_hint(self) -> None: + query = "MATCH (n) RETURN n QUERY MEMORY LIMIT 1024 MB;" + assert _apply_memory_limit(query, 4096) == query + + def test_preserves_existing_hint_case_insensitive(self) -> None: + query = "MATCH (n) RETURN n query memory limit 1024 mb;" + assert _apply_memory_limit(query, 4096) == query + + def test_handles_trailing_whitespace(self) -> None: 
+ result = _apply_memory_limit("MATCH (n) RETURN n;\n ", 4096) + assert result == "MATCH (n) RETURN n QUERY MEMORY LIMIT 4096 MB;" + + def test_handles_whitespace_before_semicolon(self) -> None: + result = _apply_memory_limit("MATCH (n) RETURN n ;", 4096) + assert result == "MATCH (n) RETURN n QUERY MEMORY LIMIT 4096 MB;" + + def test_handles_multiline_query(self) -> None: + query = "MATCH (a)-[:CALLS*1..6]->(b)\nRETURN a, b;" + result = _apply_memory_limit(query, 2048) + assert result == ( + "MATCH (a)-[:CALLS*1..6]->(b)\nRETURN a, b QUERY MEMORY LIMIT 2048 MB;" + ) + + @pytest.mark.parametrize("mb", [128, 256, 1024, 4096, 16384]) + def test_uses_configured_megabytes(self, mb: int) -> None: + result = _apply_memory_limit("MATCH (n) RETURN n;", mb) + assert f"QUERY MEMORY LIMIT {mb} MB" in result diff --git a/codebase_rag/tests/test_method_calls_caller_attribution.py b/codebase_rag/tests/test_method_calls_caller_attribution.py new file mode 100644 index 000000000..6c4cd2a01 --- /dev/null +++ b/codebase_rag/tests/test_method_calls_caller_attribution.py @@ -0,0 +1,679 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING +from unittest.mock import MagicMock + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import get_relationships, run_updater + +if TYPE_CHECKING: + pass + + +def _get_method_caller_calls(mock_ingestor: MagicMock) -> list: + return [ + c + for c in get_relationships(mock_ingestor, cs.RelationshipType.CALLS) + if c.args[0][0] == cs.NodeLabel.METHOD + ] + + +def _get_function_caller_calls(mock_ingestor: MagicMock) -> list: + return [ + c + for c in get_relationships(mock_ingestor, cs.RelationshipType.CALLS) + if c.args[0][0] == cs.NodeLabel.FUNCTION + ] + + +def _get_module_caller_calls(mock_ingestor: MagicMock) -> list: + return [ + c + for c in get_relationships(mock_ingestor, cs.RelationshipType.CALLS) + if c.args[0][0] == cs.NodeLabel.MODULE + ] + + +def _caller_qn(call: 
MagicMock) -> str: + return call.args[0][2] + + +def _callee_qn(call: MagicMock) -> str: + return call.args[2][2] + + +class TestCppMethodCallerAttribution: + def test_simple_class_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "player.cpp").write_text( + encoding="utf-8", + data=""" +class Player { +public: + void handleArtifact() {} + + void handleArtifactWatcherCb() { + handleArtifact(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + callees = [_callee_qn(c) for c in method_calls] + + watcher_callers = [qn for qn in callers if "handleArtifactWatcherCb" in qn] + assert len(watcher_callers) >= 1 + + artifact_callees = [qn for qn in callees if "handleArtifact" in qn] + assert len(artifact_callees) >= 1 + + def test_struct_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "sensor.cpp").write_text( + encoding="utf-8", + data=""" +struct Sensor { + int readRaw() { return 42; } + + int readCalibrated() { + return readRaw() * 2; + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + callees = [_callee_qn(c) for c in method_calls] + + assert any("readCalibrated" in qn for qn in callers) + assert any("readRaw" in qn for qn in callees) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "calc.cpp").write_text( + encoding="utf-8", + data=""" +class Calculator { +public: + int add(int a, int b) { return a + b; } + int multiply(int a, int b) { return a * b; } + + int compute(int x) { + int sum = add(x, 1); + return multiply(sum, 2); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + 
method_calls = _get_method_caller_calls(mock_ingestor) + compute_calls = [c for c in method_calls if "compute" in _caller_qn(c)] + compute_callees = {_callee_qn(c) for c in compute_calls} + + assert any("add" in qn for qn in compute_callees) + assert any("multiply" in qn for qn in compute_callees) + + def test_constructor_body_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "init.cpp").write_text( + encoding="utf-8", + data=""" +class Engine { +public: + void initialize() {} + + Engine() { + initialize(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callees = [_callee_qn(c) for c in method_calls] + assert any("initialize" in qn for qn in callees) + + def test_method_calling_free_function_has_method_caller( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "mixed.cpp").write_text( + encoding="utf-8", + data=""" +void freeHelper() {} + +class Service { +public: + void process() { + freeHelper(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + process_calls = [c for c in method_calls if "process" in _caller_qn(c)] + assert len(process_calls) >= 1 + + def test_multiple_classes_in_one_file( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "multi.cpp").write_text( + encoding="utf-8", + data=""" +class Alpha { +public: + void step1() {} + void run() { step1(); } +}; + +class Beta { +public: + void step2() {} + void execute() { step2(); } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = {_caller_qn(c) for c in method_calls} + callees = {_callee_qn(c) for c in method_calls} + + assert any("run" in qn for qn in callers) + assert any("execute" in qn for qn in callers) + assert 
any("step1" in qn for qn in callees) + assert any("step2" in qn for qn in callees) + + def test_method_with_parameters( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "params.cpp").write_text( + encoding="utf-8", + data=""" +class Parser { +public: + int parse(const char* input, int length) { return 0; } + + int parseFile(const char* path) { + return parse(path, 100); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("parseFile" in qn for qn in callers) + + def test_virtual_method_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "virtual.cpp").write_text( + encoding="utf-8", + data=""" +class Base { +public: + virtual void onEvent() {} + + void dispatch() { + onEvent(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + dispatch_calls = [c for c in method_calls if "dispatch" in _caller_qn(c)] + assert len(dispatch_calls) >= 1 + assert any("onEvent" in _callee_qn(c) for c in dispatch_calls) + + def test_method_calling_another_via_this_pointer( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "this_ptr.cpp").write_text( + encoding="utf-8", + data=""" +class Widget { +public: + void repaint() {} + + void resize(int w, int h) { + this->repaint(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("resize" in qn for qn in callers) + + def test_deeply_nested_call_chain( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "chain.cpp").write_text( + encoding="utf-8", + data=""" +class Pipeline { +public: + int validate() { return 1; } + int transform(int 
x) { return x * 2; } + int output(int x) { return x; } + + int run() { + int v = validate(); + int t = transform(v); + return output(t); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + run_calls = [c for c in method_calls if "run" in _caller_qn(c)] + run_callees = {_callee_qn(c) for c in run_calls} + + assert any("validate" in qn for qn in run_callees) + assert any("transform" in qn for qn in run_callees) + assert any("output" in qn for qn in run_callees) + + def test_static_method_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "static.cpp").write_text( + encoding="utf-8", + data=""" +class Factory { +public: + static int create() { return 0; } + + static int build() { + return create(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("build" in qn for qn in callers) + + def test_const_method_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "const.cpp").write_text( + encoding="utf-8", + data=""" +class Container { +public: + int size() const { return 10; } + + bool empty() const { + return size() == 0; + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("empty" in qn for qn in callers) + + +class TestPythonMethodCallerAttribution: + def test_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "service.py").write_text( + encoding="utf-8", + data=""" +class Service: + def validate(self): + pass + + def process(self): + self.validate() +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PYTHON) + + method_calls = 
_get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "pipeline.py").write_text( + encoding="utf-8", + data=""" +class Pipeline: + def step1(self): + pass + + def step2(self): + self.step1() + + def run(self): + self.step2() +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PYTHON) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = {_caller_qn(c) for c in method_calls} + assert any("step2" in qn for qn in callers) + assert any("run" in qn for qn in callers) + + def test_dunder_init_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "init.py").write_text( + encoding="utf-8", + data=""" +class Config: + def _load(self): + pass + + def __init__(self): + self._load() +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PYTHON) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("__init__" in qn for qn in callers) + + +class TestJavaScriptMethodCallerAttribution: + def test_class_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "service.js").write_text( + encoding="utf-8", + data=""" +class Service { + validate() { + return true; + } + + process() { + return this.validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JS) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_constructor_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "widget.js").write_text( + encoding="utf-8", + data=""" +class Widget { + setup() {} + + constructor() { + this.setup(); + } 
+} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JS) + + method_calls = _get_method_caller_calls(mock_ingestor) + callees = [_callee_qn(c) for c in method_calls] + assert any("setup" in qn for qn in callees) + + +class TestTypeScriptMethodCallerAttribution: + def test_class_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "handler.ts").write_text( + encoding="utf-8", + data=""" +class Handler { + private validate(): boolean { + return true; + } + + public handle(): void { + this.validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.TS) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("handle" in qn for qn in callers) + + def test_multiple_methods_with_types( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "repo.ts").write_text( + encoding="utf-8", + data=""" +class Repository { + find(id: number): string { return ""; } + validate(data: string): boolean { return true; } + + save(id: number): boolean { + const item = this.find(id); + return this.validate(item); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.TS) + + method_calls = _get_method_caller_calls(mock_ingestor) + save_calls = [c for c in method_calls if "save" in _caller_qn(c)] + save_callees = {_callee_qn(c) for c in save_calls} + assert any("find" in qn for qn in save_callees) + assert any("validate" in qn for qn in save_callees) + + +class TestJavaMethodCallerAttribution: + def test_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "Service.java").write_text( + encoding="utf-8", + data=""" +public class Service { + private boolean validate() { + return true; + } + + public void process() { + validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JAVA) + + method_calls = 
_get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_constructor_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "Config.java").write_text( + encoding="utf-8", + data=""" +public class Config { + private void loadDefaults() {} + + public Config() { + loadDefaults(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JAVA) + + method_calls = _get_method_caller_calls(mock_ingestor) + callees = [_callee_qn(c) for c in method_calls] + assert any("loadDefaults" in qn for qn in callees) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "Calculator.java").write_text( + encoding="utf-8", + data=""" +public class Calculator { + public int add(int a, int b) { return a + b; } + public int multiply(int a, int b) { return a * b; } + + public int compute(int x) { + int sum = add(x, 1); + return multiply(sum, 2); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JAVA) + + method_calls = _get_method_caller_calls(mock_ingestor) + compute_calls = [c for c in method_calls if "compute" in _caller_qn(c)] + compute_callees = {_callee_qn(c) for c in compute_calls} + assert any("add" in qn for qn in compute_callees) + assert any("multiply" in qn for qn in compute_callees) + + +class TestRustMethodCallerAttribution: + def test_impl_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "lib.rs").write_text( + encoding="utf-8", + data=""" +struct Player { + health: i32, +} + +impl Player { + fn heal(&mut self) { + self.health += 10; + } + + fn take_damage(&mut self, amount: i32) { + self.health -= amount; + self.heal(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.RUST) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = 
[_caller_qn(c) for c in method_calls] + assert any("take_damage" in qn for qn in callers) + + def test_multiple_impl_methods( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "lib.rs").write_text( + encoding="utf-8", + data=""" +struct Pipeline; + +impl Pipeline { + fn validate(&self) -> bool { true } + fn transform(&self, x: i32) -> i32 { x * 2 } + + fn run(&self, input: i32) -> i32 { + if self.validate() { + self.transform(input) + } else { + 0 + } + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.RUST) + + method_calls = _get_method_caller_calls(mock_ingestor) + run_calls = [c for c in method_calls if "run" in _caller_qn(c)] + assert len(run_calls) >= 1 + + +class TestPhpMethodCallerAttribution: + def test_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "service.php").write_text( + encoding="utf-8", + data="""validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PHP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "pipeline.php").write_text( + encoding="utf-8", + data="""step1(); + } + + public function run() { + $this->step2(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PHP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = {_caller_qn(c) for c in method_calls} + assert any("step2" in qn for qn in callers) + assert any("run" in qn for qn in callers) diff --git a/codebase_rag/tests/test_model_switching.py b/codebase_rag/tests/test_model_switching.py index 52fb1e632..14217f0d1 100644 --- a/codebase_rag/tests/test_model_switching.py +++ b/codebase_rag/tests/test_model_switching.py @@ -235,6 +235,201 @@ async def 
test_model_override_none_by_default(self) -> None: assert kwargs.get("model") is None +class TestAgentLoopUserPromptOnResume: + @staticmethod + def _make_response(output: object) -> MagicMock: + response = MagicMock() + response.output = output + response.new_messages.return_value = [] + return response + + @staticmethod + def _patches(): + from pydantic_ai import DeferredToolResults + + return ( + patch("codebase_rag.main.app_context"), + patch("codebase_rag.main.log_session_event"), + patch( + "codebase_rag.main._process_tool_approvals", + new=AsyncMock(return_value=DeferredToolResults()), + ), + patch("codebase_rag.main._refresh_context_tokens", new=AsyncMock()), + patch("codebase_rag.main._thinking_with_status_bar"), + ) + + @pytest.mark.asyncio + async def test_user_prompt_not_resent_after_deferred_tool_approval(self) -> None: + from pydantic_ai import DeferredToolRequests + + from codebase_rag.main import _run_agent_response_loop + from codebase_rag.types_defs import CHAT_LOOP_UI, ConfirmationToolNames + + mock_agent = MagicMock() + mock_agent.run = AsyncMock( + side_effect=[ + self._make_response(DeferredToolRequests(approvals=[])), + self._make_response("Done"), + ] + ) + tool_names = ConfirmationToolNames( + replace_code="replace", create_file="create", shell_command="shell" + ) + ctx, log_evt, approvals, refresh, status = self._patches() + + with ctx as mock_ctx, log_evt, approvals, refresh, status: + mock_ctx.console.print = MagicMock() + mock_ctx.session.cancelled = False + + await _run_agent_response_loop( + mock_agent, + [], + "delete first and add two", + CHAT_LOOP_UI, + tool_names, + ) + + assert mock_agent.run.call_count == 2 + assert mock_agent.run.call_args_list[0][0][0] == "delete first and add two" + assert mock_agent.run.call_args_list[1][0][0] is None + + @pytest.mark.asyncio + async def test_user_prompt_not_resent_across_multiple_deferred_rounds( + self, + ) -> None: + from pydantic_ai import DeferredToolRequests + + from codebase_rag.main 
import _run_agent_response_loop + from codebase_rag.types_defs import CHAT_LOOP_UI, ConfirmationToolNames + + mock_agent = MagicMock() + mock_agent.run = AsyncMock( + side_effect=[ + self._make_response(DeferredToolRequests(approvals=[])), + self._make_response(DeferredToolRequests(approvals=[])), + self._make_response(DeferredToolRequests(approvals=[])), + self._make_response("All done"), + ] + ) + tool_names = ConfirmationToolNames( + replace_code="replace", create_file="create", shell_command="shell" + ) + ctx, log_evt, approvals, refresh, status = self._patches() + + with ctx as mock_ctx, log_evt, approvals, refresh, status: + mock_ctx.console.print = MagicMock() + mock_ctx.session.cancelled = False + + await _run_agent_response_loop( + mock_agent, [], "multi-step task", CHAT_LOOP_UI, tool_names + ) + + assert mock_agent.run.call_count == 4 + assert mock_agent.run.call_args_list[0][0][0] == "multi-step task" + for call in mock_agent.run.call_args_list[1:]: + assert call[0][0] is None + + @pytest.mark.asyncio + async def test_user_prompt_passed_on_first_call_when_no_deferred(self) -> None: + from codebase_rag.main import _run_agent_response_loop + from codebase_rag.types_defs import CHAT_LOOP_UI, ConfirmationToolNames + + mock_agent = MagicMock() + mock_agent.run = AsyncMock(return_value=self._make_response("Hello")) + tool_names = ConfirmationToolNames( + replace_code="replace", create_file="create", shell_command="shell" + ) + ctx, log_evt, approvals, refresh, status = self._patches() + + with ctx as mock_ctx, log_evt, approvals, refresh, status: + mock_ctx.console.print = MagicMock() + mock_ctx.session.cancelled = False + + await _run_agent_response_loop( + mock_agent, [], "just a question", CHAT_LOOP_UI, tool_names + ) + + assert mock_agent.run.call_count == 1 + assert mock_agent.run.call_args_list[0][0][0] == "just a question" + assert mock_agent.run.call_args_list[0][1].get("deferred_tool_results") is None + + @pytest.mark.asyncio + async def 
test_multimodal_user_prompt_not_resent_after_approval(self) -> None: + from pydantic_ai import BinaryContent, DeferredToolRequests + + from codebase_rag.main import _run_agent_response_loop + from codebase_rag.types_defs import CHAT_LOOP_UI, ConfirmationToolNames + + multimodal_prompt = [ + "look at this image", + BinaryContent(data=b"\x89PNG\r\n", media_type="image/png"), + ] + mock_agent = MagicMock() + mock_agent.run = AsyncMock( + side_effect=[ + self._make_response(DeferredToolRequests(approvals=[])), + self._make_response("Analyzed"), + ] + ) + tool_names = ConfirmationToolNames( + replace_code="replace", create_file="create", shell_command="shell" + ) + ctx, log_evt, approvals, refresh, status = self._patches() + + with ctx as mock_ctx, log_evt, approvals, refresh, status: + mock_ctx.console.print = MagicMock() + mock_ctx.session.cancelled = False + + await _run_agent_response_loop( + mock_agent, [], multimodal_prompt, CHAT_LOOP_UI, tool_names + ) + + assert mock_agent.run.call_count == 2 + assert mock_agent.run.call_args_list[0][0][0] is multimodal_prompt + assert mock_agent.run.call_args_list[1][0][0] is None + + @pytest.mark.asyncio + async def test_deferred_results_passed_only_after_approval(self) -> None: + from pydantic_ai import DeferredToolRequests, DeferredToolResults + + from codebase_rag.main import _run_agent_response_loop + from codebase_rag.types_defs import CHAT_LOOP_UI, ConfirmationToolNames + + approved = DeferredToolResults() + mock_agent = MagicMock() + mock_agent.run = AsyncMock( + side_effect=[ + self._make_response(DeferredToolRequests(approvals=[])), + self._make_response("Done"), + ] + ) + tool_names = ConfirmationToolNames( + replace_code="replace", create_file="create", shell_command="shell" + ) + + with ( + patch("codebase_rag.main.app_context") as mock_ctx, + patch("codebase_rag.main.log_session_event"), + patch( + "codebase_rag.main._process_tool_approvals", + new=AsyncMock(return_value=approved), + ), + 
patch("codebase_rag.main._refresh_context_tokens", new=AsyncMock()), + patch("codebase_rag.main._thinking_with_status_bar"), + ): + mock_ctx.console.print = MagicMock() + mock_ctx.session.cancelled = False + + await _run_agent_response_loop( + mock_agent, [], "edit file", CHAT_LOOP_UI, tool_names + ) + + first_kwargs = mock_agent.run.call_args_list[0][1] + second_kwargs = mock_agent.run.call_args_list[1][1] + assert first_kwargs.get("deferred_tool_results") is None + assert second_kwargs.get("deferred_tool_results") is approved + + class TestCommandConstants: def test_model_command_prefix(self) -> None: assert cs.MODEL_COMMAND_PREFIX == "/model" diff --git a/codebase_rag/tests/test_module_call_attribution.py b/codebase_rag/tests/test_module_call_attribution.py new file mode 100644 index 000000000..9d635e0ee --- /dev/null +++ b/codebase_rag/tests/test_module_call_attribution.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import run_updater + + +def _calls(mock_ingestor: MagicMock) -> list[tuple[str, str, str]]: + # (H) CALLS edges as (caller_label, caller_qn, callee_qn). 
+ out: list[tuple[str, str, str]] = [] + for c in mock_ingestor.ensure_relationship_batch.call_args_list: + if c.args[1] == cs.RelationshipType.CALLS: + caller_label, _caller_key, caller_qn = c.args[0] + _callee_label, _callee_key, callee_qn = c.args[2] + out.append((caller_label, caller_qn, callee_qn)) + return out + + +def _module_callees(calls: list[tuple[str, str, str]]) -> set[str]: + return { + callee.rsplit(cs.SEPARATOR_DOT, 1)[-1] + for label, _caller, callee in calls + if label == cs.NodeLabel.MODULE + } + + +class TestModuleCallAttribution: + def test_nested_call_not_attributed_to_module( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "app.py").write_text( + "def main():\n" + " used_by_main()\n" + "\n" + "\n" + "def used_by_main():\n" + " return 1\n" + "\n" + "\n" + 'if __name__ == "__main__":\n' + " main()\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + calls = _calls(mock_ingestor) + module_callees = _module_callees(calls) + + # (H) the function-body call is attributed to the function, not the module + assert any( + caller.endswith(".main") and callee.endswith(".used_by_main") + for _label, caller, callee in calls + ) + # (H) used_by_main is only called inside main(), never at module top level + assert "used_by_main" not in module_callees + + def test_top_level_call_is_attributed_to_module( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "app.py").write_text( + "def main():\n" + " used_by_main()\n" + "\n" + "\n" + "def used_by_main():\n" + " return 1\n" + "\n" + "\n" + 'if __name__ == "__main__":\n' + " main()\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + module_callees = _module_callees(_calls(mock_ingestor)) + + # (H) the `if __name__ == "__main__": main()` call runs at module load + assert "main" in module_callees + + def test_bare_module_level_call_attributed_to_module( + self, temp_repo: 
Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "app.py").write_text( + "def setup():\n" + " return 1\n" + "\n" + "\n" + "def helper():\n" + " return 2\n" + "\n" + "\n" + "VALUE = setup()\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + module_callees = _module_callees(_calls(mock_ingestor)) + + assert "setup" in module_callees + # (H) helper is never called at all -> no module edge to it + assert "helper" not in module_callees + + def test_default_argument_call_attributed_to_module( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a default-argument expression runs at module-load (definition) time, + # (H) not when the function body executes, so it is a module-level call. + (temp_repo / "app.py").write_text( + "def make_default():\n" + " return 1\n" + "\n" + "\n" + "def with_default(x=make_default()):\n" + " return x\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="python") + module_callees = _module_callees(_calls(mock_ingestor)) + + assert "make_default" in module_callees + + def test_cpp_file_scope_initializer_call_attributed_to_module( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a C++ file-scope initializer runs at load time, so its call is + # (H) module-attributed; a call inside a function body is not. 
+ (temp_repo / "app.cpp").write_text( + "int nested_cpp() { return 1; }\n" + "int top_cpp() { return 2; }\n" + "int run_cpp() { return nested_cpp(); }\n" + "int module_value = top_cpp();\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="cpp") + calls = _calls(mock_ingestor) + module_callees = _module_callees(calls) + + assert "top_cpp" in module_callees + assert "nested_cpp" not in module_callees + assert any( + caller.endswith(".run_cpp") and callee.endswith(".nested_cpp") + for _label, caller, callee in calls + ) diff --git a/codebase_rag/tests/test_module_qn_language_collision.py b/codebase_rag/tests/test_module_qn_language_collision.py new file mode 100644 index 000000000..5df31da32 --- /dev/null +++ b/codebase_rag/tests/test_module_qn_language_collision.py @@ -0,0 +1,79 @@ +# (H) Regression: two source files that share a basename but differ by extension +# (H) (foo.py and foo.cpp) must get distinct module qualified names. Path-based +# (H) module naming strips the extension, so without disambiguation both map to +# (H) the same module qn, cascading into identical class/method qns that collapse +# (H) under the graph's qualified_name unique constraint (dropping one file's defs). 
+from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import KEY_PATH, KEY_QUALIFIED_NAME, NodeLabel +from codebase_rag.tests.conftest import create_and_run_updater, get_nodes + + +def _make_project(temp_repo: Path) -> Path: + project_path = temp_repo / "mixedmod" + (project_path / "pkg").mkdir(parents=True) + (project_path / "pkg" / "shape.py").write_text( + encoding="utf-8", + data="class Shape:\n def area(self):\n return 1\n", + ) + (project_path / "pkg" / "shape.cpp").write_text( + encoding="utf-8", + data="class Shape {\npublic:\n int area() {\n return 2;\n }\n};\n", + ) + return project_path + + +def _qns_by_path( + mock_ingestor: MagicMock, label: NodeLabel, name: str +) -> dict[str, str]: + out: dict[str, str] = {} + for node in get_nodes(mock_ingestor, label): + props = node[0][1] + qn = str(props.get(KEY_QUALIFIED_NAME)) + if qn.rsplit(".", 1)[-1] == name: + out[str(props.get(KEY_PATH))] = qn + return out + + +def test_same_stem_files_get_distinct_module_qns( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = _make_project(temp_repo) + create_and_run_updater(project, mock_ingestor, skip_if_missing="cpp") + + modules = { + str(node[0][1].get(KEY_PATH)): str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.MODULE) + } + py_mod = modules.get("pkg/shape.py") + cpp_mod = modules.get("pkg/shape.cpp") + assert py_mod and cpp_mod, f"both module nodes expected: {modules}" + assert py_mod != cpp_mod, f"module qn collision: {py_mod}" + + +def test_same_stem_methods_do_not_collide( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = _make_project(temp_repo) + create_and_run_updater(project, mock_ingestor, skip_if_missing="cpp") + + area = _qns_by_path(mock_ingestor, NodeLabel.METHOD, "area") + py_area = area.get("pkg/shape.py") + cpp_area = area.get("pkg/shape.cpp") + assert py_area and cpp_area, f"both area methods 
expected: {area}" + assert py_area != cpp_area, f"method qn collision across languages: {area}" + + # (H) The method qn must derive from its own (disambiguated) module qn, not a + # (H) bare recomputed prefix patched up by register_unique_qn's @N dedup. + modules = { + str(node[0][1].get(KEY_PATH)): str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.MODULE) + } + py_mod = modules["pkg/shape.py"] + assert py_area.startswith(f"{py_mod}."), ( + f"python method qn {py_area} not derived from its module {py_mod}" + ) + assert "@" not in py_area, f"method qn collided and was @N-deduped: {py_area}" diff --git a/codebase_rag/tests/test_multi_project.py b/codebase_rag/tests/test_multi_project.py new file mode 100644 index 000000000..3755bd207 --- /dev/null +++ b/codebase_rag/tests/test_multi_project.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag.cli import _resolve_active_projects, app +from codebase_rag.prompts import build_rag_orchestrator_prompt + +runner = CliRunner() + + +class TestResolveActiveProjects: + def test_returns_default_when_no_projects_flag(self) -> None: + assert _resolve_active_projects(None, "default_proj") == ["default_proj"] + + def test_returns_default_for_empty_string(self) -> None: + assert _resolve_active_projects("", "default_proj") == ["default_proj"] + + def test_single_project_in_flag(self) -> None: + assert _resolve_active_projects("only_one", "default_proj") == ["only_one"] + + def test_multiple_projects_comma_separated(self) -> None: + assert _resolve_active_projects("a,b,c", "default_proj") == ["a", "b", "c"] + + def test_strips_whitespace(self) -> None: + assert _resolve_active_projects(" a , b ,c ", "default_proj") == ["a", "b", "c"] + + def test_drops_empty_entries(self) -> None: + assert 
_resolve_active_projects("a,,b,", "default_proj") == ["a", "b"] + + def test_all_empty_falls_back_to_default(self) -> None: + assert _resolve_active_projects(",,", "default_proj") == ["default_proj"] + + +class TestPromptActiveProjectsBlock: + def test_no_projects_lists_list_projects_hint(self) -> None: + prompt = build_rag_orchestrator_prompt([], active_projects=None) + assert "list_projects" in prompt + assert "Project Scope" in prompt + + def test_single_project_mentions_starts_with(self) -> None: + prompt = build_rag_orchestrator_prompt([], active_projects=["only_one"]) + assert "only_one" in prompt + assert "STARTS WITH" in prompt + + def test_multiple_projects_lists_all(self) -> None: + prompt = build_rag_orchestrator_prompt([], active_projects=["a", "b", "c"]) + for name in ["a", "b", "c"]: + assert f"`{name}`" in prompt or f"'{name}." in prompt + assert "STARTS WITH 'a.'" in prompt + assert "STARTS WITH 'b.'" in prompt + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +@pytest.fixture +def mock_sync_path() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli._run_graph_sync"): + yield + + +@pytest.fixture +def mock_validate_models() -> Generator[None, None, None]: + with patch("codebase_rag.cli._update_and_validate_models"): + yield + + +def test_start_passes_projects_to_single_query( + mock_memgraph_connect: MagicMock, + mock_sync_path: None, + mock_validate_models: None, + tmp_path: Path, +) -> None: + with patch("codebase_rag.cli.main_single_query") as mock_single: + result = runner.invoke( + app, + [ + "start", + "--repo-path", + str(tmp_path), + "--projects", + "alpha,beta", + "--ask-agent", + "hi", + "--no-sync", + ], + ) + assert 
result.exit_code == 0, result.output + mock_single.assert_called_once() + assert mock_single.call_args.kwargs["active_projects"] == ["alpha", "beta"] + + +def test_start_default_projects_uses_derived_name( + mock_memgraph_connect: MagicMock, + mock_sync_path: None, + mock_validate_models: None, + tmp_path: Path, +) -> None: + with patch("codebase_rag.cli.main_single_query") as mock_single: + result = runner.invoke( + app, + [ + "start", + "--repo-path", + str(tmp_path), + "--ask-agent", + "hi", + "--no-sync", + ], + ) + assert result.exit_code == 0, result.output + mock_single.assert_called_once() + active = mock_single.call_args.kwargs["active_projects"] + assert len(active) == 1 + assert "__" in active[0] diff --git a/codebase_rag/tests/test_multiline_input_keybindings.py b/codebase_rag/tests/test_multiline_input_keybindings.py new file mode 100644 index 000000000..d41abe943 --- /dev/null +++ b/codebase_rag/tests/test_multiline_input_keybindings.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import pytest +from prompt_toolkit.application import create_app_session +from prompt_toolkit.input import create_pipe_input +from prompt_toolkit.output import DummyOutput + +from codebase_rag import constants as cs +from codebase_rag.main import get_multiline_input + +CTRL_J = "\x0a" +CTRL_E = "\x05" +CTRL_C = "\x03" +ENTER = "\r" + + +def _run_with_input(text: str) -> str: + with create_pipe_input() as inp: + inp.send_text(text) + with create_app_session(input=inp, output=DummyOutput()): + return get_multiline_input("Ask") + + +def test_ctrl_j_submits_buffer() -> None: + assert _run_with_input(f"hello{CTRL_J}") == "hello" + + +def test_ctrl_e_submits_buffer() -> None: + assert _run_with_input(f"hello{CTRL_E}") == "hello" + + +def test_ctrl_e_submits_after_multiline_with_enter() -> None: + assert _run_with_input(f"line1{ENTER}line2{CTRL_E}") == "line1\nline2" + + +def test_ctrl_j_submits_after_multiline_with_enter() -> None: + assert 
_run_with_input(f"line1{ENTER}line2{CTRL_J}") == "line1\nline2" + + +def test_result_is_stripped() -> None: + assert _run_with_input(f" padded {CTRL_E}") == "padded" + + +def test_ctrl_c_raises_keyboard_interrupt() -> None: + with pytest.raises(KeyboardInterrupt): + _run_with_input(f"abc{CTRL_C}") + + +def test_keybinding_enum_has_submit_shortcuts() -> None: + assert cs.KeyBinding.CTRL_J.value == "c-j" + assert cs.KeyBinding.CTRL_E.value == "c-e" + + +def test_hint_mentions_both_submit_shortcuts() -> None: + assert "Ctrl+J" in cs.MULTILINE_INPUT_HINT + assert "Ctrl+E" in cs.MULTILINE_INPUT_HINT diff --git a/codebase_rag/tests/test_nested_function_defines.py b/codebase_rag/tests/test_nested_function_defines.py new file mode 100644 index 000000000..e9b9694b2 --- /dev/null +++ b/codebase_rag/tests/test_nested_function_defines.py @@ -0,0 +1,129 @@ +# (H) Finding #2 from the evals/ harness: a function nested inside a METHOD was +# (H) attributed to the Module via DEFINES (flattened), producing false-positive +# (H) module-level edges. A nested function must be DEFINES'd by its enclosing +# (H) scope: the method for function-in-method, the function for function-in-function. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "nestproj" + +MODULE_SRC = """class C: + def find_x(self) -> int: + def dfs(n: int) -> int: + return n + + return dfs(1) + + +def outer() -> int: + def inner() -> int: + return 1 + + return inner() +""" + +_RelTuple = tuple[str, PropertyValue, str, str, PropertyValue] + + +class _Capture: + def __init__(self) -> None: + self.nodes: dict[tuple[str, PropertyValue], PropertyDict] = {} + self.rels: list[_RelTuple] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + uid = properties[cs.NODE_UNIQUE_CONSTRAINTS[label]] + self.nodes[(str(label), uid)] = dict(properties) + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append( + ( + str(from_spec[0]), + from_spec[2], + str(rel_type), + str(to_spec[0]), + to_spec[2], + ) + ) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _build(tmp_path: Path, src: str = MODULE_SRC) -> _Capture: + (tmp_path / "m.py").write_text(src) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return cap + + +def _defines_sources(cap: _Capture, target_suffix: str) -> list[tuple[str, str]]: + return [ + (from_label, str(from_val)) + for (from_label, from_val, rel_type, _tl, target) in cap.rels + if rel_type 
== cs.RelationshipType.DEFINES + and str(target).endswith(target_suffix) + ] + + +class TestNestedFunctionDefines: + def test_function_in_method_defined_by_method(self, tmp_path: Path) -> None: + cap = _build(tmp_path) + sources = _defines_sources(cap, ".find_x.dfs") + assert len(sources) == 1, sources + label, qn = sources[0] + assert label == cs.NodeLabel.METHOD, sources + assert qn.endswith(".C.find_x"), sources + + def test_function_in_function_defined_by_function(self, tmp_path: Path) -> None: + cap = _build(tmp_path) + sources = _defines_sources(cap, ".outer.inner") + assert len(sources) == 1, sources + label, qn = sources[0] + assert label == cs.NodeLabel.FUNCTION, sources + assert qn.endswith(".outer"), sources + + +CLASS_IN_METHOD_SRC = """class Holder: + def make(self) -> object: + class Local: + pass + + return Local() +""" + + +class TestNestedClassDefines: + def test_class_in_method_defined_by_method(self, tmp_path: Path) -> None: + cap = _build(tmp_path, CLASS_IN_METHOD_SRC) + sources = _defines_sources(cap, ".make.Local") + assert len(sources) == 1, sources + label, qn = sources[0] + assert label == cs.NodeLabel.METHOD, sources + assert qn.endswith(".Holder.make"), sources diff --git a/codebase_rag/tests/test_node_relationship_coverage.py b/codebase_rag/tests/test_node_relationship_coverage.py index e6af5fd05..00389af7a 100644 --- a/codebase_rag/tests/test_node_relationship_coverage.py +++ b/codebase_rag/tests/test_node_relationship_coverage.py @@ -136,18 +136,15 @@ def test_each_relationship_type_can_be_flushed( ingestor.conn = mock_conn - ingestor.relationship_buffer.append( - ( - (NodeLabel.MODULE.value, KEY_QUALIFIED_NAME, "module.test"), - rel_type.value, - (NodeLabel.FUNCTION.value, KEY_QUALIFIED_NAME, "module.test.func"), - None, - ) + ingestor.ensure_relationship_batch( + (NodeLabel.MODULE.value, KEY_QUALIFIED_NAME, "module.test"), + rel_type.value, + (NodeLabel.FUNCTION.value, KEY_QUALIFIED_NAME, "module.test.func"), ) 
ingestor.flush_relationships() mock_cursor.execute.assert_called_once() - assert ingestor.relationship_buffer == [] + assert ingestor._rel_count == 0 class TestUniqueKeyPropertyNames: @@ -230,10 +227,13 @@ def test_ensure_constraints_creates_all_constraints(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) executed_queries: list[str] = [] - def capture_query(query: str) -> None: + def capture_query(query: str, params: object = None) -> list[object]: executed_queries.append(query) + return [] - with patch.object(ingestor, "_execute_query", side_effect=capture_query): + with patch.object( + MemgraphIngestor, "_execute_query", side_effect=capture_query + ): ingestor.ensure_constraints() for label in NodeLabel: @@ -249,10 +249,13 @@ def test_ensure_constraints_creates_all_indexes(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) executed_queries: list[str] = [] - def capture_query(query: str) -> None: + def capture_query(query: str, params: object = None) -> list[object]: executed_queries.append(query) + return [] - with patch.object(ingestor, "_execute_query", side_effect=capture_query): + with patch.object( + MemgraphIngestor, "_execute_query", side_effect=capture_query + ): ingestor.ensure_constraints() for label in NodeLabel: diff --git a/codebase_rag/tests/test_operator_dispatch_resolution.py b/codebase_rag/tests/test_operator_dispatch_resolution.py new file mode 100644 index 000000000..6f4262552 --- /dev/null +++ b/codebase_rag/tests/test_operator_dispatch_resolution.py @@ -0,0 +1,126 @@ +# (H) L3 finding from the evals/ harness: Python operator syntax dispatches to dunder +# (H) methods at runtime: `k in reg` -> reg.__contains__, `reg[k]` -> reg.__getitem__, +# (H) `reg[k] = v` -> reg.__setitem__, `len(reg)` -> reg.__len__. cgr only extracts +# (H) call expressions, so these first-party method calls were never captured. 
They are +# (H) emitted only when the operand's type resolves to a first-party class that defines +# (H) the dunder, so builtin containers (dict/list) produce no spurious edges. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + "pkg/registry.py": ( + "class Registry:\n" + " def __contains__(self, key):\n return True\n\n" + " def __getitem__(self, key):\n return 1\n\n" + " def __setitem__(self, key, value):\n return None\n\n" + " def __len__(self):\n return 0\n" + ), + "pkg/user.py": ( + "from .registry import Registry\n\n\n" + "class User:\n" + " def __init__(self, reg: Registry) -> None:\n" + " self._reg = reg\n\n" + " def use(self, key):\n" + " if key in self._reg:\n" + " value = self._reg[key]\n" + " self._reg[key] = 1\n" + " return len(self._reg)\n\n" + " def builtin(self):\n" + " data = {}\n" + " data['x'] = 1\n" + " return data['x']\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in 
FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestOperatorDispatchResolution: + def test_contains_operator_dispatches_to_dunder(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + "proj.pkg.registry.Registry.__contains__", + ) in calls, calls + + def test_subscript_read_dispatches_to_getitem(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + "proj.pkg.registry.Registry.__getitem__", + ) in calls, calls + + def test_subscript_write_dispatches_to_setitem(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + "proj.pkg.registry.Registry.__setitem__", + ) in calls, calls + + def test_len_dispatches_to_dunder(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + "proj.pkg.registry.Registry.__len__", + ) in calls, calls + + def test_builtin_container_produces_no_dunder_edge(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + dunder_targets = { + to for (frm, to) in calls if frm == "proj.pkg.user.User.builtin" + } + assert dunder_targets == set(), dunder_targets diff --git a/codebase_rag/tests/test_oracle_nested_defs.py b/codebase_rag/tests/test_oracle_nested_defs.py new file mode 100644 index 000000000..e770dd1a6 --- /dev/null +++ b/codebase_rag/tests/test_oracle_nested_defs.py @@ -0,0 +1,46 @@ +# (H) Covers the L1 ast oracle (evals/ast_oracle.py): functions defined inside an +# (H) except handler or a match/case block must be captured. 
cgr captures these +# (H) function-local defs, so an oracle that skips them produces spurious Function +# (H) false positives (e.g. thrift's sslcompat.py `def match` inside `except`). +from __future__ import annotations + +from pathlib import Path + +from evals.ast_oracle import extract_oracle_graph + +SRC = """\ +def with_except(): + try: + import something + except ImportError: + def fallback_in_except(): + return 1 + return fallback_in_except + + +def with_match(value): + match value: + case 1: + def handler_in_case(): + return 2 + return handler_in_case + case _: + return None +""" + + +def _function_names(target: Path) -> set[str]: + graph = extract_oracle_graph(target, "proj") + return {node.name for node in graph.nodes.values() if node.key.kind == "Function"} + + +def test_oracle_captures_function_in_except_handler(tmp_path: Path) -> None: + (tmp_path / "mod.py").write_text(SRC, encoding="utf-8") + names = _function_names(tmp_path) + assert "fallback_in_except" in names, names + + +def test_oracle_captures_function_in_match_case(tmp_path: Path) -> None: + (tmp_path / "mod.py").write_text(SRC, encoding="utf-8") + names = _function_names(tmp_path) + assert "handler_in_case" in names, names diff --git a/codebase_rag/tests/test_permission_mode.py b/codebase_rag/tests/test_permission_mode.py new file mode 100644 index 000000000..f660b4a51 --- /dev/null +++ b/codebase_rag/tests/test_permission_mode.py @@ -0,0 +1,20 @@ +from codebase_rag.constants import PermissionMode +from codebase_rag.models import SessionState + + +class TestSessionPermissionMode: + def test_default_mode_is_normal(self) -> None: + state = SessionState() + assert state.permission_mode == PermissionMode.NORMAL + assert state.is_yolo() is False + + def test_cycle_toggles_to_yolo(self) -> None: + state = SessionState() + assert state.cycle_permission_mode() == PermissionMode.YOLO + assert state.is_yolo() is True + + def test_cycle_toggles_back_to_normal(self) -> None: + state = SessionState() + 
state.cycle_permission_mode() + assert state.cycle_permission_mode() == PermissionMode.NORMAL + assert state.is_yolo() is False diff --git a/codebase_rag/tests/test_php_containment_oracle.py b/codebase_rag/tests/test_php_containment_oracle.py new file mode 100644 index 000000000..08a38bf08 --- /dev/null +++ b/codebase_rag/tests/test_php_containment_oracle.py @@ -0,0 +1,66 @@ +# (H) Covers PHP containment-edge validation: cgr's DEFINES (file module -> +# (H) every named type and top-level function) and DEFINES_METHOD (class/ +# (H) interface/trait/enum -> method) edges are graded against the independent +# (H) php-parser oracle, joined on (kind, file, line). Exercises an interface, +# (H) a trait, an enum with a method, a class, and a free function. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_php_graph +from evals.oracles import php_oracle_available, run_php_oracle +from evals.score import score_edge_types + +PHP_SRC = """\ + None: + if not php_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.PHP not in load_parsers()[0]: + pytest.skip("php parser not available") + + +def test_cgr_matches_php_parser_oracle_on_containment_edges(tmp_path: Path) -> None: + _require_php() + project = tmp_path / "php_edge" + project.mkdir() + (project / "lib.php").write_text(PHP_SRC, encoding="utf-8") + + cgr = extract_cgr_php_graph(project, project.name) + oracle = run_php_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.DEFINES.value, + cs.RelationshipType.DEFINES_METHOD.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and 
row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_php_functions.py b/codebase_rag/tests/test_php_functions.py new file mode 100644 index 000000000..992d5c900 --- /dev/null +++ b/codebase_rag/tests/test_php_functions.py @@ -0,0 +1,153 @@ +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_relationships +from codebase_rag.types_defs import NodeType + + +def test_php_function_discovery(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_functions_test" + project_path.mkdir() + + (project_path / "example.php").write_text( + encoding="utf-8", + data="""value = 0; + } + + public function getValue() { + return $this->value; + } +} + +interface MyInterface { + public function doSomething(); +} + +enum Status { + case Active; + case Inactive; +} + +function standaloneFunction() { + $obj = new MyPhpClass(); + return $obj->getValue(); +} +""", + ) + + parsers, queries = load_parsers() + assert "php" in parsers, "PHP parser should be available" + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + created_functions = [ + c + for c in mock_ingestor.ensure_node_batch.call_args_list + if c[0][0] == NodeType.FUNCTION + ] + fn_qns = {c[0][1]["qualified_name"] for c in created_functions} + + assert any(qn.endswith(".standaloneFunction") for qn in fn_qns), fn_qns + + call_rels = get_relationships(mock_ingestor, "CALLS") + assert len(call_rels) >= 1 + + +def test_php_class_discovery(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_class_test" + project_path.mkdir() + + (project_path / "models.php").write_text( + encoding="utf-8", + data=""" None: + project_path = temp_repo / "php_calls_test" + project_path.mkdir() + + 
(project_path / "service.php").write_text( + encoding="utf-8", + data="""add(1, 2); + } +} + +function main() { + $calc = new Calculator(); + $calc->calculate(); +} +""", + ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + call_rels = get_relationships(mock_ingestor, "CALLS") + assert len(call_rels) >= 2 diff --git a/codebase_rag/tests/test_php_imports.py b/codebase_rag/tests/test_php_imports.py new file mode 100644 index 000000000..9f8e2ef59 --- /dev/null +++ b/codebase_rag/tests/test_php_imports.py @@ -0,0 +1,93 @@ +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_relationships + + +def test_php_use_statement_import(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_imports_test" + project_path.mkdir() + + (project_path / "Controller.php").write_text( + encoding="utf-8", + data="""= 1 + + controller_module = f"{project_path.name}.Controller" + import_mapping = updater.factory.import_processor.import_mapping + if controller_module in import_mapping: + mapping = import_mapping[controller_module] + assert "ProductService" in mapping + assert mapping["ProductService"] == "App.Service.ProductService" + assert "Repo" in mapping + assert mapping["Repo"] == "App.Repository.ProductRepository" + + +def test_php_multiple_use_statements(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_multi_imports" + project_path.mkdir() + + (project_path / "app.php").write_text( + encoding="utf-8", + data=""" set[tuple[str, str]]: + return { + (call[0][0][2], call[0][2][2]) for call in get_relationships(mock_ingestor, rel) + } + + +def test_php_inheritance_and_implements_edges( + temp_repo: Path, mock_ingestor: MagicMock +) -> 
None: + project = temp_repo / "php_inh" + project.mkdir() + (project / "lib.php").write_text(_PHP, encoding="utf-8") + create_and_run_updater(project, mock_ingestor, skip_if_missing="php") + + inherits = _pairs(mock_ingestor, RelationshipType.INHERITS.value) + implements = _pairs(mock_ingestor, RelationshipType.IMPLEMENTS.value) + base = "php_inh.lib" + + # (H) class extends -> INHERITS. + assert (f"{base}.Circle", f"{base}.Base") in inherits, inherits + # (H) class implements -> IMPLEMENTS to each interface. + assert (f"{base}.Circle", f"{base}.Shape") in implements, implements + assert (f"{base}.Circle", f"{base}.Drawable") in implements, implements + # (H) interface extends -> INHERITS to each superinterface. + assert (f"{base}.Big", f"{base}.Shape") in inherits, inherits + assert (f"{base}.Big", f"{base}.Drawable") in inherits, inherits diff --git a/codebase_rag/tests/test_php_inheritance_oracle.py b/codebase_rag/tests/test_php_inheritance_oracle.py new file mode 100644 index 000000000..a27c33a20 --- /dev/null +++ b/codebase_rag/tests/test_php_inheritance_oracle.py @@ -0,0 +1,58 @@ +# (H) Covers PHP inheritance-edge validation: cgr's INHERITS (class/interface +# (H) extends) and IMPLEMENTS (class implements) edges are graded against the +# (H) php-parser oracle, by (source node, base SIMPLE NAME). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_php_graph +from evals.oracles import php_oracle_available, run_php_oracle +from evals.score import score_name_edge_types + +PHP_SRC = """\ + None: + if not php_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.PHP not in load_parsers()[0]: + pytest.skip("php parser not available") + + +def test_cgr_matches_php_parser_oracle_on_inheritance_edges(tmp_path: Path) -> None: + _require_php() + project = tmp_path / "php_inh_edge" + project.mkdir() + (project / "lib.php").write_text(PHP_SRC, encoding="utf-8") + + cgr = extract_cgr_php_graph(project, project.name) + oracle = run_php_oracle(project) + + result = score_name_edge_types(cgr, oracle, ec.INHERITANCE_NAME_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.INHERITS.value, + cs.RelationshipType.IMPLEMENTS.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_php_span_oracle.py b/codebase_rag/tests/test_php_span_oracle.py new file mode 100644 index 000000000..60b003ab8 --- /dev/null +++ b/codebase_rag/tests/test_php_span_oracle.py @@ -0,0 +1,74 @@ +# (H) Covers PHP node SPAN (end_line) validation: cgr's end_line for each node is +# (H) graded against the php-parser oracle (which emits node.loc.end.line), joined +# (H) on (kind, file, start). Exercises a class with a multi-line method, an +# (H) interface, an enum, and a multi-line function so spans are not single line. 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_php_graph +from evals.oracles import php_oracle_available, run_php_oracle +from evals.score import score_span + +PHP_SRC = """\ +size * $scale; + } +} + +interface Shape +{ + public function area(int $scale): int; +} + +enum Color +{ + case Red; + case Green; +} + +function standalone(int $a): int +{ + return $a + 1; +} +""" + + +def _require_php() -> None: + if not php_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.PHP not in load_parsers()[0]: + pytest.skip("php parser not available") + + +def test_cgr_matches_php_parser_oracle_on_node_spans(tmp_path: Path) -> None: + _require_php() + project = tmp_path / "php_span_test" + project.mkdir() + (project / "lib.php").write_text(PHP_SRC, encoding="utf-8") + + cgr = extract_cgr_php_graph(project, project.name) + oracle = run_php_oracle(project) + + result = score_span(cgr, oracle, ec.PHP_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 4, aggregate diff --git a/codebase_rag/tests/test_php_structure_oracle.py b/codebase_rag/tests/test_php_structure_oracle.py new file mode 100644 index 000000000..577eb14ee --- /dev/null +++ b/codebase_rag/tests/test_php_structure_oracle.py @@ -0,0 +1,71 @@ +# (H) Covers the PHP structure oracle harness (evals/oracles/php_oracle + +# (H) evals/php_l1.py): the php-parser oracle is authoritative ground truth, and +# (H) cgr's captured PHP nodes are graded against it on (kind, file, start_line). 
+# (H) Includes an attributed class (whose span starts at the attribute) and an +# (H) anonymous class (whose methods cgr models as Functions). +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_php_nodes +from evals.oracles import php_oracle_available, run_php_oracle +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +PHP_SRC = """\ + None: + if not php_oracle_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.PHP not in load_parsers()[0]: + pytest.skip("php parser not available") + + +def test_cgr_matches_php_parser_oracle_on_php_structure(tmp_path: Path) -> None: + _require_php() + project = tmp_path / "php_oracle_test" + project.mkdir() + (project / "sample.php").write_text(PHP_SRC, encoding="utf-8") + + cgr = GraphData( + nodes=extract_cgr_php_nodes(project, project.name), + edges=set(), + name_edges=set(), + ) + oracle = run_php_oracle(project) + + result = score_node_kinds(cgr, oracle, ec.PHP_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + for label in ("Class", "Interface", "Enum", "Method", "Function"): + row = by_label.get(label) + assert row is not None, (label, by_label) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (label, row) diff --git a/codebase_rag/tests/test_project_name_flag.py b/codebase_rag/tests/test_project_name_flag.py new file mode 100644 index 000000000..214aa710c --- /dev/null +++ b/codebase_rag/tests/test_project_name_flag.py @@ -0,0 +1,348 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_node_names + + 
+@pytest.fixture(scope="module") +def parsers_and_queries() -> tuple[dict, dict]: + return load_parsers() + + +def _make_updater( + repo_path: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + project_name: str | None = None, +) -> GraphUpdater: + parsers, queries = parsers_and_queries + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=repo_path, + parsers=parsers, + queries=queries, + project_name=project_name, + ) + + +def _write_python_file(repo_path: Path, rel_path: str, content: str) -> None: + full = repo_path / rel_path + full.parent.mkdir(parents=True, exist_ok=True) + full.write_text(content) + + +class TestDefaultProjectName: + def test_default_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater(temp_repo, mock_ingestor, parsers_and_queries) + assert updater.project_name == temp_repo.resolve().name + + def test_default_none_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name=None + ) + assert updater.project_name == temp_repo.resolve().name + + def test_default_empty_string_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="" + ) + assert updater.project_name == temp_repo.resolve().name + + def test_default_whitespace_only_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name=" " + ) + assert updater.project_name == temp_repo.resolve().name + + +class TestExplicitProjectName: + def test_override_simple( + self, + 
temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyProject" + ) + assert updater.project_name == "MyProject" + + def test_override_with_hyphens( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, + mock_ingestor, + parsers_and_queries, + project_name="my-cool-project", + ) + assert updater.project_name == "my-cool-project" + + def test_override_with_dots( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, + mock_ingestor, + parsers_and_queries, + project_name="com.example.app", + ) + assert updater.project_name == "com.example.app" + + +class TestEdgeCases: + def test_generic_dir_name_src( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + src_dir = temp_repo / "src" + src_dir.mkdir() + updater = _make_updater( + src_dir, mock_ingestor, parsers_and_queries, project_name="BlazingRenderer" + ) + assert updater.project_name == "BlazingRenderer" + updater_default = _make_updater(src_dir, mock_ingestor, parsers_and_queries) + assert updater_default.project_name == "src" + + def test_generic_dir_name_main( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + main_dir = temp_repo / "main" + main_dir.mkdir() + updater = _make_updater( + main_dir, + mock_ingestor, + parsers_and_queries, + project_name="ActualProjectName", + ) + assert updater.project_name == "ActualProjectName" + + def test_version_named_directory( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + ver_dir = temp_repo / "v1.3.2" + ver_dir.mkdir() + updater = _make_updater( + ver_dir, 
mock_ingestor, parsers_and_queries, project_name="my-library" + ) + assert updater.project_name == "my-library" + updater_default = _make_updater(ver_dir, mock_ingestor, parsers_and_queries) + assert updater_default.project_name == "v1.3.2" + + def test_nested_same_name_parent( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + nested = temp_repo / "BRender" / "BlazingRenderer" + nested.mkdir(parents=True) + updater = _make_updater( + nested, mock_ingestor, parsers_and_queries, project_name="BlazingRenderer" + ) + assert updater.project_name == "BlazingRenderer" + + +class TestFactoryPropagation: + def test_factory_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.project_name == "CustomName" + + def test_factory_default_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater(temp_repo, mock_ingestor, parsers_and_queries) + assert updater.factory.project_name == temp_repo.resolve().name + + def test_structure_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.structure_processor.project_name == "CustomName" + + def test_import_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.import_processor.project_name == "CustomName" + + def 
test_definition_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.definition_processor.project_name == "CustomName" + + def test_call_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.call_processor.project_name == "CustomName" + + def test_type_inference_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.type_inference.project_name == "CustomName" + + +class TestQualifiedNameIntegration: + def test_module_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "hello.py", "def greet():\n pass\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyApp" + ) + updater.run(force=True) + module_names = get_node_names(mock_ingestor, "Module") + assert "MyApp.hello" in module_names + + def test_function_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "utils.py", "def helper():\n return 42\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyApp" + ) + updater.run(force=True) + func_names = get_node_names(mock_ingestor, "Function") + assert "MyApp.utils.helper" in func_names + + def 
test_class_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "models.py", "class User:\n pass\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyApp" + ) + updater.run(force=True) + class_names = get_node_names(mock_ingestor, "Class") + assert "MyApp.models.User" in class_names + + def test_default_qualified_names_use_directory( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "foo.py", "def bar():\n pass\n") + updater = _make_updater(temp_repo, mock_ingestor, parsers_and_queries) + updater.run(force=True) + dir_name = temp_repo.resolve().name + func_names = get_node_names(mock_ingestor, "Function") + assert f"{dir_name}.foo.bar" in func_names + + def test_package_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "pkg/__init__.py", "") + _write_python_file(temp_repo, "pkg/core.py", "def run():\n pass\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomProj" + ) + updater.run(force=True) + func_names = get_node_names(mock_ingestor, "Function") + assert "CustomProj.pkg.core.run" in func_names + + def test_override_vs_default_different_names( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "app.py", "def main():\n pass\n") + dir_name = temp_repo.resolve().name + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="OverrideName" + ) + updater.run(force=True) + func_names = get_node_names(mock_ingestor, "Function") + assert "OverrideName.app.main" in func_names + assert f"{dir_name}.app.main" not in func_names diff --git 
a/codebase_rag/tests/test_project_naming.py b/codebase_rag/tests/test_project_naming.py new file mode 100644 index 000000000..29470944a --- /dev/null +++ b/codebase_rag/tests/test_project_naming.py @@ -0,0 +1,74 @@ +from pathlib import Path + +import pytest + +from codebase_rag.utils.path_utils import derive_project_name, resolve_repo_path + + +def test_derive_project_name_is_stable(tmp_path: Path) -> None: + repo = tmp_path / "myrepo" + repo.mkdir() + first = derive_project_name(repo) + second = derive_project_name(repo) + assert first == second + + +def test_derive_project_name_includes_basename(tmp_path: Path) -> None: + repo = tmp_path / "myrepo" + repo.mkdir() + name = derive_project_name(repo) + assert name.startswith("myrepo__") + assert len(name.split("__")[1]) == 8 + + +def test_derive_project_name_disambiguates_same_basename(tmp_path: Path) -> None: + repo_a = tmp_path / "a" / "frontend" + repo_b = tmp_path / "b" / "frontend" + repo_a.mkdir(parents=True) + repo_b.mkdir(parents=True) + assert derive_project_name(repo_a) != derive_project_name(repo_b) + assert derive_project_name(repo_a).startswith("frontend__") + assert derive_project_name(repo_b).startswith("frontend__") + + +def test_derive_project_name_slugifies_special_chars(tmp_path: Path) -> None: + weird = tmp_path / "my repo (v2)!" 
+ weird.mkdir() + name = derive_project_name(weird) + base = name.split("__")[0] + assert all(c.isalnum() or c in "_-" for c in base) + + +def test_derive_project_name_fallback_for_root() -> None: + name = derive_project_name(Path("/")) + assert name.startswith("repo__") + + +def test_resolve_repo_path_explicit_wins(tmp_path: Path) -> None: + repo = tmp_path / "explicit" + repo.mkdir() + resolved = resolve_repo_path(str(repo), "/some/other/path") + assert resolved == repo.resolve() + + +def test_resolve_repo_path_uses_target_default(tmp_path: Path) -> None: + repo = tmp_path / "target" + repo.mkdir() + resolved = resolve_repo_path(None, str(repo)) + assert resolved == repo.resolve() + + +def test_resolve_repo_path_dot_falls_back_to_cwd( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.chdir(tmp_path) + resolved = resolve_repo_path(None, ".") + assert resolved == tmp_path.resolve() + + +def test_resolve_repo_path_empty_falls_back_to_cwd( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.chdir(tmp_path) + resolved = resolve_repo_path(None, "") + assert resolved == tmp_path.resolve() diff --git a/codebase_rag/tests/test_property_getter_calls.py b/codebase_rag/tests/test_property_getter_calls.py new file mode 100644 index 000000000..9168177cf --- /dev/null +++ b/codebase_rag/tests/test_property_getter_calls.py @@ -0,0 +1,102 @@ +# (H) L3 finding from the evals/ harness: accessing an @property getter runs the +# (H) getter method at runtime, but cgr saw a plain attribute access and emitted no +# (H) CALLS edge. A property access must produce a CALLS edge to the getter method, +# (H) while a normal attribute / method reference must not. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +MODULE_SRC = """class Engine: + def __init__(self) -> None: + self._n = 0 + + @property + def status(self) -> str: + return self._compute() + + def _compute(self) -> str: + return "ok" + + def check(self) -> str: + return self.status + + +def use(e: Engine) -> str: + return e.status + + +def plain(e: Engine) -> str: + return e._compute() +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestPropertyGetterCalls: + def test_property_access_via_self_is_a_call(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.Engine.check", "proj.m.Engine.status") in 
calls, calls + + def test_property_access_via_typed_param_is_a_call(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.use", "proj.m.Engine.status") in calls, calls + + def test_property_access_only_emits_the_getter_edge(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + # (H) `use` only reads e.status; no spurious edge to the unrelated _compute. + from_use = {to for (frm, to) in calls if frm == "proj.m.use"} + assert from_use == {"proj.m.Engine.status"}, from_use + + def test_regular_method_call_is_unaffected(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + # (H) plain() calls a normal method, resolved by the existing call path. + assert ("proj.m.plain", "proj.m.Engine._compute") in calls, calls diff --git a/codebase_rag/tests/test_property_return_type_chain.py b/codebase_rag/tests/test_property_return_type_chain.py new file mode 100644 index 000000000..06f985764 --- /dev/null +++ b/codebase_rag/tests/test_property_return_type_chain.py @@ -0,0 +1,87 @@ +# (H) L3 finding from the evals/ harness: a method calls self.prop.method(), where +# (H) self.prop is an @property whose declared return type names the class owning +# (H) the real method. The property's return type must seed self.prop's type so the +# (H) chained call resolves to the correct class instead of an ambiguous same-class +# (H) method of the same name. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +MODULE_SRC = """class Worker: + def build(self) -> str: + return "real" + + +class Engine: + @property + def inner(self) -> Worker: + return Worker() + + def build(self) -> str: + return self.inner.build() +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestPropertyReturnTypeChain: + def test_chained_call_through_property_resolves_to_return_type_class( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.Engine.build", "proj.m.Worker.build") in calls, calls + + def test_does_not_resolve_to_same_class_method_of_same_name( + self, tmp_path: Path + ) 
-> None: + calls = _calls(tmp_path) + assert ("proj.m.Engine.build", "proj.m.Engine.build") not in calls, calls diff --git a/codebase_rag/tests/test_protobuf_service.py b/codebase_rag/tests/test_protobuf_service.py index 7bb2c0de0..2b8da8a08 100644 --- a/codebase_rag/tests/test_protobuf_service.py +++ b/codebase_rag/tests/test_protobuf_service.py @@ -169,3 +169,130 @@ def test_protobuf_ingestor_split_index_serialization_and_deserialization( assert rel.target_id == "test_project.UserService.get_user" assert rel.source_label == NodeType.CLASS assert rel.target_label == NodeType.METHOD + + +def test_ensure_node_batch_no_message_class_logs_warning(tmp_path: Path) -> None: + from codebase_rag.services.protobuf_service import _MSG_CLASS_CACHE + + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from codebase_rag import constants as cs + + _MSG_CLASS_CACHE[cs.NodeLabel.UNION] = None + + ingestor.ensure_node_batch(cs.NodeLabel.UNION, {"qualified_name": "foo.bar"}) + + assert "foo.bar" not in ingestor._nodes + _MSG_CLASS_CACHE.pop(cs.NodeLabel.UNION, None) + + +def test_ensure_node_batch_no_oneof_mapping_logs_warning(tmp_path: Path) -> None: + from codebase_rag.services.protobuf_service import LABEL_TO_ONEOF_FIELD + + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from codebase_rag import constants as cs + + ingestor.ensure_node_batch( + cs.NodeLabel.PROJECT, {"name": "test_proj", "qualified_name": "test_proj"} + ) + assert "test_proj" in ingestor._nodes + + +def test_ensure_relationship_batch_dedup(tmp_path: Path) -> None: + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from_spec = ("Class", "qualified_name", "proj.MyClass") + to_spec = ("Method", "qualified_name", "proj.MyClass.method") + rel_type = "DEFINES_METHOD" + + ingestor.ensure_relationship_batch(from_spec, rel_type, to_spec) + 
ingestor.ensure_relationship_batch(from_spec, rel_type, to_spec) + + assert len(ingestor._relationships) == 1 + + +def test_ensure_relationship_batch_dedup_with_properties_merge(tmp_path: Path) -> None: + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from_spec = ("Class", "qualified_name", "proj.MyClass") + to_spec = ("Method", "qualified_name", "proj.MyClass.method") + rel_type = "DEFINES_METHOD" + + ingestor.ensure_relationship_batch(from_spec, rel_type, to_spec) + ingestor.ensure_relationship_batch(from_spec, rel_type, to_spec, {"extra": "val"}) + + assert len(ingestor._relationships) == 1 + + +def test_ensure_relationship_batch_invalid_empty_source(tmp_path: Path) -> None: + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from_spec = ("Class", "qualified_name", "") + to_spec = ("Method", "qualified_name", "proj.MyClass.method") + rel_type = "DEFINES_METHOD" + + ingestor.ensure_relationship_batch(from_spec, rel_type, to_spec) + + assert len(ingestor._relationships) == 0 + + +def test_ensure_relationship_batch_invalid_empty_target(tmp_path: Path) -> None: + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from_spec = ("Class", "qualified_name", "proj.MyClass") + to_spec = ("Method", "qualified_name", " ") + rel_type = "DEFINES_METHOD" + + ingestor.ensure_relationship_batch(from_spec, rel_type, to_spec) + + assert len(ingestor._relationships) == 0 + + +def test_ensure_relationship_batch_unknown_rel_type(tmp_path: Path) -> None: + from codebase_rag.services.protobuf_service import _REL_TYPE_CACHE + + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + fake_rel_type = "COMPLETELY_FAKE_REL_TYPE_XYZ" + _REL_TYPE_CACHE.pop(fake_rel_type, None) + + from_spec = ("Class", "qualified_name", "proj.A") + to_spec = ("Method", "qualified_name", 
"proj.A.b") + + ingestor.ensure_relationship_batch(from_spec, fake_rel_type, to_spec) + + assert len(ingestor._relationships) == 1 + key = next(iter(ingestor._relationships)) + rel_obj = ingestor._relationships[key] + assert ( + rel_obj.type + == pb.Relationship.RelationshipType.RELATIONSHIP_TYPE_UNSPECIFIED + ) + + +def test_ensure_relationship_batch_none_values(tmp_path: Path) -> None: + output_dir = tmp_path / "out" + output_dir.mkdir() + ingestor = ProtobufFileIngestor(str(output_dir)) + + from_spec = ("Class", "qualified_name", None) + to_spec = ("Method", "qualified_name", "proj.A.b") + + ingestor.ensure_relationship_batch(from_spec, "DEFINES_METHOD", to_spec) + + assert len(ingestor._relationships) == 0 diff --git a/codebase_rag/tests/test_protocol_dispatch_resolution.py b/codebase_rag/tests/test_protocol_dispatch_resolution.py new file mode 100644 index 000000000..410eaf83b --- /dev/null +++ b/codebase_rag/tests/test_protocol_dispatch_resolution.py @@ -0,0 +1,123 @@ +# (H) L3 finding from the evals/ harness: DefinitionProcessor._extract_decorators calls +# (H) self._handler.extract_decorators(node), where _handler is annotated as the Protocol +# (H) LanguageHandler (class-level annotation) and assigned dynamically via +# (H) get_handler(language). The runtime type is one of several conformers, so the sound +# (H) call graph emits an edge to extract_decorators on every conformer (capturing the +# (H) traced PythonHandler edge) and never to the Protocol stub, which never runs. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + "pkg/proto.py": ( + "from typing import Protocol\n\n\n" + "class HandlerLike(Protocol):\n" + " def extract(self, node): ...\n" + ), + "pkg/base.py": ( + "class BaseHandler:\n def extract(self, node):\n return []\n" + ), + "pkg/python_h.py": ( + "from .base import BaseHandler\n\n\n" + "class PyHandler(BaseHandler):\n" + " def extract(self, node):\n" + " return ['py']\n" + ), + "pkg/js_h.py": ( + "from .base import BaseHandler\n\n\n" + "class JsHandler(BaseHandler):\n" + " def extract(self, node):\n" + " return ['js']\n" + ), + "pkg/proc.py": ( + "from .proto import HandlerLike\n\n\n" + "class Proc:\n" + " _handler: HandlerLike\n\n" + " def __init__(self, handler) -> None:\n" + " self._handler = handler\n\n" + " def go(self, node):\n" + " return self._handler.extract(node)\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = 
tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestProtocolDispatchResolution: + def test_dispatches_to_concrete_conformer(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.proc.Proc.go", + "proj.pkg.python_h.PyHandler.extract", + ) in calls, calls + + def test_dispatches_to_all_conformers(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.proc.Proc.go", + "proj.pkg.js_h.JsHandler.extract", + ) in calls, calls + assert ( + "proj.pkg.proc.Proc.go", + "proj.pkg.base.BaseHandler.extract", + ) in calls, calls + + def test_does_not_emit_protocol_stub_edge(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.proc.Proc.go", + "proj.pkg.proto.HandlerLike.extract", + ) not in calls, calls diff --git a/codebase_rag/tests/test_protocol_impl_resolution.py b/codebase_rag/tests/test_protocol_impl_resolution.py new file mode 100644 index 000000000..a0c8036f9 --- /dev/null +++ b/codebase_rag/tests/test_protocol_impl_resolution.py @@ -0,0 +1,100 @@ +# (H) L3 finding from the evals/ harness: a call on a parameter typed as a +# (H) Protocol (function_registry.get() where function_registry is a +# (H) FunctionRegistryTrieProtocol) is traced to the concrete implementer +# (H) (FunctionRegistryTrie), not the Protocol stub. cgr infers the Protocol +# (H) type but stops at the stub; the XxxProtocol -> Xxx naming convention picks +# (H) the real implementer and disambiguates it from other structural conformers +# (H) such as a test mock. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +MODULE_SRC = """from typing import Protocol + + +class StoreProtocol(Protocol): + def fetch(self, key: str) -> int: ... + + +class Store: + def fetch(self, key: str) -> int: + return 1 + + +class MockStore: + def fetch(self, key: str) -> int: + return 2 + + +def use(store: StoreProtocol) -> int: + return store.fetch("x") +""" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "m.py").write_text(MODULE_SRC) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestProtocolImplResolution: + def test_protocol_typed_call_resolves_to_concrete_implementer( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.use", "proj.m.Store.fetch") in 
calls, calls + + def test_does_not_resolve_to_protocol_stub(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ("proj.m.use", "proj.m.StoreProtocol.fetch") not in calls, calls + + def test_naming_convention_disambiguates_from_other_conformer( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ("proj.m.use", "proj.m.MockStore.fetch") not in calls, calls diff --git a/codebase_rag/tests/test_protocol_operator_dispatch.py b/codebase_rag/tests/test_protocol_operator_dispatch.py new file mode 100644 index 000000000..45c469c13 --- /dev/null +++ b/codebase_rag/tests/test_protocol_operator_dispatch.py @@ -0,0 +1,125 @@ +# (H) L3 finding from the evals/ harness: an operator on a Protocol-typed attribute +# (H) (self.ast_cache[k], k in self.ast_cache) must dispatch to the dunder on the +# (H) concrete implementer even when the implementer's name does not follow the +# (H) XxxProtocol convention, and even when the dunder (e.g. __len__) is defined only on +# (H) the implementer and not declared on the Protocol stub. Structural conformance +# (H) (a class defining the Protocol's named methods) identifies the implementer. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + "pkg/proto.py": ( + "from typing import Protocol\n\n\n" + "class Cache(Protocol):\n" + " def snapshot(self):\n ...\n\n" + " def __getitem__(self, key):\n ...\n\n" + " def __contains__(self, key):\n ...\n" + ), + # (H) MemCache does not match the Cache name convention and adds __len__, which the + # (H) Protocol does not declare. It conforms via the named method snapshot. 
+ "pkg/impl.py": ( + "class MemCache:\n" + " def snapshot(self):\n return {}\n\n" + " def __getitem__(self, key):\n return 1\n\n" + " def __contains__(self, key):\n return True\n\n" + " def __len__(self):\n return 0\n" + ), + "pkg/user.py": ( + "from .proto import Cache\n\n\n" + "class User:\n" + " def __init__(self, cache: Cache) -> None:\n" + " self._cache = cache\n\n" + " def _touch(self):\n" + " return None\n\n" + " def use(self, key):\n" + " self._touch()\n" + " if key in self._cache:\n" + " return self._cache[key]\n" + " return len(self._cache)\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestProtocolOperatorDispatch: + def test_subscript_and_membership_reach_structural_conformer( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + 
"proj.pkg.impl.MemCache.__getitem__", + ) in calls, calls + assert ( + "proj.pkg.user.User.use", + "proj.pkg.impl.MemCache.__contains__", + ) in calls, calls + + def test_dunder_only_on_implementer_resolves(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + "proj.pkg.impl.MemCache.__len__", + ) in calls, calls + + def test_protocol_stub_not_emitted(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.use", + "proj.pkg.proto.Cache.__getitem__", + ) not in calls, calls diff --git a/codebase_rag/tests/test_provider_classes.py b/codebase_rag/tests/test_provider_classes.py index 1475914a0..da492ebd7 100644 --- a/codebase_rag/tests/test_provider_classes.py +++ b/codebase_rag/tests/test_provider_classes.py @@ -9,6 +9,8 @@ from codebase_rag.constants import GoogleProviderType, Provider from codebase_rag.providers.base import ( + AnthropicProvider, + AzureOpenAIProvider, GoogleProvider, ModelProvider, OllamaProvider, @@ -37,16 +39,42 @@ def test_get_valid_providers(self) -> None: assert isinstance(ollama_provider, OllamaProvider) assert ollama_provider.provider_name == Provider.OLLAMA + anthropic_provider = get_provider(Provider.ANTHROPIC, api_key="test-key") + assert isinstance(anthropic_provider, AnthropicProvider) + assert anthropic_provider.provider_name == Provider.ANTHROPIC + + azure_provider = get_provider( + Provider.AZURE, + api_key="test-key", + endpoint="https://myresource.openai.azure.com", + ) + assert isinstance(azure_provider, AzureOpenAIProvider) + assert azure_provider.provider_name == Provider.AZURE + def test_get_invalid_provider(self) -> None: with pytest.raises(ValueError, match="Unknown provider 'invalid_provider'"): get_provider("invalid_provider") + def test_get_litellm_provider(self) -> None: + litellm_provider = get_provider( + Provider.LITELLM_PROXY, + api_key="sk-test", + endpoint="http://localhost:4000/v1", + ) + from codebase_rag.providers.litellm import 
LiteLLMProvider + + assert isinstance(litellm_provider, LiteLLMProvider) + assert litellm_provider.provider_name == Provider.LITELLM_PROXY + def test_list_providers(self) -> None: providers = list_providers() assert Provider.GOOGLE in providers assert Provider.OPENAI in providers assert Provider.OLLAMA in providers - assert len(providers) >= 3 + assert Provider.ANTHROPIC in providers + assert Provider.AZURE in providers + assert Provider.LITELLM_PROXY in providers + assert len(providers) >= 6 def test_register_custom_provider(self) -> None: class CustomProvider(ModelProvider): @@ -190,6 +218,107 @@ def test_ollama_validation_connection_error(self, mock_client: Any) -> None: provider.validate_config() +class TestAnthropicProvider: + def test_anthropic_configuration(self) -> None: + provider = AnthropicProvider(api_key="sk-ant-test-key") + assert provider.provider_name == Provider.ANTHROPIC + assert provider.api_key == "sk-ant-test-key" + provider.validate_config() + + def test_anthropic_validation_error(self) -> None: + provider = AnthropicProvider() + with pytest.raises(ValueError, match="Anthropic provider requires api_key"): + provider.validate_config() + + @patch("codebase_rag.providers.base.PydanticAnthropicProvider") + @patch("codebase_rag.providers.base.AnthropicModel") + def test_anthropic_model_creation( + self, mock_anthropic_model: Any, mock_anthropic_provider: Any + ) -> None: + provider = AnthropicProvider(api_key="sk-ant-test-key") + mock_model = MagicMock() + mock_anthropic_model.return_value = mock_model + result = provider.create_model("claude-opus-4-6") + mock_anthropic_model.assert_called_once() + assert result == mock_model + + @patch("codebase_rag.providers.base.PydanticAnthropicProvider") + @patch("codebase_rag.providers.base.AnthropicModel") + def test_anthropic_model_enables_prompt_caching( + self, mock_anthropic_model: Any, mock_anthropic_provider: Any + ) -> None: + provider = AnthropicProvider(api_key="sk-ant-test-key") + 
provider.create_model("claude-opus-4-7") + + settings_arg = mock_anthropic_model.call_args.kwargs["settings"] + assert settings_arg["anthropic_cache_instructions"] is True + assert settings_arg["anthropic_cache_tool_definitions"] is True + assert settings_arg["anthropic_cache_messages"] is True + + def test_anthropic_api_key_from_env(self) -> None: + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "env-key"}): + provider = AnthropicProvider() + assert provider.api_key == "env-key" + + +class TestAzureOpenAIProvider: + def test_azure_configuration(self) -> None: + provider = AzureOpenAIProvider( + api_key="azure-key", + endpoint="https://myresource.openai.azure.com", + api_version="2024-06-01", + ) + assert provider.provider_name == Provider.AZURE + assert provider.api_key == "azure-key" + assert provider.endpoint == "https://myresource.openai.azure.com" + assert provider.api_version == "2024-06-01" + provider.validate_config() + + def test_azure_validation_error_no_key(self) -> None: + provider = AzureOpenAIProvider(endpoint="https://myresource.openai.azure.com") + with pytest.raises(ValueError, match="Azure OpenAI provider requires api_key"): + provider.validate_config() + + def test_azure_validation_error_no_endpoint(self) -> None: + provider = AzureOpenAIProvider(api_key="azure-key") + with pytest.raises(ValueError, match="Azure OpenAI provider requires endpoint"): + provider.validate_config() + + @patch("codebase_rag.providers.base.PydanticAzureProvider") + @patch("codebase_rag.providers.base.OpenAIChatModel") + def test_azure_model_creation( + self, mock_chat_model: Any, mock_azure_provider: Any + ) -> None: + provider = AzureOpenAIProvider( + api_key="azure-key", + endpoint="https://myresource.openai.azure.com", + ) + mock_model = MagicMock() + mock_chat_model.return_value = mock_model + result = provider.create_model("gpt-4o") + mock_azure_provider.assert_called_once_with( + api_key="azure-key", + azure_endpoint="https://myresource.openai.azure.com", + 
api_version=None, + ) + mock_chat_model.assert_called_once_with( + "gpt-4o", provider=mock_azure_provider.return_value + ) + assert result == mock_model + + def test_azure_api_key_from_env(self) -> None: + with patch.dict( + "os.environ", + { + "AZURE_API_KEY": "env-key", + "AZURE_OPENAI_ENDPOINT": "https://env.openai.azure.com", + }, + ): + provider = AzureOpenAIProvider() + assert provider.api_key == "env-key" + assert provider.endpoint == "https://env.openai.azure.com" + + class TestModelCreation: @patch("codebase_rag.providers.base.PydanticGoogleProvider") @patch("codebase_rag.providers.base.GoogleModel") @@ -275,3 +404,109 @@ def test_ollama_model_creation( mock_openai_provider.assert_called_once_with( api_key="ollama", base_url="http://localhost:11434/v1" ) + + +class TestLiteLLMProvider: + def test_litellm_configuration(self) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + provider = LiteLLMProvider( + api_key="sk-litellm-key", endpoint="http://litellm:4000/v1" + ) + assert provider.provider_name == Provider.LITELLM_PROXY + assert provider.api_key == "sk-litellm-key" + assert provider.endpoint == "http://litellm:4000/v1" + + def test_litellm_default_endpoint(self) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + provider = LiteLLMProvider() + assert provider.endpoint == "http://localhost:4000/v1" + + def test_litellm_no_endpoint_validation_error(self) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + provider = LiteLLMProvider(endpoint="") + with pytest.raises(ValueError, match="LiteLLM provider requires endpoint"): + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_success(self, mock_client: Any) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_client.return_value.__enter__.return_value.get.return_value = mock_response + + provider = 
LiteLLMProvider(api_key="sk-test", endpoint="http://litellm:4000/v1") + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_server_not_running(self, mock_client: Any) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_response = MagicMock() + mock_response.status_code = 404 + mock_client.return_value.__enter__.return_value.get.return_value = mock_response + + provider = LiteLLMProvider(endpoint="http://litellm:4000/v1") + with pytest.raises(ValueError, match="LiteLLM proxy server not responding"): + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_fallback_to_models_endpoint( + self, mock_client: Any + ) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + health_response = MagicMock() + health_response.status_code = 401 + models_response = MagicMock() + models_response.status_code = 200 + mock_client.return_value.__enter__.return_value.get.side_effect = [ + health_response, + models_response, + ] + + provider = LiteLLMProvider(api_key="sk-test", endpoint="http://litellm:4000/v1") + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_connection_error(self, mock_client: Any) -> None: + import httpx + + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_client.return_value.__enter__.return_value.get.side_effect = ( + httpx.ConnectError("Connection failed") + ) + + provider = LiteLLMProvider(endpoint="http://litellm:4000/v1") + with pytest.raises(ValueError, match="LiteLLM proxy server not responding"): + provider.validate_config() + + @patch("codebase_rag.providers.litellm.PydanticLiteLLMProvider") + @patch("codebase_rag.providers.litellm.OpenAIChatModel") + @patch("httpx.Client") + def test_litellm_model_creation( + self, mock_client: Any, mock_chat_model: Any, mock_litellm_provider: Any + ) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_response = MagicMock() + 
mock_response.status_code = 200 + mock_client.return_value.__enter__.return_value.get.return_value = mock_response + + provider = LiteLLMProvider(api_key="sk-test", endpoint="http://litellm:4000/v1") + mock_model = MagicMock() + mock_chat_model.return_value = mock_model + + result = provider.create_model("openai/gpt-4o") + + mock_litellm_provider.assert_called_once_with( + api_key="sk-test", api_base="http://litellm:4000/v1" + ) + mock_chat_model.assert_called_once_with( + "openai/gpt-4o", provider=mock_litellm_provider.return_value + ) + assert result == mock_model diff --git a/codebase_rag/tests/test_py_variable_analyzer_integration.py b/codebase_rag/tests/test_py_variable_analyzer_integration.py index 93b9f7fbb..ca193ee39 100644 --- a/codebase_rag/tests/test_py_variable_analyzer_integration.py +++ b/codebase_rag/tests/test_py_variable_analyzer_integration.py @@ -596,3 +596,88 @@ def _find_node_recursive(self, node, node_type: str, name: str): if result: return result return None + + +def _find_func_node(root_node, func_name: str): + stack = [root_node] + while stack: + node = stack.pop() + if node.type == "function_definition": + name_node = node.child_by_field_name("name") + if name_node and name_node.text.decode() == func_name: + return node + stack.extend(reversed(node.children)) + return None + + +class TestTraverseSinglePassWithQueries: + @pytest.fixture + def engine_with_queries( + self, + import_processor: MagicMock, + mock_function_registry: MagicMock, + mock_ast_cache: MagicMock, + ) -> PythonTypeInferenceEngine: + from codebase_rag import constants as cs + from codebase_rag.parser_loader import load_parsers + + parsers, queries = load_parsers() + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + return PythonTypeInferenceEngine( + import_processor=import_processor, + function_registry=mock_function_registry, + repo_path=Path("/test/repo"), + project_name="test_project", + ast_cache=mock_ast_cache, + 
queries=queries, + module_qn_to_file_path={}, + class_inheritance={}, + simple_name_lookup=defaultdict(set), + js_type_inference_getter=lambda: MagicMock(), + ) + + def test_traverse_with_query_path( + self, + python_parser: Parser, + engine_with_queries: PythonTypeInferenceEngine, + ) -> None: + python_code = b""" +def process(name: str, count: int) -> None: + result = name.upper() + items = [] + for i in range(count): + items.append(i) +""" + tree = python_parser.parse(python_code) + func_node = _find_func_node(tree.root_node, "process") + assert func_node is not None + + result = engine_with_queries.build_local_variable_type_map( + func_node, "test.module" + ) + + assert "name" in result + assert result["name"] == "str" + assert "count" in result + assert result["count"] == "int" + + def test_traverse_with_query_path_caches_return_stmts( + self, + python_parser: Parser, + engine_with_queries: PythonTypeInferenceEngine, + ) -> None: + python_code = b""" +def get_value(x: int) -> int: + return x + 1 +""" + tree = python_parser.parse(python_code) + func_node = _find_func_node(tree.root_node, "get_value") + assert func_node is not None + + engine_with_queries.build_local_variable_type_map(func_node, "test.module") + + return_nodes: list = [] + engine_with_queries._find_return_statements(func_node, return_nodes) + assert len(return_nodes) >= 1 diff --git a/codebase_rag/tests/test_python_nested_functions.py b/codebase_rag/tests/test_python_nested_functions.py index 66f64b989..2a164d94d 100644 --- a/codebase_rag/tests/test_python_nested_functions.py +++ b/codebase_rag/tests/test_python_nested_functions.py @@ -318,10 +318,6 @@ def main(): def test_function_in_class_method( nested_functions_project: Path, mock_ingestor: MagicMock ) -> None: - """Test that functions inside class methods are properly handled. - - Note: Functions inside methods are currently treated as methods rather than nested functions. 
- """ parsers, queries = load_parsers() updater = GraphUpdater( @@ -333,21 +329,51 @@ def test_function_in_class_method( updater.run() project_name = nested_functions_project.name - - expected_method_qn = f"{project_name}.nested_functions.OuterClass.nested_in_method" - created_methods = get_node_names(mock_ingestor, "Method") - assert expected_method_qn in created_methods, ( - f"Function in method not found as method: {expected_method_qn}" + assert ( + f"{project_name}.nested_functions.OuterClass.method_with_nested" + in created_methods + ) + + nested_qn = f"{project_name}.nested_functions.OuterClass.nested_in_method" + assert nested_qn not in created_methods, ( + f"Nested function inside method should not be ingested as class method: {nested_qn}" ) - expected_class_methods = [ - f"{project_name}.nested_functions.OuterClass.method_with_nested", - f"{project_name}.nested_functions.OuterClass.nested_in_method", - ] - for expected_method in expected_class_methods: - assert expected_method in created_methods, ( - f"Expected method not found: {expected_method}" +def test_nested_function_in_staticmethod_not_ingested_as_method( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project_path = temp_repo / "static_nested" + os.makedirs(project_path) + (project_path / "__init__.py").touch() + + with open(project_path / "api.py", "w") as f: + f.write( + "class Api:\n" + " @staticmethod\n" + " def say_hello():\n" + " def test_func():\n" + ' print("api")\n' + " pass\n" ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + project_name = project_path.name + created_methods = get_node_names(mock_ingestor, "Method") + + assert f"{project_name}.api.Api.say_hello" in created_methods + + bad_qn = f"{project_name}.api.Api.test_func" + assert bad_qn not in created_methods, ( + f"Nested function inside staticmethod should not be ingested as class method: 
{bad_qn}" + ) diff --git a/codebase_rag/tests/test_python_real_world.py b/codebase_rag/tests/test_python_real_world.py index 770014655..0243e2f04 100644 --- a/codebase_rag/tests/test_python_real_world.py +++ b/codebase_rag/tests/test_python_real_world.py @@ -874,24 +874,20 @@ class PlainTaskSchema(Schema): return project_path -def test_flask_model_calls( +def test_flask_no_calls_to_class_nodes( todo_app_project: Path, mock_ingestor: MagicMock, ) -> None: - """Test detection of model usage in controllers.""" + """Test that Class nodes are not targets of CALLS relationships.""" run_updater(todo_app_project, mock_ingestor) function_calls = get_relationships(mock_ingestor, "CALLS") - model_usage_calls = [ - call - for call in function_calls - if "task_controller" in call.args[0][2] and "TaskModel" in call.args[2][2] - ] + class_calls = [call for call in function_calls if call.args[2][0] == "Class"] - assert model_usage_calls, ( - f"Expected TaskController to use TaskModel, found: " - f"{[(c.args[0][2], c.args[2][2]) for c in model_usage_calls]}" + assert not class_calls, ( + f"Expected no CALLS edges to Class nodes, found: " + f"{[(c.args[0][2], c.args[2][2]) for c in class_calls]}" ) diff --git a/codebase_rag/tests/test_python_relative_import_resolution.py b/codebase_rag/tests/test_python_relative_import_resolution.py index 883dd1d97..6b305b690 100644 --- a/codebase_rag/tests/test_python_relative_import_resolution.py +++ b/codebase_rag/tests/test_python_relative_import_resolution.py @@ -43,7 +43,7 @@ def test_single_dot_relative_import(self, mock_updater: GraphUpdater) -> None: module_qn, ) - expected = "pkg.sub1.sub2.utils" + expected = "myproject.pkg.sub1.sub2.utils" assert result == expected def test_double_dot_relative_import(self, mock_updater: GraphUpdater) -> None: @@ -66,7 +66,7 @@ def test_double_dot_relative_import(self, mock_updater: GraphUpdater) -> None: module_qn, ) - expected = "pkg.sub1.shared" + expected = "myproject.pkg.sub1.shared" assert result == 
expected def test_triple_dot_relative_import(self, mock_updater: GraphUpdater) -> None: @@ -89,7 +89,7 @@ def test_triple_dot_relative_import(self, mock_updater: GraphUpdater) -> None: module_qn, ) - expected = "pkg.common" + expected = "myproject.pkg.common" assert result == expected def test_relative_import_to_package_root(self, mock_updater: GraphUpdater) -> None: @@ -112,7 +112,7 @@ def test_relative_import_to_package_root(self, mock_updater: GraphUpdater) -> No module_qn, ) - expected = "config" + expected = "myproject.config" assert result == expected def test_relative_import_without_module_name( @@ -133,7 +133,7 @@ def test_relative_import_without_module_name( module_qn, ) - expected = "pkg.sub1" + expected = "myproject.pkg.sub1" assert result == expected def test_relative_import_edge_case_shallow_module( @@ -158,7 +158,7 @@ def test_relative_import_edge_case_shallow_module( module_qn, ) - expected = "other" + expected = "myproject.other" assert result == expected def test_relative_import_complex_module_path( @@ -183,5 +183,5 @@ def test_relative_import_complex_module_path( module_qn, ) - expected = "pkg.sub1.sub2.helpers.database.models" + expected = "myproject.pkg.sub1.sub2.helpers.database.models" assert result == expected diff --git a/codebase_rag/tests/test_python_span_oracle.py b/codebase_rag/tests/test_python_span_oracle.py new file mode 100644 index 000000000..f2e51219a --- /dev/null +++ b/codebase_rag/tests/test_python_span_oracle.py @@ -0,0 +1,71 @@ +# (H) Covers Python L1 node SPAN (end_line) validation: cgr's end_line for each +# (H) Class/Function/Method is graded against the ast oracle (node.end_lineno) via +# (H) the L1 score(), joined on (kind, file, start). Exercises a decorated +# (H) multi-line def, a property, an async multi-line signature, and a nested +# (H) function so spans are not trivially single line. 
+from __future__ import annotations + +from pathlib import Path + +from evals import constants as ec +from evals.ast_oracle import extract_oracle_graph +from evals.cgr_graph import extract_cgr_graph +from evals.score import score + +PY_SRC = '''\ +import functools + + +@functools.cache +def decorated( + a: int, + b: int, +) -> int: + return a + b + + +class Widget: + """doc.""" + + @property + def size(self) -> int: + return self._n + + async def fetch( + self, + url: str, + ) -> str: + return await call(url) + + +def outer(): + def inner(): + return 1 + + return inner +''' + + +def test_cgr_matches_ast_oracle_on_python_node_spans(tmp_path: Path) -> None: + project = tmp_path / "py_span" + project.mkdir() + (project / "m.py").write_text(PY_SRC, encoding="utf-8") + + cgr = extract_cgr_graph(project, project.name) + oracle = extract_oracle_graph(project, project.name) + + result = score(cgr, oracle) + span_rows = { + row["label"]: row + for row in result.rows + if row["category"] == ec.Category.SPAN.value + } + # (H) score() must now emit graded span rows for Class/Function/Method. 
+ assert span_rows, [r["category"] for r in result.rows] + aggregate = span_rows.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, span_rows + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 5, aggregate diff --git a/codebase_rag/tests/test_python_standard_library_imports.py b/codebase_rag/tests/test_python_standard_library_imports.py index c7cfa891e..98ec5f673 100644 --- a/codebase_rag/tests/test_python_standard_library_imports.py +++ b/codebase_rag/tests/test_python_standard_library_imports.py @@ -11,10 +11,10 @@ class TestStandardLibraryImports: """Test import resolution for standard library vs local modules.""" @pytest.fixture - def mock_updater(self) -> GraphUpdater: + def mock_updater(self, tmp_path: Path) -> GraphUpdater: mock_ingestor = MagicMock() - test_repo = Path("/tmp/myproject") + test_repo = tmp_path / "myproject" test_repo.mkdir(exist_ok=True) (test_repo / "utils").mkdir(exist_ok=True) diff --git a/codebase_rag/tests/test_query_truncation.py b/codebase_rag/tests/test_query_truncation.py new file mode 100644 index 000000000..57f582c69 --- /dev/null +++ b/codebase_rag/tests/test_query_truncation.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from codebase_rag.tools.codebase_query import create_query_tool +from codebase_rag.types_defs import ResultRow + + +@pytest.fixture +def mock_ingestor() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def mock_cypher_gen() -> MagicMock: + gen = MagicMock() + gen.generate = AsyncMock(return_value="MATCH (n) RETURN n") + return gen + + +class TestQueryTruncation: + @pytest.mark.asyncio + async def test_row_cap_truncation( + self, mock_ingestor: MagicMock, mock_cypher_gen: MagicMock + ) -> None: + rows: list[ResultRow] = [{"name": f"node_{i}"} for i in range(600)] + mock_ingestor.fetch_all.return_value = rows + + tool = 
create_query_tool(mock_ingestor, mock_cypher_gen) + with patch("codebase_rag.tools.codebase_query.settings") as mock_settings: + mock_settings.QUERY_RESULT_ROW_CAP = 500 + mock_settings.QUERY_RESULT_MAX_TOKENS = 100000 + mock_settings.QUERY_TIMEOUT_S = 60.0 + result = await tool.function(natural_language_query="list all nodes") + + assert len(result.results) <= 500 + assert "truncated" in result.summary.lower() or "600" in result.summary + + @pytest.mark.asyncio + async def test_token_truncation( + self, mock_ingestor: MagicMock, mock_cypher_gen: MagicMock + ) -> None: + rows: list[ResultRow] = [ + {"name": f"function_{i}", "body": f"def func_{i}(): pass # {'x' * 200}"} + for i in range(100) + ] + mock_ingestor.fetch_all.return_value = rows + + tool = create_query_tool(mock_ingestor, mock_cypher_gen) + with patch("codebase_rag.tools.codebase_query.settings") as mock_settings: + mock_settings.QUERY_RESULT_ROW_CAP = 500 + mock_settings.QUERY_RESULT_MAX_TOKENS = 500 + mock_settings.QUERY_TIMEOUT_S = 60.0 + result = await tool.function(natural_language_query="list functions") + + assert len(result.results) < 100 + assert "truncated" in result.summary.lower() + + @pytest.mark.asyncio + async def test_no_truncation_when_within_limits( + self, mock_ingestor: MagicMock, mock_cypher_gen: MagicMock + ) -> None: + rows: list[ResultRow] = [{"name": f"node_{i}"} for i in range(5)] + mock_ingestor.fetch_all.return_value = rows + + tool = create_query_tool(mock_ingestor, mock_cypher_gen) + with patch("codebase_rag.tools.codebase_query.settings") as mock_settings: + mock_settings.QUERY_RESULT_ROW_CAP = 500 + mock_settings.QUERY_RESULT_MAX_TOKENS = 16000 + mock_settings.QUERY_TIMEOUT_S = 60.0 + result = await tool.function(natural_language_query="small query") + + assert len(result.results) == 5 + assert "Successfully" in result.summary diff --git a/codebase_rag/tests/test_realtime_debounce.py b/codebase_rag/tests/test_realtime_debounce.py new file mode 100644 index 
000000000..eee1fcf48 --- /dev/null +++ b/codebase_rag/tests/test_realtime_debounce.py @@ -0,0 +1,445 @@ +""" +Tests for the realtime_updater debouncing functionality. + +These tests verify the hybrid debounce strategy that prevents redundant +graph updates during rapid file saves. +""" + +from __future__ import annotations + +import threading +import time +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock + +import pytest +from watchdog.events import FileCreatedEvent, FileDeletedEvent, FileModifiedEvent + +from codebase_rag.constants import DEFAULT_DEBOUNCE_SECONDS, DEFAULT_MAX_WAIT_SECONDS +from codebase_rag.services import QueryProtocol + + +class MockQueryIngestor: + def __init__(self) -> None: + self.execute_write = MagicMock() + self.flush_all = MagicMock() + self.fetch_all = MagicMock(return_value=[]) + self.ensure_node_batch = MagicMock() + self.ensure_relationship_batch = MagicMock() + + def __enter__(self) -> MockQueryIngestor: + return self + + def __exit__(self, *args: Any) -> None: + pass + + +# Register MockQueryIngestor as implementing QueryProtocol for isinstance checks +QueryProtocol.register(MockQueryIngestor) + + +class TestCodeChangeEventHandlerDebounce: + @pytest.fixture(autouse=True) + def _patch_ignore(self, monkeypatch: pytest.MonkeyPatch) -> None: + from codebase_rag import constants as cs + + patched = cs.IGNORE_PATTERNS - {"tmp"} + monkeypatch.setattr(cs, "IGNORE_PATTERNS", patched) + monkeypatch.setattr("realtime_updater.IGNORE_PATTERNS", patched) + + @pytest.fixture + def mock_ingestor(self) -> MockQueryIngestor: + return MockQueryIngestor() + + @pytest.fixture + def mock_updater( + self, tmp_path: Path, mock_ingestor: MockQueryIngestor + ) -> MagicMock: + updater = MagicMock() + updater.repo_path = tmp_path + updater.ingestor = mock_ingestor + updater.remove_file_from_state = MagicMock() + updater.factory = MagicMock() + updater.factory.definition_processor.process_file = MagicMock(return_value=None) 
+ updater._process_function_calls = MagicMock() + updater.parsers = {} + updater.queries = {} + updater.ast_cache = {} + return updater + + @pytest.fixture + def sample_file(self, tmp_path: Path) -> Path: + test_file = tmp_path / "test.py" + test_file.write_text("# test file") + return test_file + + def test_handler_initialization_with_debounce( + self, mock_updater: MagicMock + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=5, max_wait_seconds=30 + ) + + assert handler.debounce_seconds == 5 + assert handler.max_wait_seconds == 30 + assert handler.debounce_enabled is True + assert len(handler.timers) == 0 + assert len(handler.first_event_time) == 0 + assert len(handler.pending_events) == 0 + + def test_handler_initialization_without_debounce( + self, mock_updater: MagicMock + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0, max_wait_seconds=30 + ) + + assert handler.debounce_seconds == 0 + assert handler.debounce_enabled is False + + def test_handler_uses_default_constants(self, mock_updater: MagicMock) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler(mock_updater) + + assert handler.debounce_seconds == DEFAULT_DEBOUNCE_SECONDS + assert handler.max_wait_seconds == DEFAULT_MAX_WAIT_SECONDS + + def test_is_relevant_filters_ignored_patterns( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler(mock_updater) + + # Should be ignored (directories in ignore patterns) + assert handler._is_relevant(str(tmp_path / ".git" / "config")) is False + assert handler._is_relevant(str(tmp_path / "node_modules" / "pkg.js")) is False + assert handler._is_relevant(str(tmp_path / "__pycache__" / "mod.pyc")) is False + + # Should be relevant + assert 
handler._is_relevant(str(tmp_path / "main.py")) is True + assert handler._is_relevant(str(tmp_path / "src" / "lib.rs")) is True + assert handler._is_relevant(str(tmp_path / "app.js")) is True + + def test_dispatch_ignores_directories( + self, mock_updater: MagicMock, mock_ingestor: MockQueryIngestor, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.1, max_wait_seconds=1 + ) + + # Create event that is marked as directory + event = FileModifiedEvent(str(tmp_path / "some_dir")) + # The is_directory property is set by watchdog based on the event type + # For FileModifiedEvent, we need to check is_directory attribute + object.__setattr__(event, "is_directory", True) + + handler.dispatch(event) + + # No timer should be created for directory events + assert len(handler.timers) == 0 + mock_ingestor.execute_write.assert_not_called() + + def test_debounce_batches_rapid_events( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + # Simulate 5 rapid saves + for _ in range(5): + event = FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + time.sleep(0.05) # 50ms between saves + + # Should have one pending event + assert len(handler.pending_events) == 1 + + # Wait for debounce to complete + time.sleep(0.4) + + # After debounce, ingestor should have been called only once + mock_ingestor.flush_all.assert_called_once() + + def test_no_debounce_processes_immediately( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0, max_wait_seconds=30 + ) + + event = 
FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + + # Should process immediately (no pending events) + assert len(handler.pending_events) == 0 + assert len(handler.timers) == 0 + mock_ingestor.flush_all.assert_called_once() + + def test_max_wait_forces_update( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.5, max_wait_seconds=0.3 + ) + + # First event + event = FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + + # Wait until max_wait is exceeded + time.sleep(0.4) + + # Second event should trigger immediate processing due to max_wait + event2 = FileModifiedEvent(str(sample_file)) + handler.dispatch(event2) + + # Give time for processing + time.sleep(0.15) + + # Should have processed at least once due to max_wait + assert mock_ingestor.flush_all.call_count >= 1 + + def test_different_files_tracked_separately( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + file1 = tmp_path / "file1.py" + file2 = tmp_path / "file2.py" + file1.write_text("# file 1") + file2.write_text("# file 2") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + # Events for different files + event1 = FileModifiedEvent(str(file1)) + event2 = FileModifiedEvent(str(file2)) + + handler.dispatch(event1) + handler.dispatch(event2) + + # Should have two pending events + assert len(handler.pending_events) == 2 + assert len(handler.timers) == 2 + + def test_timer_cleanup_after_processing( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.1, max_wait_seconds=5 + ) + + event = 
FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + + # Should have pending state + assert len(handler.pending_events) == 1 + assert len(handler.first_event_time) == 1 + + # Wait for processing + time.sleep(0.25) + + # State should be cleaned up + assert len(handler.pending_events) == 0 + assert len(handler.first_event_time) == 0 + assert len(handler.timers) == 0 + + def test_created_event_triggers_debounce( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + new_file = tmp_path / "new_file.py" + new_file.write_text("# new file") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + event = FileCreatedEvent(str(new_file)) + handler.dispatch(event) + + assert len(handler.pending_events) == 1 + + def test_deleted_event_triggers_debounce( + self, mock_updater: MagicMock, sample_file: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + event = FileDeletedEvent(str(sample_file)) + handler.dispatch(event) + + assert len(handler.pending_events) == 1 + + def test_thread_safety_concurrent_events( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=5.0, max_wait_seconds=30 + ) + + files = [tmp_path / f"file{i}.py" for i in range(10)] + for f in files: + f.write_text(f"# {f.name}") + + def send_events(file_path: Path) -> None: + for _ in range(5): + event = FileModifiedEvent(str(file_path)) + handler.dispatch(event) + time.sleep(0.02) + + # Send events from multiple threads + threads = [threading.Thread(target=send_events, args=(f,)) for f in files[:5]] + for t in threads: + t.start() + for t in threads: + t.join() + + # Should have 5 pending events (one per file) + assert 
len(handler.pending_events) == 5 + + +class TestDebounceValidation: + def test_validate_non_negative_float_accepts_zero(self) -> None: + from realtime_updater import _validate_non_negative_float + + assert _validate_non_negative_float(0) == 0 + assert _validate_non_negative_float(0.0) == 0.0 + + def test_validate_non_negative_float_accepts_positive(self) -> None: + from realtime_updater import _validate_non_negative_float + + assert _validate_non_negative_float(5) == 5 + assert _validate_non_negative_float(0.5) == 0.5 + assert _validate_non_negative_float(100) == 100 + + def test_validate_non_negative_float_rejects_negative(self) -> None: + import typer + + from realtime_updater import _validate_non_negative_float + + with pytest.raises(typer.BadParameter): + _validate_non_negative_float(-1) + + with pytest.raises(typer.BadParameter): + _validate_non_negative_float(-0.1) + + +class TestDebounceIntegration: + @pytest.fixture(autouse=True) + def _patch_ignore(self, monkeypatch: pytest.MonkeyPatch) -> None: + from codebase_rag import constants as cs + + patched = cs.IGNORE_PATTERNS - {"tmp"} + monkeypatch.setattr(cs, "IGNORE_PATTERNS", patched) + monkeypatch.setattr("realtime_updater.IGNORE_PATTERNS", patched) + + @pytest.fixture + def mock_ingestor(self) -> MockQueryIngestor: + return MockQueryIngestor() + + @pytest.fixture + def mock_updater( + self, tmp_path: Path, mock_ingestor: MockQueryIngestor + ) -> MagicMock: + updater = MagicMock() + updater.repo_path = tmp_path + updater.ingestor = mock_ingestor + updater.remove_file_from_state = MagicMock() + updater.factory = MagicMock() + updater.factory.definition_processor.process_file = MagicMock(return_value=None) + updater._process_function_calls = MagicMock() + updater.parsers = {} + updater.queries = {} + updater.ast_cache = {} + return updater + + def test_realistic_rapid_save_scenario( + self, mock_updater: MagicMock, mock_ingestor: MockQueryIngestor, tmp_path: Path + ) -> None: + """ + Simulate realistic rapid 
save scenario: + - User saves file 10 times over 3 seconds + - With 0.5s debounce and 2s max_wait, should result in ~2-4 updates + """ + from realtime_updater import CodeChangeEventHandler + + test_file = tmp_path / "editor.py" + test_file.write_text("# editing") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.5, max_wait_seconds=2 + ) + + # Simulate 10 saves over 3 seconds + for i in range(10): + event = FileModifiedEvent(str(test_file)) + handler.dispatch(event) + time.sleep(0.3) + + # Wait for final debounce + time.sleep(0.7) + + # Should have batched into fewer updates due to max_wait and debounce + # With max_wait=2s and 3s total time, expect ~2-4 updates + call_count = mock_ingestor.flush_all.call_count + assert 1 <= call_count <= 4, f"Expected 1-4 updates, got {call_count}" + + def test_single_edit_after_quiet_period( + self, mock_updater: MagicMock, mock_ingestor: MockQueryIngestor, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + test_file = tmp_path / "single.py" + test_file.write_text("# single edit") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.1, max_wait_seconds=5 + ) + + event = FileModifiedEvent(str(test_file)) + handler.dispatch(event) + + # Wait for debounce + time.sleep(0.25) + + # Should have exactly one update + mock_ingestor.flush_all.assert_called_once() diff --git a/codebase_rag/tests/test_realtime_event_filtering.py b/codebase_rag/tests/test_realtime_event_filtering.py new file mode 100644 index 000000000..68f641d93 --- /dev/null +++ b/codebase_rag/tests/test_realtime_event_filtering.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Protocol, runtime_checkable +from unittest.mock import MagicMock + +import pytest +from watchdog.events import ( + FileClosedNoWriteEvent, + FileCreatedEvent, + FileDeletedEvent, + FileModifiedEvent, + FileOpenedEvent, + FileSystemEvent, +) + +from codebase_rag import 
constants as cs +from realtime_updater import CodeChangeEventHandler + + +@runtime_checkable +class _AnyProtocol(Protocol): + pass + + +@pytest.fixture(autouse=True) +def _bypass_protocol_check(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("realtime_updater.QueryProtocol", _AnyProtocol) + + +@pytest.fixture +def handler(mock_updater: MagicMock) -> CodeChangeEventHandler: + h = CodeChangeEventHandler(mock_updater, debounce_seconds=0) + h.ignore_patterns = h.ignore_patterns - {"tmp", "temp"} + return h + + +def _make_event(event_type: str, src_path: str) -> FileSystemEvent: + ev = MagicMock(spec=FileSystemEvent) + ev.event_type = event_type + ev.src_path = src_path + ev.is_directory = False + return ev + + +class TestEventFiltering: + def test_modified_event_is_processed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "app.py" + f.write_text("x = 1", encoding="utf-8") + handler.dispatch(FileModifiedEvent(str(f))) + assert mock_updater.ingestor.execute_write.call_count == 3 + + def test_created_event_is_processed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "new.py" + f.write_text("y = 2", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + assert mock_updater.ingestor.execute_write.call_count == 3 + mock_updater.ingestor.flush_all.assert_called_once() + + def test_deleted_event_is_processed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "gone.py" + handler.dispatch(FileDeletedEvent(str(f))) + assert mock_updater.ingestor.execute_write.call_count == 3 + mock_updater.factory.definition_processor.process_file.assert_not_called() + mock_updater.factory.structure_processor.process_generic_file.assert_not_called() + + def test_opened_event_is_ignored( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: 
+ f = temp_repo / "read_only.py" + f.touch() + handler.dispatch(FileOpenedEvent(str(f))) + mock_updater.ingestor.execute_write.assert_not_called() + mock_updater.ingestor.flush_all.assert_not_called() + + def test_closed_no_write_event_is_ignored( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "viewed.py" + f.touch() + handler.dispatch(FileClosedNoWriteEvent(str(f))) + mock_updater.ingestor.execute_write.assert_not_called() + mock_updater.ingestor.flush_all.assert_not_called() + + def test_access_event_is_ignored( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "accessed.py" + f.touch() + ev = _make_event("access", str(f)) + handler.dispatch(ev) + mock_updater.ingestor.execute_write.assert_not_called() + mock_updater.ingestor.flush_all.assert_not_called() + + +class TestNonCodeFileHandling: + def test_markdown_file_creates_file_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "readme.md" + f.write_text("# Title", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + f, "readme.md" + ) + + def test_json_file_creates_file_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "config.json" + f.write_text("{}", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + f, "config.json" + ) + + def test_non_code_file_deletion_removes_file_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "notes.md" + handler.dispatch(FileDeletedEvent(str(f))) + delete_file_calls = [ + c + for c in mock_updater.ingestor.execute_write.call_args_list + if c.args[0] == 
cs.CYPHER_DELETE_FILE + ] + assert len(delete_file_calls) == 1 + assert delete_file_calls[0].args[1] == { + cs.KEY_PATH: "notes.md", + } + mock_updater.factory.structure_processor.process_generic_file.assert_not_called() + + def test_non_code_file_has_no_module_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "data.md" + f.write_text("text", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + mock_updater.factory.definition_processor.process_file.assert_not_called() + + +class TestMixedEventSequences: + def test_rapid_create_modify_delete( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "ephemeral.py" + f.write_text("a = 1", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + + mock_updater.ingestor.reset_mock() + mock_updater.factory.reset_mock() + f.write_text("a = 2", encoding="utf-8") + handler.dispatch(FileModifiedEvent(str(f))) + + mock_updater.ingestor.reset_mock() + mock_updater.factory.reset_mock() + handler.dispatch(FileDeletedEvent(str(f))) + + # (H) After delete, no re-parse or file node creation + mock_updater.factory.definition_processor.process_file.assert_not_called() + mock_updater.factory.structure_processor.process_generic_file.assert_not_called() + assert mock_updater.ingestor.execute_write.call_count == 3 + mock_updater.ingestor.flush_all.assert_called_once() + + def test_multiple_files_changed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f1 = temp_repo / "a.py" + f2 = temp_repo / "b.py" + f1.write_text("x = 1", encoding="utf-8") + f2.write_text("y = 2", encoding="utf-8") + + handler.dispatch(FileModifiedEvent(str(f1))) + handler.dispatch(FileModifiedEvent(str(f2))) + + assert mock_updater.ingestor.execute_write.call_count == 6 + assert mock_updater.ingestor.flush_all.call_count == 2 + + +class TestCypherDeleteFileQuery: + def 
test_delete_file_only_targets_specific_path( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f1 = temp_repo / "keep.py" + f2 = temp_repo / "remove.py" + f1.write_text("a = 1", encoding="utf-8") + + handler.dispatch(FileDeletedEvent(str(f2))) + + delete_file_calls = [ + c + for c in mock_updater.ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + ] + assert len(delete_file_calls) == 1 + assert delete_file_calls[0].args[1] == {cs.KEY_PATH: "remove.py"} + + delete_module_calls = [ + c + for c in mock_updater.ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + assert len(delete_module_calls) == 1 + assert delete_module_calls[0].args[1] == {cs.KEY_PATH: "remove.py"} diff --git a/codebase_rag/tests/test_realtime_updater.py b/codebase_rag/tests/test_realtime_updater.py index c53b5b6ae..fdf1b604a 100644 --- a/codebase_rag/tests/test_realtime_updater.py +++ b/codebase_rag/tests/test_realtime_updater.py @@ -1,4 +1,7 @@ +from __future__ import annotations + from pathlib import Path +from typing import Protocol, runtime_checkable from unittest.mock import MagicMock import pytest @@ -12,10 +15,21 @@ from realtime_updater import CodeChangeEventHandler +@runtime_checkable +class _AnyProtocol(Protocol): + pass + + +@pytest.fixture(autouse=True) +def _bypass_protocol_check(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("realtime_updater.QueryProtocol", _AnyProtocol) + + @pytest.fixture def event_handler(mock_updater: MagicMock) -> CodeChangeEventHandler: - """Provides a CodeChangeEventHandler instance with a mocked updater.""" - return CodeChangeEventHandler(mock_updater) + handler = CodeChangeEventHandler(mock_updater, debounce_seconds=0) + handler.ignore_patterns = handler.ignore_patterns - {"tmp", "temp"} + return handler def test_file_creation_flow( @@ -28,7 +42,8 @@ def test_file_creation_flow( event_handler.dispatch(event) - assert 
mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_called_once_with( test_file, "python", @@ -48,7 +63,8 @@ def test_file_modification_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_called_once_with( test_file, "python", @@ -67,7 +83,8 @@ def test_file_deletion_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_not_called() mock_updater.ingestor.flush_all.assert_called_once() @@ -103,16 +120,22 @@ def test_directory_creation_is_ignored( mock_updater.ingestor.flush_all.assert_not_called() -def test_unsupported_file_types_are_ignored( +def test_non_code_files_create_file_nodes( event_handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path ) -> None: - """Test that changing an unsupported file type is ignored after deletion query.""" - unsupported_file = temp_repo / "document.md" - unsupported_file.write_text(encoding="utf-8", data="# Markdown file") - event = FileModifiedEvent(str(unsupported_file)) + """Test that non-code files (like .md) create File nodes but skip AST parsing.""" + non_code_file = temp_repo / "document.md" + non_code_file.write_text(encoding="utf-8", data="# Markdown file") + event = FileModifiedEvent(str(non_code_file)) event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, 
DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 + # (H) AST parsing is skipped for non-code files mock_updater.factory.definition_processor.process_file.assert_not_called() + # (H) But File node creation IS called for all file types + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + non_code_file, "document.md" + ) mock_updater.ingestor.flush_all.assert_called_once() diff --git a/codebase_rag/tests/test_reconcile_embeddings.py b/codebase_rag/tests/test_reconcile_embeddings.py new file mode 100644 index 000000000..0e69f646e --- /dev/null +++ b/codebase_rag/tests/test_reconcile_embeddings.py @@ -0,0 +1,94 @@ +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from loguru import logger + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.services.graph_service import MemgraphIngestor + + +@pytest.fixture +def updater(temp_repo: Path) -> GraphUpdater: + mock = MagicMock(spec=MemgraphIngestor) + mock.fetch_all = MagicMock(return_value=[]) + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def log_messages() -> Generator[list[str], None, None]: + messages: list[str] = [] + handler_id = logger.add(lambda msg: messages.append(str(msg)), level="DEBUG") + yield messages + logger.remove(handler_id) + + +class TestReconcileEmbeddings: + def test_noop_when_expected_empty(self, updater: GraphUpdater) -> None: + mock_fn = MagicMock() + updater._reconcile_embeddings(set(), mock_fn) + mock_fn.assert_not_called() + + def test_logs_ok_when_all_found( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = {1, 2, 3} + mock_fn = MagicMock(return_value={1, 2, 3}) + + updater._reconcile_embeddings(expected, mock_fn) + + 
mock_fn.assert_called_once_with(expected) + combined = "\n".join(log_messages) + assert "all 3 expected embeddings found" in combined + + def test_logs_warning_when_ids_missing( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = {1, 2, 3, 4, 5} + mock_fn = MagicMock(return_value={1, 3}) + + updater._reconcile_embeddings(expected, mock_fn) + + combined = "\n".join(log_messages) + assert "3 of 5 embeddings missing" in combined + + def test_sample_ids_in_warning( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = {10, 20, 30} + mock_fn = MagicMock(return_value={10}) + + updater._reconcile_embeddings(expected, mock_fn) + + combined = "\n".join(log_messages) + assert "20" in combined + assert "30" in combined + + def test_handles_verify_fn_exception( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + mock_fn = MagicMock(side_effect=RuntimeError("connection lost")) + + updater._reconcile_embeddings({1, 2}, mock_fn) + + combined = "\n".join(log_messages).lower() + assert "reconciliation check failed" in combined + + def test_sample_limited_to_ten( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = set(range(20)) + mock_fn = MagicMock(return_value=set()) + + updater._reconcile_embeddings(expected, mock_fn) + + combined = "\n".join(log_messages) + assert "20 of 20 embeddings missing" in combined diff --git a/codebase_rag/tests/test_reexport_chain_resolution.py b/codebase_rag/tests/test_reexport_chain_resolution.py new file mode 100644 index 000000000..b9a6a8d65 --- /dev/null +++ b/codebase_rag/tests/test_reexport_chain_resolution.py @@ -0,0 +1,110 @@ +# (H) L3 finding from the evals/ harness: TypeInferenceEngine.build_local_variable_type_map +# (H) calls self.python_type_inference.build_local_variable_type_map(...), where the +# (H) python_type_inference property returns PythonTypeInferenceEngine imported via a +# (H) package re-export (from .py import 
PythonTypeInferenceEngine). The caller's import +# (H) map points the name at the re-export module, not the class's real definition, so +# (H) the chained method must follow the re-export hop to resolve to the concrete class +# (H) rather than collapsing to an ambiguous same-named method (the caller itself). +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +# (H) PythonEngine lives in pkg/py/engine.py and is re-exported from pkg/py/__init__.py. +# (H) A sibling JsEngine.build_map exists so the bare name is ambiguous in the trie. +FILES = { + "pkg/__init__.py": "", + "pkg/py/__init__.py": "from .engine import PythonEngine\n\n__all__ = ['PythonEngine']\n", + "pkg/py/engine.py": ( + "class PythonEngine:\n def build_map(self, node):\n return {}\n" + ), + "pkg/js_engine.py": ( + "class JsEngine:\n def build_map(self, node):\n return {}\n" + ), + "pkg/dispatch.py": ( + "from .py import PythonEngine\n\n\n" + "class Dispatch:\n" + " def __init__(self) -> None:\n" + " self._python_engine = None\n\n" + " @property\n" + " def python_engine(self) -> PythonEngine:\n" + " if self._python_engine is None:\n" + " self._python_engine = PythonEngine()\n" + " return self._python_engine\n\n" + " def build_map(self, node):\n" + " return self.python_engine.build_map(node)\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], 
str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestReexportChainResolution: + def test_property_typed_by_reexport_resolves_to_real_class( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.dispatch.Dispatch.build_map", + "proj.pkg.py.engine.PythonEngine.build_map", + ) in calls, calls + + def test_does_not_collapse_to_caller_same_named_method( + self, tmp_path: Path + ) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.dispatch.Dispatch.build_map", + "proj.pkg.dispatch.Dispatch.build_map", + ) not in calls, calls diff --git a/codebase_rag/tests/test_relative_import_package_init.py b/codebase_rag/tests/test_relative_import_package_init.py new file mode 100644 index 000000000..d6b12a8be --- /dev/null +++ b/codebase_rag/tests/test_relative_import_package_init.py @@ -0,0 +1,72 @@ +# (H) L2 residual from the evals/ harness: relative imports inside an __init__.py +# (H) resolved one level too high. A package's qualified name IS the package, so +# (H) `from . import sub` in pkg/__init__.py must target pkg.sub, not the parent. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _import_edges( + tmp_path: Path, +) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "__init__.py").touch() + pkg = tmp_path / "pkg" + pkg.mkdir() + pkg.joinpath("__init__.py").write_text("from . 
import sub\n\nuse = sub\n") + pkg.joinpath("sub.py").write_text("X = 1\n") + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.IMPORTS + } + + +class TestRelativeImportPackageInit: + def test_from_dot_import_in_package_init_targets_own_submodule( + self, tmp_path: Path + ) -> None: + edges = _import_edges(tmp_path) + assert ("proj.pkg", "proj.pkg.sub") in edges, edges + assert ("proj.pkg", "proj.sub") not in edges, edges diff --git a/codebase_rag/tests/test_relative_import_root_level.py b/codebase_rag/tests/test_relative_import_root_level.py new file mode 100644 index 000000000..68146e489 --- /dev/null +++ b/codebase_rag/tests/test_relative_import_root_level.py @@ -0,0 +1,70 @@ +# (H) L2 finding from the evals/ harness: `from . import <submodule>` at the +# (H) package root (e.g. cli.py doing `from . import constants as cs`) produced +# (H) no IMPORTS edge, because relative-import resolution dropped the project +# (H) name and computed an empty base module. In a subpackage it worked. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _imports( + tmp_path: Path, importer: str, src: str +) -> set[tuple[PropertyValue, PropertyValue]]: + (tmp_path / "__init__.py").touch() + (tmp_path / "constants.py").write_text("X = 1\n") + (tmp_path / importer).write_text(src) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.IMPORTS + } + + +class TestRelativeImportRootLevel: + def test_from_dot_import_submodule_at_root(self, tmp_path: Path) -> None: + edges = _imports( + tmp_path, "cli.py", "from . 
import constants as cs\n\nuse = cs\n" + ) + assert ("proj.cli", "proj.constants") in edges, edges diff --git a/codebase_rag/tests/test_rust.py b/codebase_rag/tests/test_rust.py index 0751458e6..14f534809 100644 --- a/codebase_rag/tests/test_rust.py +++ b/codebase_rag/tests/test_rust.py @@ -302,25 +302,43 @@ def test_rust_structs_enums_unions( project_name = rust_project.name - expected_classes = [ + expected_structs = [ f"{project_name}.types.Point", f"{project_name}.types.Color", f"{project_name}.types.Unit", f"{project_name}.types.Container", f"{project_name}.types.Borrowed", f"{project_name}.types.GenericBorrowed", + ] + + created_classes = get_node_names(mock_ingestor, "Class") + + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" + ) + + expected_enums = [ f"{project_name}.types.Direction", f"{project_name}.types.Message", f"{project_name}.types.Option", f"{project_name}.types.Cow", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + + expected_unions = [ f"{project_name}.types.FloatOrInt", ] - created_classes = get_node_names(mock_ingestor, "Class") + created_unions = get_node_names(mock_ingestor, "Union") - missing_classes = set(expected_classes) - created_classes - assert not missing_classes, ( - f"Missing expected types: {sorted(list(missing_classes))}" + missing_unions = set(expected_unions) - created_unions + assert not missing_unions, ( + f"Missing expected unions: {sorted(list(missing_unions))}" ) expected_methods = [ @@ -495,6 +513,13 @@ def test_rust_traits_and_implementations( f"{project_name}.traits.Drawable", ] + created_interfaces = get_node_names(mock_ingestor, "Interface") + + missing_traits = set(expected_traits) - created_interfaces + assert not missing_traits, ( + f"Missing expected traits: 
{sorted(list(missing_traits))}" + ) + expected_structs = [ f"{project_name}.traits.Point", f"{project_name}.traits.Circle", @@ -502,10 +527,9 @@ def test_rust_traits_and_implementations( created_classes = get_node_names(mock_ingestor, "Class") - all_expected = expected_traits + expected_structs - missing_classes = set(all_expected) - created_classes - assert not missing_classes, ( - f"Missing expected traits/structs: {sorted(list(missing_classes))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" ) expected_methods = [ @@ -1059,19 +1083,27 @@ def test_rust_pattern_matching( project_name = rust_project.name - expected_types = [ - f"{project_name}.pattern_matching.Color", - f"{project_name}.pattern_matching.Message", + expected_structs = [ f"{project_name}.pattern_matching.Point", ] created_classes = get_node_names(mock_ingestor, "Class") - found_types = set(expected_types) & created_classes - assert len(found_types) >= 3, ( - f"Expected at least 3 types, found: {sorted(list(found_types))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" ) + expected_enums = [ + f"{project_name}.pattern_matching.Color", + f"{project_name}.pattern_matching.Message", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + expected_functions = [ f"{project_name}.pattern_matching.match_color", f"{project_name}.pattern_matching.match_with_guards", @@ -1535,19 +1567,25 @@ def test_rust_macros( ) expected_structs = [ - f"{project_name}.macros.Person", - f"{project_name}.macros.Point", f"{project_name}.macros.MacroStruct", - f"{project_name}.macros.MacroEnum", ] created_classes = get_node_names(mock_ingestor, "Class") - found_structs = 
set(expected_structs) & created_classes - assert len(found_structs) >= 2, ( - f"Expected at least 2 macro structs, found: {sorted(list(found_structs))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" ) + expected_enums = [ + f"{project_name}.macros.MacroEnum", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + def test_rust_imports_and_use_statements( rust_project: Path, @@ -2050,9 +2088,9 @@ def test_rust_error_handling( f"{project_name}.error_handling.CustomError", ] - created_classes = get_node_names(mock_ingestor, "Class") + created_enums = get_node_names(mock_ingestor, "Enum") - found_enums = set(expected_enums) & created_classes + found_enums = set(expected_enums) & created_enums assert len(found_enums) >= 1, ( f"Expected at least 1 custom error enum, found: {sorted(list(found_enums))}" ) @@ -2403,18 +2441,36 @@ def test_rust_comprehensive_integration( project_name = rust_project.name - expected_types = [ + expected_structs = [ f"{project_name}.comprehensive.User", - f"{project_name}.comprehensive.RepositoryError", f"{project_name}.comprehensive.UserRepository", - f"{project_name}.comprehensive.Repository", ] created_classes = get_node_names(mock_ingestor, "Class") - found_types = set(expected_types) & created_classes - assert len(found_types) >= 3, ( - f"Expected at least 3 comprehensive types, found: {sorted(list(found_types))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" + ) + + expected_enums = [ + f"{project_name}.comprehensive.RepositoryError", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, 
f"Missing expected enums: {sorted(list(missing_enums))}" + + expected_interfaces = [ + f"{project_name}.comprehensive.Repository", + ] + + created_interfaces = get_node_names(mock_ingestor, "Interface") + + missing_interfaces = set(expected_interfaces) - created_interfaces + assert not missing_interfaces, ( + f"Missing expected traits: {sorted(list(missing_interfaces))}" ) diff --git a/codebase_rag/tests/test_rust_call_recall.py b/codebase_rag/tests/test_rust_call_recall.py new file mode 100644 index 000000000..e0876373c --- /dev/null +++ b/codebase_rag/tests/test_rust_call_recall.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import run_updater + + +def _calls(mock_ingestor: MagicMock) -> set[tuple[str, str]]: + out: set[tuple[str, str]] = set() + for c in mock_ingestor.ensure_relationship_batch.call_args_list: + if c.args[1] == cs.RelationshipType.CALLS: + out.add((c.args[0][2], c.args[2][2])) + return out + + +class TestRustTurbofishCalls: + def test_turbofish_call_is_captured( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "tf.rs").write_text( + "fn generic_function(value: T) -> T { value }\n" + "\n" + "fn caller() {\n" + " let _ = generic_function::(10);\n" + "}\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="rust") + calls = _calls(mock_ingestor) + + assert any( + caller.endswith(".caller") and callee.endswith(".generic_function") + for caller, callee in calls + ), f"turbofish call not captured; calls={sorted(calls)}" + + +class TestRustMacroCalls: + def test_call_inside_macro_is_captured( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "mac.rs").write_text( + "fn describe(x: i32) -> i32 { x }\n" + "\n" + "fn caller() {\n" + ' println!("{}", describe(5));\n' + "}\n", + encoding="utf-8", + ) + + 
run_updater(temp_repo, mock_ingestor, skip_if_missing="rust") + calls = _calls(mock_ingestor) + + assert any( + caller.endswith(".caller") and callee.endswith(".describe") + for caller, callee in calls + ), f"macro-internal call not captured; calls={sorted(calls)}" + + def test_bare_identifier_in_macro_is_not_a_call( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) a plain value interpolated into a macro must not become a CALLS edge + (temp_repo / "mac2.rs").write_text( + "fn value() -> i32 { 1 }\n" + "\n" + "fn caller() {\n" + " let value = 5;\n" + ' println!("{}", value);\n' + "}\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="rust") + calls = _calls(mock_ingestor) + + assert not any( + caller.endswith(".caller") and callee.endswith(".value") + for caller, callee in calls + ), f"bare identifier wrongly captured as call; calls={sorted(calls)}" + + def test_struct_literal_in_macro_is_not_a_call( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + # (H) `Widget { ... }` (token_tree starting with `{`) and `arr[..]` (starting + # (H) with `[`) inside a macro are not calls; only `name(...)` is. + (temp_repo / "mac3.rs").write_text( + "struct Widget { n: i32 }\n" + "fn helper() -> i32 { 1 }\n" + "\n" + "fn caller() {\n" + ' println!("{}", Widget { n: helper() }.n);\n' + "}\n", + encoding="utf-8", + ) + + run_updater(temp_repo, mock_ingestor, skip_if_missing="rust") + calls = _calls(mock_ingestor) + + # (H) the real call inside the macro is still captured + assert any( + caller.endswith(".caller") and callee.endswith(".helper") + for caller, callee in calls + ), f"macro call not captured; calls={sorted(calls)}" + # (H) the struct literal `Widget { ... 
}` must not be a call + assert not any( + caller.endswith(".caller") and callee.endswith(".Widget") + for caller, callee in calls + ), f"struct literal wrongly captured as call; calls={sorted(calls)}" diff --git a/codebase_rag/tests/test_rust_closure_containment_oracle.py b/codebase_rag/tests/test_rust_closure_containment_oracle.py new file mode 100644 index 000000000..2e4666a33 --- /dev/null +++ b/codebase_rag/tests/test_rust_closure_containment_oracle.py @@ -0,0 +1,73 @@ +# (H) Covers Rust closure containment: a closure is DEFINEd by its nearest +# (H) enclosing function-like scope (impl/trait method -> Method, free fn or outer +# (H) closure -> Function). cgr routes closures through its free-function path; the +# (H) syn oracle (evals/oracles/rs_oracle) emits the matching DEFINES via a stack +# (H) of enclosing function-likes. Joined on (kind, file, line) endpoints. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_rust_graph +from evals.oracles import run_rust_oracle, rust_available +from evals.score import score_edge_types + +RS_SRC = """\ +pub struct Foo; + +impl Foo { + pub fn run(&self) -> i32 { + let c = |x: i32| x + 1; + let nested = || { + let inner = |z: i32| z * 2; + inner(5) + }; + c(2) + nested() + } +} + +pub trait Bar { + fn act(&self) -> i32 { + let t = |q: i32| q - 1; + t(9) + } +} + +pub fn free() -> i32 { + let d = |y: i32| y + 2; + d(3) +} +""" + + +def _require_rust() -> None: + if not rust_available(): + pytest.skip("cargo toolchain not available") + if cs.SupportedLanguage.RUST not in load_parsers()[0]: + pytest.skip("rust parser not available") + + +def test_cgr_matches_syn_oracle_on_closure_containment(tmp_path: Path) -> None: + _require_rust() + project = tmp_path / "rs_clo_edge" + (project / "src").mkdir(parents=True) + (project / 
"Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_clo_edge"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(RS_SRC, encoding="utf-8") + + cgr = extract_cgr_rust_graph(project, project.name) + oracle = run_rust_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + row = by_label.get(cs.RelationshipType.DEFINES.value) + assert row is not None, (by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (row, result.diff) + # (H) The method-nested closures must contribute resolvable DEFINES edges, + # (H) not just the free-function one (the gap this fix closes). + assert row["tp"] >= 5, (row, result.diff) diff --git a/codebase_rag/tests/test_rust_closure_method_defines.py b/codebase_rag/tests/test_rust_closure_method_defines.py new file mode 100644 index 000000000..e46722b83 --- /dev/null +++ b/codebase_rag/tests/test_rust_closure_method_defines.py @@ -0,0 +1,84 @@ +# (H) Rust closures nested in an impl-method body must get a DEFINES edge from +# (H) the enclosing METHOD, exactly as closures in free functions get one from +# (H) the enclosing function. cgr used to derive the closure's DEFINES parent via +# (H) the FQN scope walk, which could not read an impl block's target type, so the +# (H) parent endpoint dropped the impl target (`lib.run` instead of `lib.Foo.run`) +# (H) and never matched the real Method node, silently dropping the containment. 
+from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import KEY_QUALIFIED_NAME, NodeLabel, RelationshipType +from codebase_rag.tests.conftest import ( + create_and_run_updater, + get_nodes, + get_relationships, +) + +_RS = """pub struct Foo; + +impl Foo { + pub fn run(&self) -> i32 { + let c = |x: i32| x + 1; + c(2) + } +} + +pub fn free() -> i32 { + let d = |y: i32| y + 2; + d(3) +} +""" + + +def _project(temp_repo: Path) -> Path: + project = temp_repo / "rs_clo" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_clo"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(encoding="utf-8", data=_RS) + return project + + +def _defines_pairs(mock_ingestor: MagicMock) -> set[tuple[str, str, str]]: + # (H) (parent_label, parent_qn, child_qn) for DEFINES edges. + return { + (call[0][0][0], call[0][0][2], call[0][2][2]) + for call in get_relationships(mock_ingestor, RelationshipType.DEFINES.value) + } + + +def test_rust_closure_in_impl_method_defined_by_method( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(_project(temp_repo), mock_ingestor, skip_if_missing="rust") + file_mod = "rs_clo.src.lib" + + method_qns = { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.METHOD) + } + assert f"{file_mod}.Foo.run" in method_qns, method_qns + + function_qns = { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.FUNCTION) + } + + pairs = _defines_pairs(mock_ingestor) + # (H) Every DEFINES edge's parent endpoint must resolve to a real node; + # (H) the method-closure edge used to point at the phantom `lib.run`. 
+ method_defines = { + (parent_qn, child_qn) + for (parent_label, parent_qn, child_qn) in pairs + if parent_label == NodeLabel.METHOD.value + } + assert method_defines, pairs + closure_child = next( + child_qn + for (parent_qn, child_qn) in method_defines + if parent_qn == f"{file_mod}.Foo.run" + ) + assert closure_child in function_qns, (closure_child, function_qns) diff --git a/codebase_rag/tests/test_rust_containment_oracle.py b/codebase_rag/tests/test_rust_containment_oracle.py new file mode 100644 index 000000000..9c0820b58 --- /dev/null +++ b/codebase_rag/tests/test_rust_containment_oracle.py @@ -0,0 +1,89 @@ +# (H) Covers Rust containment-edge validation: cgr's DEFINES (module -> item / +# (H) nested module) and DEFINES_METHOD (struct/trait -> method) edges are graded +# (H) against the independent syn oracle (evals/oracles/rs_oracle), joined on +# (H) (kind, file, line) endpoints. Exercises an inherent impl, a trait method, +# (H) and an impl inside a nested `mod` (cross-module type resolution). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_rust_graph +from evals.oracles import run_rust_oracle, rust_available +from evals.score import score_edge_types + +RS_SRC = """\ +pub trait Shape { + fn area(&self) -> f64 { 0.0 } +} + +pub struct Point { + x: i32, +} + +impl Point { + pub fn new() -> Point { + Point { x: 0 } + } +} + +impl Shape for Point { + fn area(&self) -> f64 { + 1.0 + } +} + +pub fn free() -> i32 { + 1 +} + +pub mod inner { + pub struct Widget { + w: i32, + } + + impl Widget { + pub fn build(&self) -> i32 { + self.w + } + } +} +""" + + +def _require_rust() -> None: + if not rust_available(): + pytest.skip("cargo toolchain not available") + if cs.SupportedLanguage.RUST not in load_parsers()[0]: + pytest.skip("rust parser not available") + + +def test_cgr_matches_syn_oracle_on_containment_edges(tmp_path: Path) -> None: + _require_rust() + project = tmp_path / "rs_edge" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_edge"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(RS_SRC, encoding="utf-8") + + cgr = extract_cgr_rust_graph(project, project.name) + oracle = run_rust_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.DEFINES.value, + cs.RelationshipType.DEFINES_METHOD.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_rust_impl_primitive_target.py b/codebase_rag/tests/test_rust_impl_primitive_target.py new file mode 100644 index 
000000000..a6be79f62 --- /dev/null +++ b/codebase_rag/tests/test_rust_impl_primitive_target.py @@ -0,0 +1,44 @@ +# (H) Regression: methods in an `impl Trait for <primitive>` block (e.g. +# (H) `impl From<Foo> for u8`) must be captured. The impl target `u8` is a +# (H) `primitive_type` node, which extract_impl_target did not recognise, so every +# (H) method in such a block was silently dropped. +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import KEY_QUALIFIED_NAME, NodeLabel +from codebase_rag.tests.conftest import create_and_run_updater, get_nodes + + +def test_rust_method_on_primitive_impl_target_is_captured( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "rs_prim" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_prim"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text( + encoding="utf-8", + data="""pub enum Foo { A, B } + +impl From for u8 { + fn from(value: Foo) -> Self { + match value { + Foo::A => 0, + Foo::B => 1, + } + } +} +""", + ) + create_and_run_updater(project, mock_ingestor, skip_if_missing="rust") + + method_qns = { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.METHOD) + } + assert any(qn.endswith(".u8.from") for qn in method_qns), ( + f"from() on impl-for-u8 not captured: {method_qns}" + ) diff --git a/codebase_rag/tests/test_rust_inheritance_edges.py b/codebase_rag/tests/test_rust_inheritance_edges.py new file mode 100644 index 000000000..88bd34c58 --- /dev/null +++ b/codebase_rag/tests/test_rust_inheritance_edges.py @@ -0,0 +1,49 @@ +# (H) Rust inheritance was uncaptured: `impl Trait for Type` means Type +# (H) IMPLEMENTS Trait, and a supertrait bound `trait Sub: Super` means Sub +# (H) INHERITS Super. cgr emitted neither (impl blocks and trait bounds were +# (H) never turned into inheritance edges). 
+from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import RelationshipType +from codebase_rag.tests.conftest import create_and_run_updater, get_relationships + +_RS = """\ +pub trait Shape {} +pub trait Drawable: Shape {} + +pub struct Circle; + +impl Shape for Circle {} +impl Drawable for Circle {} +""" + + +def _pairs(mock_ingestor: MagicMock, rel: str) -> set[tuple[str, str]]: + return { + (call[0][0][2], call[0][2][2]) for call in get_relationships(mock_ingestor, rel) + } + + +def test_rust_impl_and_supertrait_edges( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "rs_inh" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_inh"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(encoding="utf-8", data=_RS) + create_and_run_updater(project, mock_ingestor, skip_if_missing="rust") + + inherits = _pairs(mock_ingestor, RelationshipType.INHERITS.value) + implements = _pairs(mock_ingestor, RelationshipType.IMPLEMENTS.value) + base = "rs_inh.src.lib" + + # (H) impl Trait for Type -> Type IMPLEMENTS Trait. + assert (f"{base}.Circle", f"{base}.Shape") in implements, implements + assert (f"{base}.Circle", f"{base}.Drawable") in implements, implements + # (H) Supertrait bound -> Sub INHERITS Super. + assert (f"{base}.Drawable", f"{base}.Shape") in inherits, inherits diff --git a/codebase_rag/tests/test_rust_inheritance_oracle.py b/codebase_rag/tests/test_rust_inheritance_oracle.py new file mode 100644 index 000000000..3204a224e --- /dev/null +++ b/codebase_rag/tests/test_rust_inheritance_oracle.py @@ -0,0 +1,59 @@ +# (H) Covers Rust inheritance-edge validation: cgr's INHERITS (supertrait bound) +# (H) and IMPLEMENTS (`impl Trait for Type`) edges are graded against the syn +# (H) oracle, by (source node, base SIMPLE NAME). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_rust_graph +from evals.oracles import run_rust_oracle, rust_available +from evals.score import score_name_edge_types + +RS_SRC = """\ +pub trait Shape {} +pub trait Drawable: Shape {} + +pub struct Circle; + +impl Shape for Circle {} +impl Drawable for Circle {} +""" + + +def _require_rust() -> None: + if not rust_available(): + pytest.skip("cargo toolchain not available") + if cs.SupportedLanguage.RUST not in load_parsers()[0]: + pytest.skip("rust parser not available") + + +def test_cgr_matches_syn_oracle_on_inheritance_edges(tmp_path: Path) -> None: + _require_rust() + project = tmp_path / "rs_inh_edge" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_inh_edge"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(RS_SRC, encoding="utf-8") + + cgr = extract_cgr_rust_graph(project, project.name) + oracle = run_rust_oracle(project) + + result = score_name_edge_types(cgr, oracle, ec.INHERITANCE_NAME_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.INHERITS.value, + cs.RelationshipType.IMPLEMENTS.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_rust_nested_module_containment.py b/codebase_rag/tests/test_rust_nested_module_containment.py new file mode 100644 index 000000000..7a924ed4c --- /dev/null +++ b/codebase_rag/tests/test_rust_nested_module_containment.py @@ -0,0 +1,85 @@ +# (H) Rust nested-module containment. 
cgr qualifies items inside `mod inner` +# (H) with the module path (proj...inner.X), but used to (a) DEFINE them from the +# (H) FILE module while leaving the inner Module node an orphan, and (b) qualify +# (H) impl methods inside the mod against the file module, producing a phantom +# (H) DEFINES_METHOD parent that never matched the real type node. Containment +# (H) must be module-nested: file module -> inner module -> its items, and an +# (H) impl method binds to the type under its enclosing module path. +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import KEY_QUALIFIED_NAME, NodeLabel, RelationshipType +from codebase_rag.tests.conftest import ( + create_and_run_updater, + get_nodes, + get_relationships, +) + +_RS = """pub mod inner { + pub fn helper() -> i32 { 1 } + + pub struct Widget { w: i32 } + + impl Widget { + pub fn build(&self) -> i32 { self.w } + } +} +""" + + +def _project(temp_repo: Path) -> Path: + project = temp_repo / "rs_mod" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_mod"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(encoding="utf-8", data=_RS) + return project + + +def _defines_pairs(mock_ingestor: MagicMock) -> set[tuple[str, str, str]]: + # (H) (parent_label, parent_qn, child_qn) for DEFINES edges. + return { + (call[0][0][0], call[0][0][2], call[0][2][2]) + for call in get_relationships(mock_ingestor, RelationshipType.DEFINES.value) + } + + +def test_rust_nested_module_is_module_nested( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(_project(temp_repo), mock_ingestor, skip_if_missing="rust") + file_mod = "rs_mod.src.lib" + inner = f"{file_mod}.inner" + pairs = _defines_pairs(mock_ingestor) + + # (H) file module DEFINES the inner module (no longer an orphan node). 
+ assert (NodeLabel.MODULE.value, file_mod, inner) in pairs, pairs + # (H) inner module DEFINES its own items, not the file module. + assert (NodeLabel.MODULE.value, inner, f"{inner}.helper") in pairs, pairs + assert (NodeLabel.MODULE.value, inner, f"{inner}.Widget") in pairs, pairs + + +def test_rust_impl_method_in_module_binds_to_nested_type( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + create_and_run_updater(_project(temp_repo), mock_ingestor, skip_if_missing="rust") + inner = "rs_mod.src.lib.inner" + + method_qns = { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.METHOD) + } + assert f"{inner}.Widget.build" in method_qns, method_qns + + defines_method = { + (call[0][0][2], call[0][2][2]) + for call in get_relationships( + mock_ingestor, RelationshipType.DEFINES_METHOD.value + ) + } + assert (f"{inner}.Widget", f"{inner}.Widget.build") in defines_method, ( + defines_method + ) diff --git a/codebase_rag/tests/test_rust_node_type.py b/codebase_rag/tests/test_rust_node_type.py new file mode 100644 index 000000000..edfa95e13 --- /dev/null +++ b/codebase_rag/tests/test_rust_node_type.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parsers.class_ingest.node_type import determine_node_type +from codebase_rag.tests.conftest import ( + create_mock_node, + get_node_names, + run_updater, +) +from codebase_rag.types_defs import NodeType + + +@pytest.mark.parametrize( + ("ts_node_type", "expected"), + [ + (cs.TS_RS_ENUM_ITEM, NodeType.ENUM), + (cs.TS_RS_TRAIT_ITEM, NodeType.INTERFACE), + (cs.TS_RS_TYPE_ITEM, NodeType.TYPE), + (cs.TS_RS_UNION_ITEM, NodeType.UNION), + (cs.TS_RS_STRUCT_ITEM, NodeType.CLASS), + ], +) +def test_determine_node_type_rust(ts_node_type: str, expected: NodeType) -> None: + node = create_mock_node(ts_node_type) + result = 
determine_node_type(node, "Foo", "crate::Foo", cs.SupportedLanguage.RUST) + assert result == expected + + +@pytest.fixture +def rust_node_type_project(temp_repo: Path) -> Path: + project_path = temp_repo / "rust_node_type_test" + project_path.mkdir() + (project_path / "Cargo.toml").write_text( + encoding="utf-8", + data='[package]\nname = "rust_node_type_test"\nversion = "0.1.0"\n', + ) + (project_path / "src").mkdir() + (project_path / "src" / "lib.rs").write_text(encoding="utf-8", data="") + (project_path / "types.rs").write_text( + encoding="utf-8", + data=( + "pub enum Color { Red, Green, Blue }\n" + "pub trait Drawable { fn draw(&self); }\n" + "pub type Pair = (i32, i32);\n" + "pub union IntOrFloat { i: i32, f: f32 }\n" + "pub struct Point { pub x: f64, pub y: f64 }\n" + ), + ) + return project_path + + +def test_rust_enum_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + enum_names = get_node_names(mock_ingestor, NodeType.ENUM) + assert len(enum_names) == 1 + assert enum_names.pop().endswith(".Color") + + +def test_rust_trait_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + interface_names = get_node_names(mock_ingestor, NodeType.INTERFACE) + assert len(interface_names) == 1 + assert interface_names.pop().endswith(".Drawable") + + +def test_rust_type_alias_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + type_names = get_node_names(mock_ingestor, NodeType.TYPE) + assert len(type_names) == 1 + assert type_names.pop().endswith(".Pair") + + +def test_rust_union_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + union_names = 
get_node_names(mock_ingestor, NodeType.UNION) + assert len(union_names) == 1 + assert union_names.pop().endswith(".IntOrFloat") + + +def test_rust_struct_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + class_names = get_node_names(mock_ingestor, NodeType.CLASS) + assert len(class_names) == 1 + assert class_names.pop().endswith(".Point") diff --git a/codebase_rag/tests/test_rust_span_oracle.py b/codebase_rag/tests/test_rust_span_oracle.py new file mode 100644 index 000000000..5bd9abb53 --- /dev/null +++ b/codebase_rag/tests/test_rust_span_oracle.py @@ -0,0 +1,83 @@ +# (H) Covers Rust node SPAN (end_line) validation: cgr's end_line for each node is +# (H) graded against the syn oracle (which emits the whole-node span end), joined +# (H) on (kind, file, start) endpoints. Exercises doc comments, multi-line +# (H) attributes, a multi-line signature, a where-clause, and a multi-line closure +# (H) so the span is not trivially the start line. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_rust_graph +from evals.oracles import run_rust_oracle, rust_available +from evals.score import score_span + +RS_SRC = """\ +/// A documented struct +/// spanning several doc lines. 
+#[derive(Debug, Clone)] +pub struct Widget { + name: String, + size: u32, +} + +impl Widget { + pub fn area( + &self, + scale: u32, + ) -> u32 { + self.size * scale + } +} + +pub trait Drawable { + fn draw(&self) -> String { + String::from("x") + } +} + +pub fn standalone() +where + u32: Sized, +{ + let cb = |v: u32| { + v + 1 + }; + let _ = cb(2); +} +""" + + +def _require_rust() -> None: + if not rust_available(): + pytest.skip("cargo toolchain not available") + if cs.SupportedLanguage.RUST not in load_parsers()[0]: + pytest.skip("rust parser not available") + + +def test_cgr_matches_syn_oracle_on_node_spans(tmp_path: Path) -> None: + _require_rust() + project = tmp_path / "rs_span" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_span"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(RS_SRC, encoding="utf-8") + + cgr = extract_cgr_rust_graph(project, project.name) + oracle = run_rust_oracle(project) + + result = score_span(cgr, oracle, ec.RS_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + # (H) Guard the sample actually exercises multi-line spans (else it is vacuous). + assert aggregate["tp"] >= 5, aggregate diff --git a/codebase_rag/tests/test_rust_structure_oracle.py b/codebase_rag/tests/test_rust_structure_oracle.py new file mode 100644 index 000000000..f9e9e9fa8 --- /dev/null +++ b/codebase_rag/tests/test_rust_structure_oracle.py @@ -0,0 +1,68 @@ +# (H) Covers the Rust structure oracle harness (evals/oracles/rs_oracle + +# (H) evals/rust_l1.py): the syn-based oracle is authoritative ground truth, and +# (H) cgr's captured Rust nodes are graded against it on (kind, file, start_line). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_rust_nodes +from evals.oracles import run_rust_oracle, rust_available +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +RS_SRC = """\ +pub struct Point { pub x: i32, pub y: i32 } +pub enum Direction { North, South } +pub trait Shape { fn area(&self) -> f64; } +pub type Meters = f64; + +pub fn free_fn(a: i32) -> i32 { a + 1 } + +impl Point { + pub fn new(x: i32, y: i32) -> Self { Point { x, y } } +} + +impl Shape for Point { + fn area(&self) -> f64 { 0.0 } +} +""" + + +def _require_rust() -> None: + if not rust_available(): + pytest.skip("cargo toolchain not available") + if cs.SupportedLanguage.RUST not in load_parsers()[0]: + pytest.skip("rust parser not available") + + +def _project(tmp_path: Path) -> Path: + project = tmp_path / "rs_oracle_test" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_oracle_test"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text(RS_SRC, encoding="utf-8") + return project + + +def test_cgr_matches_syn_oracle_on_rust_structure(tmp_path: Path) -> None: + _require_rust() + project = _project(tmp_path) + cgr = GraphData( + nodes=extract_cgr_rust_nodes(project, project.name), + edges=set(), + name_edges=set(), + ) + oracle = run_rust_oracle(project) + + result = score_node_kinds(cgr, oracle, ec.RS_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + for label in ("Class", "Interface", "Enum", "Type", "Function", "Method"): + row = by_label.get(label) + assert row is not None, (label, by_label) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (label, row) diff --git a/codebase_rag/tests/test_rust_trait_method_containment.py 
b/codebase_rag/tests/test_rust_trait_method_containment.py new file mode 100644 index 000000000..26db7c491 --- /dev/null +++ b/codebase_rag/tests/test_rust_trait_method_containment.py @@ -0,0 +1,43 @@ +# (H) Regression: a DEFINES_METHOD relationship is matched in the graph by the +# (H) parent's LABEL and qualified_name, so a method on a non-Class container +# (H) (a Rust trait -> Interface node) must be emitted with the parent's real +# (H) label. It was hardcoded to Class, so MATCH (a:Class {qn: trait}) found +# (H) nothing and the trait -> method containment edge was silently dropped. +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import NodeLabel, RelationshipType +from codebase_rag.tests.conftest import create_and_run_updater, get_relationships + + +def test_rust_trait_method_defined_by_interface_node( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "rs_trait" + (project / "src").mkdir(parents=True) + (project / "Cargo.toml").write_text( + encoding="utf-8", data='[package]\nname = "rs_trait"\nversion = "0.1.0"\n' + ) + (project / "src" / "lib.rs").write_text( + encoding="utf-8", + data="""pub trait Shape { + fn area(&self) -> f64 { 0.0 } +} +""", + ) + create_and_run_updater(project, mock_ingestor, skip_if_missing="rust") + + defines_method = get_relationships( + mock_ingestor, RelationshipType.DEFINES_METHOD.value + ) + # (H) (parent_label, parent_qn) pairs for the trait's method. + parents = { + (call[0][0][0], call[0][0][2]) + for call in defines_method + if str(call[0][2][2]).endswith(".Shape.area") + } + assert (NodeLabel.INTERFACE.value, "rs_trait.src.lib.Shape") in parents, parents + # (H) The wrong Class-labelled parent must not be emitted. 
+ assert (NodeLabel.CLASS.value, "rs_trait.src.lib.Shape") not in parents, parents diff --git a/codebase_rag/tests/test_shell_command.py b/codebase_rag/tests/test_shell_command.py index f745b2e30..cf57396d1 100644 --- a/codebase_rag/tests/test_shell_command.py +++ b/codebase_rag/tests/test_shell_command.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys from pathlib import Path from unittest.mock import MagicMock @@ -274,6 +275,64 @@ def test_empty_segment(self) -> None: available = ", ".join(sorted(settings.SHELL_COMMAND_ALLOWLIST)) assert _validate_segment("", available) is None + def test_bypass_allowlist_skips_allowlist_error(self) -> None: + available = ", ".join(sorted(settings.SHELL_COMMAND_ALLOWLIST)) + assert ( + _validate_segment( + "curl http://example.com", available, bypass_allowlist=True + ) + is None + ) + + def test_bypass_allowlist_still_blocks_dangerous_rm(self) -> None: + available = ", ".join(sorted(settings.SHELL_COMMAND_ALLOWLIST)) + error = _validate_segment("rm -rf /", available, bypass_allowlist=True) + assert error is not None + assert "dangerous" in error.lower() + + +class TestYoloMode: + async def test_yolo_skips_approval_for_write_command( + self, temp_project_root: Path + ) -> None: + test_file = temp_project_root / "yolo_target.txt" + test_file.write_text("bye", encoding="utf-8") + commander = ShellCommander( + str(temp_project_root), timeout=5, is_yolo=lambda: True + ) + tool = create_shell_command_tool(commander) + mock_ctx = MagicMock() + mock_ctx.tool_call_approved = False + result = await tool.function(mock_ctx, "rm yolo_target.txt") + assert result.return_code == 0 + assert not test_file.exists() + + async def test_yolo_runs_non_allowlist_command( + self, temp_project_root: Path + ) -> None: + commander = ShellCommander( + str(temp_project_root), timeout=5, is_yolo=lambda: True + ) + tool = create_shell_command_tool(commander) + mock_ctx = MagicMock() + mock_ctx.tool_call_approved = False + assert "printf" not in 
settings.SHELL_COMMAND_ALLOWLIST + result = await tool.function(mock_ctx, "printf hello") + assert "not in the allowlist" not in result.stderr + + async def test_yolo_still_blocks_dangerous_rm_rf( + self, temp_project_root: Path + ) -> None: + commander = ShellCommander( + str(temp_project_root), timeout=5, is_yolo=lambda: True + ) + tool = create_shell_command_tool(commander) + mock_ctx = MagicMock() + mock_ctx.tool_call_approved = False + result = await tool.function(mock_ctx, "rm -rf /") + assert result.return_code != 0 + assert "dangerous" in result.stderr.lower() + class TestHasRedirectOperators: def test_output_redirect(self) -> None: @@ -386,6 +445,9 @@ async def test_simple_pipe( assert result.return_code == 0 assert "5" in result.stdout + @pytest.mark.skipif( + sys.platform == "win32", reason="Unix find not available on Windows" + ) async def test_find_with_wc( self, shell_commander: ShellCommander, temp_project_root: Path ) -> None: @@ -398,6 +460,10 @@ async def test_find_with_wc( async def test_rg_in_pipeline( self, shell_commander: ShellCommander, temp_project_root: Path ) -> None: + import shutil + + if not shutil.which("rg"): + pytest.skip("rg (ripgrep) not installed") (temp_project_root / "data.txt").write_text("foo\nbar\nbaz\n", encoding="utf-8") result = await shell_commander.execute("cat data.txt | rg bar") assert result.return_code == 0 @@ -630,11 +696,11 @@ def test_path_outside_project(self, tmp_path: Path) -> None: ["rm", "-rf", "../other"], project_root ) assert is_dangerous - assert "outside project" in reason + assert "outside project" in reason or "system directory" in reason def test_safe_path_inside_project(self, tmp_path: Path) -> None: - project_root = tmp_path / "project" - project_root.mkdir() + project_root = (tmp_path / "project").resolve() + project_root.mkdir(exist_ok=True) is_dangerous, _ = _is_dangerous_rm_path( ["rm", "-rf", "subdir/file.txt"], project_root ) @@ -741,7 +807,8 @@ async def test_rm_outside_project_blocked( ) -> 
None: result = await shell_commander.execute("rm ../outside_project") assert result.return_code == -1 - assert "outside project" in result.stderr.lower() + stderr_lower = result.stderr.lower() + assert "outside project" in stderr_lower or "system directory" in stderr_lower class TestAwkSedXargsPatterns: diff --git a/codebase_rag/tests/test_sibling_mixin_resolution.py b/codebase_rag/tests/test_sibling_mixin_resolution.py new file mode 100644 index 000000000..48bb15156 --- /dev/null +++ b/codebase_rag/tests/test_sibling_mixin_resolution.py @@ -0,0 +1,97 @@ +# (H) L3 finding from the evals/ harness: PythonAstAnalyzerMixin._traverse_single_pass +# (H) calls self._infer_instance_variable_types_from_assignments(...), a method defined +# (H) on the sibling PythonVariableAnalyzerMixin. Neither is the other's base; both are +# (H) combined into the concrete PythonTypeInferenceEngine. A same-named stub in another +# (H) class makes the bare-name trie fallback ambiguous, so resolution must go through +# (H) the concrete subclass's MRO to land on the real sibling method. +from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + # (H) A decoy class declaring the same method name (mirrors a TYPE_CHECKING stub) + # (H) so the trie fallback alone cannot pick the right target. 
+ "pkg/decoy.py": ("class Deps:\n def infer_vars(self):\n return None\n"), + "pkg/mixin_a.py": ( + "class AMixin:\n def traverse(self):\n return self.infer_vars()\n" + ), + "pkg/mixin_b.py": ("class BMixin:\n def infer_vars(self):\n return {}\n"), + "pkg/engine.py": ( + "from .mixin_a import AMixin\n" + "from .mixin_b import BMixin\n\n\n" + "class Engine(AMixin, BMixin):\n" + " def other(self):\n" + " return None\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestSiblingMixinResolution: + def test_self_call_resolves_to_sibling_mixin_method(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.mixin_a.AMixin.traverse", + "proj.pkg.mixin_b.BMixin.infer_vars", + ) in calls, calls + + def test_does_not_resolve_to_decoy_class(self, tmp_path: Path) -> None: + calls = 
_calls(tmp_path) + assert ( + "proj.pkg.mixin_a.AMixin.traverse", + "proj.pkg.decoy.Deps.infer_vars", + ) not in calls, calls diff --git a/codebase_rag/tests/test_single_file_repo_path.py b/codebase_rag/tests/test_single_file_repo_path.py new file mode 100644 index 000000000..71d4a28a7 --- /dev/null +++ b/codebase_rag/tests/test_single_file_repo_path.py @@ -0,0 +1,138 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.tests.conftest import ( + get_node_names, + get_relationships, + run_updater, +) + + +@pytest.fixture +def cpp_single_file(temp_repo: Path) -> Path: + test_file = temp_repo / "cmGlobalFastbuildGenerator.cxx" + test_file.write_text( + encoding="utf-8", + data=""" +#include +#include +#include + +static std::map const compilerIdToFastbuildFamily = { + {"GNU", "gcc"}, + {"Clang", "clang"}, +}; + +static std::set const supportedLanguages = { + "C", + "CXX", +}; + +template +T generateAlias(std::string const& name) { return T(); } + +static void helperFunc() {} + +class FastbuildTarget { +public: + void GenerateAliases(); +}; + +void FastbuildTarget::GenerateAliases() { + auto alias = generateAlias("test"); +} + +void freeFunction() { + helperFunc(); +} +""", + ) + return test_file + + +@pytest.fixture +def ran_single_file_updater(cpp_single_file: Path, mock_ingestor: MagicMock) -> None: + from codebase_rag.graph_updater import GraphUpdater + from codebase_rag.parser_loader import load_parsers + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=cpp_single_file, + parsers=parsers, + queries=queries, + ) + updater.run() + + +def test_single_file_repo_path_produces_graph( + ran_single_file_updater: None, + mock_ingestor: MagicMock, +) -> None: + functions = get_node_names(mock_ingestor, "Function") + methods = get_node_names(mock_ingestor, "Method") + classes = get_node_names(mock_ingestor, "Class") + + assert any("generateAlias" in qn for qn in functions) + 
assert any("helperFunc" in qn for qn in functions) + assert any("freeFunction" in qn for qn in functions) + + assert any("GenerateAliases" in qn for qn in methods) + assert any("FastbuildTarget" in qn for qn in classes) + + defines_rels = get_relationships(mock_ingestor, "DEFINES") + assert len(defines_rels) >= 3 + + calls_rels = get_relationships(mock_ingestor, "CALLS") + assert len(calls_rels) >= 1 + + +def test_single_file_repo_path_static_functions( + ran_single_file_updater: None, + mock_ingestor: MagicMock, +) -> None: + functions = get_node_names(mock_ingestor, "Function") + + assert any("helperFunc" in qn for qn in functions), ( + f"Static function helperFunc not found. Functions: {functions}" + ) + + assert any("generateAlias" in qn for qn in functions), ( + f"Template function generateAlias not found. Functions: {functions}" + ) + + +def test_single_file_repo_path_out_of_class_methods( + ran_single_file_updater: None, + mock_ingestor: MagicMock, +) -> None: + methods = get_node_names(mock_ingestor, "Method") + defines_method_rels = get_relationships(mock_ingestor, "DEFINES_METHOD") + + assert any("GenerateAliases" in qn for qn in methods), ( + f"Out-of-class method GenerateAliases not found. 
Methods: {methods}" + ) + assert len(defines_method_rels) >= 1 + + +def test_directory_repo_path_still_works( + temp_repo: Path, + mock_ingestor: MagicMock, +) -> None: + project = temp_repo / "normal_project" + project.mkdir() + (project / "main.cpp").write_text( + encoding="utf-8", + data=""" +void doStuff() {} +int main() { doStuff(); return 0; } +""", + ) + + run_updater(project, mock_ingestor) + + functions = get_node_names(mock_ingestor, "Function") + assert any("doStuff" in qn for qn in functions) + assert any("main" in qn for qn in functions) diff --git a/codebase_rag/tests/test_single_query_output_format.py b/codebase_rag/tests/test_single_query_output_format.py new file mode 100644 index 000000000..6e383d6ec --- /dev/null +++ b/codebase_rag/tests/test_single_query_output_format.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import json +import re +from collections.abc import Generator +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag import constants as cs +from codebase_rag.cli import app +from codebase_rag.main import main_single_query + +_QUESTION = "What does the parser do?" +_ANSWER = "The parser builds a knowledge graph." 
+ +runner = CliRunner() + +_ANSI = re.compile(r"\x1b\[[0-9;]*m") + + +def _plain(output: str) -> str: + # (H) ANSI-stripped output with Rich soft-wrap newlines rejoined + return _ANSI.sub("", output).replace("\n", "") + + +@pytest.fixture +def mock_agent_stack() -> Generator[MagicMock, None, None]: + agent = MagicMock() + agent.run = AsyncMock(return_value=MagicMock(output=_ANSWER)) + with ( + patch("codebase_rag.main._setup_common_initialization"), + patch("codebase_rag.main.connect_memgraph") as mock_connect, + patch( + "codebase_rag.main._initialize_services_and_agent", + return_value=(agent, [], ""), + ), + ): + mock_connect.return_value.__enter__ = MagicMock(return_value=MagicMock()) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield agent + + +def test_default_format_prints_plain_text( + mock_agent_stack: MagicMock, capsys: pytest.CaptureFixture[str] +) -> None: + main_single_query("/repo", 100, _QUESTION) + + out = capsys.readouterr().out.strip() + assert out == _ANSWER + + +def test_json_format_wraps_query_and_response( + mock_agent_stack: MagicMock, capsys: pytest.CaptureFixture[str] +) -> None: + main_single_query("/repo", 100, _QUESTION, output_format=cs.QueryFormat.JSON) + + payload = json.loads(capsys.readouterr().out) + assert payload == {cs.KEY_QUERY: _QUESTION, cs.KEY_RESPONSE: _ANSWER} + + +def test_json_format_preserves_non_ascii( + capsys: pytest.CaptureFixture[str], +) -> None: + answer = "Le générateur résout les nœuds — déjà" + agent = MagicMock() + agent.run = AsyncMock(return_value=MagicMock(output=answer)) + with ( + patch("codebase_rag.main._setup_common_initialization"), + patch("codebase_rag.main.connect_memgraph") as mock_connect, + patch( + "codebase_rag.main._initialize_services_and_agent", + return_value=(agent, [], ""), + ), + ): + mock_connect.return_value.__enter__ = MagicMock(return_value=MagicMock()) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + main_single_query("/repo", 100, 
_QUESTION, output_format=cs.QueryFormat.JSON) + + raw = capsys.readouterr().out + assert answer in raw + assert "\\u" not in raw + assert json.loads(raw)[cs.KEY_RESPONSE] == answer + + +def test_json_format_without_ask_agent_exits_with_error() -> None: + result = runner.invoke(app, ["start", "--output-format", "json"]) + + assert result.exit_code == 1, result.output + assert "ask-agent" in _plain(result.output) diff --git a/codebase_rag/tests/test_slots_and_optimizations.py b/codebase_rag/tests/test_slots_and_optimizations.py new file mode 100644 index 000000000..da8ca621b --- /dev/null +++ b/codebase_rag/tests/test_slots_and_optimizations.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +import pytest + +from codebase_rag.parsers.dependency_parser import ( + CargoTomlParser, + ComposerJsonParser, + CsprojParser, + DependencyParser, + GemfileParser, + GoModParser, + PackageJsonParser, + PyProjectTomlParser, + RequirementsTxtParser, +) +from codebase_rag.parsers.handlers.base import BaseLanguageHandler +from codebase_rag.parsers.handlers.cpp import CppHandler +from codebase_rag.parsers.handlers.java import JavaHandler +from codebase_rag.parsers.handlers.js_ts import JsTsHandler +from codebase_rag.parsers.handlers.lua import LuaHandler +from codebase_rag.parsers.handlers.protocol import LanguageHandler +from codebase_rag.parsers.handlers.python import PythonHandler +from codebase_rag.parsers.handlers.rust import RustHandler +from codebase_rag.parsers.stdlib_extractor import StdlibExtractor +from codebase_rag.parsers.utils import _cached_decode_bytes + + +class TestHandlerSlots: + @pytest.mark.parametrize( + "handler_cls", + [ + BaseLanguageHandler, + PythonHandler, + JavaHandler, + JsTsHandler, + CppHandler, + RustHandler, + LuaHandler, + ], + ) + def test_handler_has_slots(self, handler_cls: type) -> None: + assert hasattr(handler_cls, "__slots__") + + @pytest.mark.parametrize( + "handler_cls", + [ + BaseLanguageHandler, + PythonHandler, + JavaHandler, + 
JsTsHandler, + CppHandler, + RustHandler, + LuaHandler, + ], + ) + def test_handler_no_instance_dict(self, handler_cls: type) -> None: + instance = handler_cls() + assert not hasattr(instance, "__dict__") + + def test_protocol_has_slots(self) -> None: + assert hasattr(LanguageHandler, "__slots__") + + +class TestDependencyParserSlots: + @pytest.mark.parametrize( + "parser_cls", + [ + DependencyParser, + PyProjectTomlParser, + RequirementsTxtParser, + PackageJsonParser, + CargoTomlParser, + GoModParser, + GemfileParser, + ComposerJsonParser, + CsprojParser, + ], + ) + def test_parser_has_slots(self, parser_cls: type) -> None: + assert hasattr(parser_cls, "__slots__") + + @pytest.mark.parametrize( + "parser_cls", + [ + DependencyParser, + PyProjectTomlParser, + RequirementsTxtParser, + PackageJsonParser, + CargoTomlParser, + GoModParser, + GemfileParser, + ComposerJsonParser, + CsprojParser, + ], + ) + def test_parser_no_instance_dict(self, parser_cls: type) -> None: + instance = parser_cls() + assert not hasattr(instance, "__dict__") + + +class TestStdlibExtractorSlots: + def test_has_slots(self) -> None: + assert hasattr(StdlibExtractor, "__slots__") + assert "function_registry" in StdlibExtractor.__slots__ + assert "repo_path" in StdlibExtractor.__slots__ + assert "project_name" in StdlibExtractor.__slots__ + + def test_no_instance_dict(self) -> None: + extractor = StdlibExtractor() + assert not hasattr(extractor, "__dict__") + + +class TestCachedDecodeBytes: + def test_cache_maxsize(self) -> None: + cache_info = _cached_decode_bytes.cache_info() + assert cache_info.maxsize == 50000 + + def test_decode_bytes(self) -> None: + result = _cached_decode_bytes(b"hello world") + assert result == "hello world" + + def test_decode_caches(self) -> None: + _cached_decode_bytes.cache_clear() + _cached_decode_bytes(b"test_cache") + _cached_decode_bytes(b"test_cache") + info = _cached_decode_bytes.cache_info() + assert info.hits >= 1 diff --git 
a/codebase_rag/tests/test_slots_lazy_logger.py b/codebase_rag/tests/test_slots_lazy_logger.py new file mode 100644 index 000000000..2772a11f4 --- /dev/null +++ b/codebase_rag/tests/test_slots_lazy_logger.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag.graph_loader import GraphLoader +from codebase_rag.providers.base import ( + GoogleProvider, + ModelProvider, + OllamaProvider, + OpenAIProvider, +) +from codebase_rag.services.llm import CypherGenerator +from codebase_rag.tools.code_retrieval import CodeRetriever +from codebase_rag.tools.directory_lister import DirectoryLister +from codebase_rag.tools.file_editor import FileEditor +from codebase_rag.tools.file_reader import FileReader +from codebase_rag.tools.file_writer import FileWriter +from codebase_rag.tools.health_checker import HealthChecker +from codebase_rag.tools.shell_command import CommandGroup, ShellCommander + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +SLOTS_CLASSES: list[tuple[type, tuple[str, ...]]] = [ + (FileEditor, ("project_root", "dmp", "parsers")), + (CodeRetriever, ("project_root", "ingestor")), + (FileReader, ("project_root",)), + (FileWriter, ("project_root",)), + (DirectoryLister, ("project_root",)), + (CommandGroup, ("commands", "operator")), + (ShellCommander, ("project_root", "timeout", "is_yolo")), + (HealthChecker, ("results",)), + (CypherGenerator, ("agent",)), + (ModelProvider, ("config",)), + ( + GoogleProvider, + ( + "api_key", + "provider_type", + "project_id", + "region", + "service_account_file", + "thinking_budget", + ), + ), + (OpenAIProvider, ("api_key", "endpoint")), + (OllamaProvider, ("endpoint", "api_key")), +] + +GRAPH_LOADER_SLOTS = ( + "file_path", + "_data", + "_nodes", + "_relationships", + "_nodes_by_id", + "_nodes_by_label", + "_outgoing_rels", + "_incoming_rels", + "_property_indexes", +) + + +class TestSlotsPresence: + 
@pytest.mark.parametrize( + ("cls", "expected_slots"), + SLOTS_CLASSES, + ids=[c.__name__ for c, _ in SLOTS_CLASSES], + ) + def test_class_has_slots(self, cls: type, expected_slots: tuple[str, ...]) -> None: + assert hasattr(cls, "__slots__") + assert set(cls.__slots__) == set(expected_slots) + + def test_graph_loader_has_slots(self) -> None: + assert hasattr(GraphLoader, "__slots__") + assert set(GraphLoader.__slots__) == set(GRAPH_LOADER_SLOTS) + + +class TestSlotsBlockDict: + def test_command_group_no_dict(self) -> None: + obj = CommandGroup(commands=["ls"], operator=None) + assert not hasattr(obj, "__dict__") + + def test_directory_lister_no_dict(self, tmp_path: Path) -> None: + obj = DirectoryLister(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_file_reader_no_dict(self, tmp_path: Path) -> None: + obj = FileReader(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_file_writer_no_dict(self, tmp_path: Path) -> None: + obj = FileWriter(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_health_checker_no_dict(self) -> None: + obj = HealthChecker() + assert not hasattr(obj, "__dict__") + + def test_shell_commander_no_dict(self, tmp_path: Path) -> None: + obj = ShellCommander(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_code_retriever_no_dict(self, tmp_path: Path) -> None: + mock_ingestor = MagicMock() + obj = CodeRetriever(str(tmp_path), mock_ingestor) + assert not hasattr(obj, "__dict__") + + +class TestSlotsRejectArbitraryAttrs: + def test_command_group_rejects_attr(self) -> None: + obj = CommandGroup(commands=["ls"], operator=None) + with pytest.raises(AttributeError): + obj.arbitrary = 42 + + def test_directory_lister_rejects_attr(self, tmp_path: Path) -> None: + obj = DirectoryLister(str(tmp_path)) + with pytest.raises(AttributeError): + obj.arbitrary = 42 + + def test_health_checker_rejects_attr(self) -> None: + obj = HealthChecker() + with pytest.raises(AttributeError): + obj.arbitrary = 
def _find_eager_debug_calls(source: str) -> list[str]:
    """Return a snippet for each ``logger.debug(...)`` call that formats eagerly.

    A call counts as eager when one of its arguments (positional or keyword)
    contains a ``<something>.format(...)`` call, because that string is built
    even when the DEBUG level is disabled.

    The source is parsed with ``ast`` instead of counting parentheses line by
    line. Text-based counting is confused by parentheses inside string
    literals: ``logger.debug("open (")`` would never "close", swallow the rest
    of the file and be reported because of an unrelated ``.format(`` later on.

    Args:
        source: Complete text of a Python module.

    Returns:
        One entry per offending call, in source order. Each entry is the
        call's physical lines, stripped and joined with single spaces, then
        truncated to 80 characters.

    Raises:
        SyntaxError: If ``source`` is not valid Python.
    """
    import ast  # local import keeps this helper self-contained

    def is_logger_debug(node: ast.AST) -> bool:
        # Matches exactly the ``logger.debug(...)`` spelling.
        return (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Attribute)
            and node.func.attr == "debug"
            and isinstance(node.func.value, ast.Name)
            and node.func.value.id == "logger"
        )

    def formats_eagerly(call: ast.Call) -> bool:
        # Look for any ``x.format(...)`` nested anywhere inside the arguments.
        operands = [*call.args, *(kw.value for kw in call.keywords)]
        return any(
            isinstance(sub, ast.Call)
            and isinstance(sub.func, ast.Attribute)
            and sub.func.attr == "format"
            for operand in operands
            for sub in ast.walk(operand)
        )

    lines = source.split("\n")
    hits: list[tuple[int, int, str]] = []
    for node in ast.walk(ast.parse(source)):
        if is_logger_debug(node) and formats_eagerly(node):
            last_line = node.end_lineno or node.lineno
            text = " ".join(
                line.strip() for line in lines[node.lineno - 1 : last_line]
            )
            hits.append((node.lineno, node.col_offset, text[:80]))

    # ast.walk is breadth-first; sort to report calls in source order.
    hits.sort()
    return [snippet for _, _, snippet in hits]
test_openai_provider_instance_has_all_attrs(self) -> None: + provider = OpenAIProvider(api_key="test-key") + assert provider.api_key == "test-key" + assert provider.config == {} + + @patch("codebase_rag.providers.base.settings") + def test_ollama_provider_instance_has_all_attrs( + self, mock_settings: MagicMock + ) -> None: + mock_settings.ollama_endpoint = "http://localhost:11434/v1/" + provider = OllamaProvider() + assert provider.endpoint == "http://localhost:11434/v1/" + assert provider.config == {} diff --git a/codebase_rag/tests/test_source_extraction.py b/codebase_rag/tests/test_source_extraction.py index df7b9099e..9296c91fb 100644 --- a/codebase_rag/tests/test_source_extraction.py +++ b/codebase_rag/tests/test_source_extraction.py @@ -12,7 +12,7 @@ class TestExtractSourceLines: def test_extracts_single_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\nline3\n") + file_path.write_bytes(b"line1\nline2\nline3\n") result = extract_source_lines(file_path, 2, 2) @@ -20,7 +20,7 @@ def test_extracts_single_line(self, tmp_path: Path) -> None: def test_extracts_multiple_lines(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\nline3\nline4\n") + file_path.write_bytes(b"line1\nline2\nline3\nline4\n") result = extract_source_lines(file_path, 2, 3) @@ -28,7 +28,7 @@ def test_extracts_multiple_lines(self, tmp_path: Path) -> None: def test_extracts_all_lines(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\nline3\n") + file_path.write_bytes(b"line1\nline2\nline3\n") result = extract_source_lines(file_path, 1, 3) @@ -36,7 +36,7 @@ def test_extracts_all_lines(self, tmp_path: Path) -> None: def test_strips_trailing_whitespace(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data=" code \n more 
\n") + file_path.write_bytes(b" code \n more \n") result = extract_source_lines(file_path, 1, 2) @@ -51,7 +51,7 @@ def test_returns_none_for_nonexistent_file(self, tmp_path: Path) -> None: def test_returns_none_for_zero_start_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\n") + file_path.write_bytes(b"line1\n") result = extract_source_lines(file_path, 0, 1) @@ -59,7 +59,7 @@ def test_returns_none_for_zero_start_line(self, tmp_path: Path) -> None: def test_returns_none_for_negative_start_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\n") + file_path.write_bytes(b"line1\n") result = extract_source_lines(file_path, -1, 1) @@ -67,7 +67,7 @@ def test_returns_none_for_negative_start_line(self, tmp_path: Path) -> None: def test_returns_none_for_zero_end_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\n") + file_path.write_bytes(b"line1\n") result = extract_source_lines(file_path, 1, 0) @@ -75,7 +75,7 @@ def test_returns_none_for_zero_end_line(self, tmp_path: Path) -> None: def test_returns_none_for_start_greater_than_end(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_lines(file_path, 2, 1) @@ -83,23 +83,23 @@ def test_returns_none_for_start_greater_than_end(self, tmp_path: Path) -> None: def test_returns_none_when_start_exceeds_file_length(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_lines(file_path, 5, 6) assert result is None - def test_returns_none_when_end_exceeds_file_length(self, tmp_path: Path) -> None: + def 
test_clamps_when_end_exceeds_file_length(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_lines(file_path, 1, 10) - assert result is None + assert result == "line1\nline2" def test_handles_empty_file(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="") + file_path.write_bytes(b"") result = extract_source_lines(file_path, 1, 1) @@ -107,17 +107,61 @@ def test_handles_empty_file(self, tmp_path: Path) -> None: def test_preserves_indentation(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="def func():\n return 42\n") + file_path.write_bytes(b"def func():\n return 42\n") result = extract_source_lines(file_path, 1, 2) assert result == "def func():\n return 42" + def test_counts_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes(b"line1\n\nline3\n\nline5\n") + + result = extract_source_lines(file_path, 1, 5) + + assert result == "line1\n\nline3\n\nline5" + + def test_extracts_across_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes( + b"def func1():\n pass\n\ndef func2():\n return 42\n" + ) + + result = extract_source_lines(file_path, 4, 5) + + assert result == "def func2():\n return 42" + + def test_preserves_internal_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes( + b"def func():\n x = 1\n\n y = 2\n\n return x + y\n" + ) + + result = extract_source_lines(file_path, 1, 6) + + assert result == "def func():\n x = 1\n\n y = 2\n\n return x + y" + + def test_line_count_matches_with_many_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes(b"a\n\n\n\nb\n\n\n\nc\n") + + result = extract_source_lines(file_path, 
5, 5) + + assert result == "b" + + def test_clamps_end_line_returns_partial_content(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes(b"def func():\n pass\n\ndef other():\n return 1\n") + + result = extract_source_lines(file_path, 4, 100) + + assert result == "def other():\n return 1" + class TestExtractSourceWithFallback: def test_uses_line_extraction_when_no_ast_extractor(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_with_fallback(file_path, 1, 2) @@ -125,7 +169,7 @@ def test_uses_line_extraction_when_no_ast_extractor(self, tmp_path: Path) -> Non def test_uses_ast_extractor_when_provided(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") def mock_ast_extractor(name: str, path: Path) -> str: return f"AST result for {name}" @@ -140,7 +184,7 @@ def test_falls_back_to_lines_when_ast_extractor_returns_none( self, tmp_path: Path ) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") def mock_ast_extractor(name: str, path: Path) -> None: return None @@ -155,7 +199,7 @@ def test_falls_back_to_lines_when_ast_extractor_raises( self, tmp_path: Path ) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") def mock_ast_extractor(name: str, path: Path) -> str: raise RuntimeError("AST extraction failed") @@ -168,7 +212,7 @@ def mock_ast_extractor(name: str, path: Path) -> str: def test_skips_ast_when_qualified_name_is_none(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + 
def _fake_subprocess_result(
    returncode: int = 0, stdout: str = "", stderr: str = ""
) -> subprocess.CompletedProcess[str]:
    """Build a ``CompletedProcess`` stand-in for a patched ``subprocess.run``.

    The result always has an empty ``args`` list; exit code and captured
    streams default to a successful run with no output.
    """
    completed = subprocess.CompletedProcess([], returncode, stdout, stderr)
    return completed
_make_compose_source(tmp_path) + target = stack_home / stack_cs.COMPOSE_FILENAME + target.write_text("custom: yes\n", encoding="utf-8") + mgr = StackManager(home=stack_home, package_compose=src) + result = mgr.ensure_compose_file() + assert result.read_text(encoding="utf-8") == "custom: yes\n" + + +def test_ensure_compose_file_raises_when_source_missing( + stack_home: Path, tmp_path: Path +) -> None: + missing = tmp_path / "nope.yaml" + mgr = StackManager(home=stack_home, package_compose=missing) + with pytest.raises(StackError): + mgr.ensure_compose_file() + + +def test_check_docker_raises_when_docker_not_on_path(stack_home: Path) -> None: + mgr = StackManager(home=stack_home, package_compose=Path("/dev/null")) + with patch("codebase_rag.stack.manager.shutil.which", return_value=None): + with pytest.raises(StackError) as exc: + mgr.check_docker() + assert "docker not found" in str(exc.value).lower() + + +def test_check_docker_raises_when_daemon_down(stack_home: Path) -> None: + mgr = StackManager(home=stack_home, package_compose=Path("/dev/null")) + with ( + patch( + "codebase_rag.stack.manager.shutil.which", return_value="/usr/bin/docker" + ), + patch( + "codebase_rag.stack.manager.subprocess.run", + return_value=_fake_subprocess_result(returncode=1, stderr="daemon down"), + ), + ): + with pytest.raises(StackError) as exc: + mgr.check_docker() + assert "daemon" in str(exc.value).lower() + + +def test_check_docker_raises_when_compose_missing(stack_home: Path) -> None: + mgr = StackManager(home=stack_home, package_compose=Path("/dev/null")) + + def fake_run(cmd: list[str], **_: object) -> subprocess.CompletedProcess[str]: + if cmd[:2] == ["docker", "info"]: + return _fake_subprocess_result(returncode=0) + return _fake_subprocess_result(returncode=1) + + with ( + patch( + "codebase_rag.stack.manager.shutil.which", return_value="/usr/bin/docker" + ), + patch("codebase_rag.stack.manager.subprocess.run", side_effect=fake_run), + ): + with pytest.raises(StackError) as 
exc: + mgr.check_docker() + assert "compose" in str(exc.value).lower() + + +def test_status_returns_stopped_when_nothing_reachable(stack_home: Path) -> None: + mgr = StackManager(home=stack_home, package_compose=Path("/dev/null")) + with ( + patch("codebase_rag.stack.manager.wait_for_memgraph", return_value=False), + patch("codebase_rag.stack.manager.wait_for_qdrant", return_value=False), + ): + status = mgr.status() + assert status.state == stack_cs.StackState.STOPPED + + +def test_status_returns_running_when_both_reachable(stack_home: Path) -> None: + mgr = StackManager(home=stack_home, package_compose=Path("/dev/null")) + with ( + patch("codebase_rag.stack.manager.wait_for_memgraph", return_value=True), + patch("codebase_rag.stack.manager.wait_for_qdrant", return_value=True), + ): + status = mgr.status() + assert status.state == stack_cs.StackState.RUNNING + assert status.memgraph_reachable + assert status.qdrant_reachable + + +def test_status_returns_partial_when_only_memgraph_reachable(stack_home: Path) -> None: + mgr = StackManager(home=stack_home, package_compose=Path("/dev/null")) + with ( + patch("codebase_rag.stack.manager.wait_for_memgraph", return_value=True), + patch("codebase_rag.stack.manager.wait_for_qdrant", return_value=False), + ): + status = mgr.status() + assert status.state == stack_cs.StackState.PARTIAL + + +def test_compose_cmd_uses_project_and_file(stack_home: Path, tmp_path: Path) -> None: + src = _make_compose_source(tmp_path) + mgr = StackManager(home=stack_home, package_compose=src, project_name="cgr-test") + cmd = mgr._compose_cmd("up", "-d") + assert cmd[0] == "docker" + assert cmd[1] == "compose" + assert "-p" in cmd and "cgr-test" in cmd + assert "-f" in cmd + assert str(mgr.compose_file) in cmd + assert cmd[-2:] == ["up", "-d"] + + +def test_ensure_running_skips_docker_when_already_up( + stack_home: Path, tmp_path: Path +) -> None: + src = _make_compose_source(tmp_path) + mgr = StackManager(home=stack_home, package_compose=src) + 
with ( + patch("codebase_rag.stack.manager.wait_for_memgraph", return_value=True), + patch("codebase_rag.stack.manager.wait_for_qdrant", return_value=True), + patch.object(mgr, "up") as mock_up, + patch.object(mgr, "wait_healthy") as mock_wait, + ): + status = mgr.ensure_running() + assert status.state == stack_cs.StackState.RUNNING + mock_up.assert_not_called() + mock_wait.assert_not_called() + + +def test_ensure_running_starts_when_stopped(stack_home: Path, tmp_path: Path) -> None: + src = _make_compose_source(tmp_path) + mgr = StackManager(home=stack_home, package_compose=src) + reachable_state = {"memgraph": False, "qdrant": False} + + def memgraph_check(*_: object, **__: object) -> bool: + return reachable_state["memgraph"] + + def qdrant_check(*_: object, **__: object) -> bool: + return reachable_state["qdrant"] + + def fake_up(timeout: float = 0.0) -> None: + reachable_state["memgraph"] = True + reachable_state["qdrant"] = True + + with ( + patch( + "codebase_rag.stack.manager.wait_for_memgraph", side_effect=memgraph_check + ), + patch("codebase_rag.stack.manager.wait_for_qdrant", side_effect=qdrant_check), + patch.object(mgr, "up", side_effect=fake_up) as mock_up, + patch.object(mgr, "wait_healthy") as mock_wait, + ): + status = mgr.ensure_running() + assert status.state == stack_cs.StackState.RUNNING + mock_up.assert_called_once() + mock_wait.assert_called_once() + + +def test_up_propagates_failure(stack_home: Path, tmp_path: Path) -> None: + src = _make_compose_source(tmp_path) + mgr = StackManager(home=stack_home, package_compose=src) + with ( + patch.object(mgr, "check_docker"), + patch( + "codebase_rag.stack.manager.subprocess.run", + return_value=_fake_subprocess_result(returncode=1, stderr="boom"), + ), + ): + with pytest.raises(StackError) as exc: + mgr.up() + assert "boom" in str(exc.value) or "Failed" in str(exc.value) diff --git a/codebase_rag/tests/test_stats_command.py b/codebase_rag/tests/test_stats_command.py new file mode 100644 index 
def _make_mock_ingestor(*fetch_side_effects: list[ResultRow]) -> MagicMock:
    """Build a ``MagicMock`` ingestor that replays canned ``fetch_all`` results.

    Each positional argument is returned by one successive ``fetch_all`` call,
    in the order given. The mock also works as a context manager: entering it
    yields the mock itself and its ``__exit__`` returns ``False``.
    """
    ingestor = MagicMock()
    ingestor.__enter__ = MagicMock(return_value=ingestor)
    ingestor.__exit__ = MagicMock(return_value=False)
    ingestor.fetch_all.side_effect = [*fetch_side_effects]
    return ingestor
test_stats_displays_totals( + self, + runner: CliRunner, + mock_node_results: list[ResultRow], + mock_rel_results: list[ResultRow], + ) -> None: + mock_ingestor = _make_mock_ingestor(mock_node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "180" in result.output + assert "280" in result.output + + def test_stats_handles_empty_graph( + self, + runner: CliRunner, + ) -> None: + mock_ingestor = _make_mock_ingestor([], []) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "0" in result.output + + def test_stats_handles_connection_error( + self, + runner: CliRunner, + ) -> None: + with patch( + "codebase_rag.cli.connect_memgraph", + side_effect=ConnectionError("Cannot connect"), + ): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 1 + assert "Failed" in result.output + + def test_stats_handles_multi_label_nodes( + self, + runner: CliRunner, + mock_rel_results: list[ResultRow], + ) -> None: + node_results: list[ResultRow] = [ + {"labels": ["Function", "Exported"], "count": 10}, + ] + mock_ingestor = _make_mock_ingestor(node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "Function:Exported" in result.output + + def test_stats_handles_empty_labels( + self, + runner: CliRunner, + mock_rel_results: list[ResultRow], + ) -> None: + node_results: list[ResultRow] = [ + {"labels": [], "count": 5}, + ] + mock_ingestor = _make_mock_ingestor(node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "Unknown" in result.output diff --git 
@pytest.fixture(autouse=True)
def reset_session(monkeypatch: pytest.MonkeyPatch) -> None:
    """Start each test from a known session baseline and restore it afterwards.

    The baseline is ``confirm_edits=True``, ``load_cgr_instructions=True`` and
    ``target_repo=None`` on the shared ``main_mod.app_context.session``.

    The values are set through ``monkeypatch`` rather than by plain
    assignment. The session object is module-global, so a plain assignment
    (or a later assignment made by a test) would outlive this module and leak
    into whatever runs next. ``monkeypatch`` records the pre-test values and
    puts them back at teardown.
    """
    session = main_mod.app_context.session
    # raising=False: still set the baseline if an attribute is not present yet;
    # monkeypatch then removes it again on teardown.
    monkeypatch.setattr(session, "confirm_edits", True, raising=False)
    monkeypatch.setattr(session, "load_cgr_instructions", True, raising=False)
    monkeypatch.setattr(session, "target_repo", None, raising=False)
test_config_segments_reflects_session_flags( + mock_settings: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.confirm_edits = False + main_mod.app_context.session.load_cgr_instructions = False + + segments = dict(main_mod._config_segments()) + + assert segments[cs.STATUS_BAR_CONFIG_LABEL_EDIT] == cs.STATUS_BAR_EDIT_OFF + assert segments[cs.STATUS_BAR_CONFIG_LABEL_INSTRUCTIONS] == cs.STATUS_BAR_EDIT_OFF + + +@patch("codebase_rag.main.settings") +def test_abbreviated_repo_uses_tilde_for_home_paths( + mock_settings: MagicMock, +) -> None: + inside_home = Path.home() / "Documents" / "platform" + + assert main_mod._abbreviated_repo(inside_home) == "~/Documents/platform" + + +def test_abbreviated_repo_keeps_absolute_for_outside_paths() -> None: + assert main_mod._abbreviated_repo(Path("/etc/hosts")) == "/etc/hosts" + + +def test_abbreviated_repo_handles_none() -> None: + assert main_mod._abbreviated_repo(None) == "" + + +@patch("codebase_rag.main.settings") +def test_config_status_html_includes_model_and_repo( + mock_settings: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/showme") + + html = main_mod._config_status_html() + + assert "claude-opus-4-7" in html + assert "/tmp/showme" in html + assert cs.STATUS_BAR_CONFIG_LABEL_O in html + assert cs.STATUS_BAR_CONFIG_LABEL_REPO in html + + +@patch("codebase_rag.main._git_state", return_value=None) +@patch("codebase_rag.main._terminal_columns", return_value=200) +@patch("codebase_rag.main.settings") +def test_status_bar_html_inlines_config_when_wide( + mock_settings: MagicMock, + _columns: MagicMock, + _git: MagicMock, +) -> None: + 
mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/x") + + html = main_mod._status_bar_label() + + rendered = str(html.value) if hasattr(html, "value") else str(html) + body_marker = main_mod._permission_mode_label() + body_idx = rendered.index(body_marker) + config_idx = rendered.index(cs.STATUS_BAR_CONFIG_LABEL_O + ":") + assert config_idx > body_idx, "config should appear after body when wide" + + +@patch("codebase_rag.main._git_state", return_value=None) +@patch("codebase_rag.main._terminal_columns", return_value=40) +@patch("codebase_rag.main.settings") +def test_status_bar_html_wraps_config_when_narrow( + mock_settings: MagicMock, + _columns: MagicMock, + _git: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/x") + + html = main_mod._status_bar_label() + + rendered = str(html.value) if hasattr(html, "value") else str(html) + body_marker = main_mod._permission_mode_label() + body_idx = rendered.index(body_marker) + config_idx = rendered.index(cs.STATUS_BAR_CONFIG_LABEL_O + ":") + assert config_idx < body_idx, "config should appear above body when narrow" + + +@patch("codebase_rag.main._git_state", return_value=None) +@patch("codebase_rag.main._terminal_columns", return_value=200) +@patch("codebase_rag.main.settings") +def test_rich_status_bar_inlines_config_when_wide( + mock_settings: MagicMock, + _columns: MagicMock, + _git: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/x") + + rendered = main_mod._rich_status_bar().plain + 
assert "\n" not in rendered + assert cs.STATUS_BAR_CONFIG_LABEL_O + ":" in rendered + + +@patch("codebase_rag.main._git_state", return_value=None) +@patch("codebase_rag.main._terminal_columns", return_value=30) +@patch("codebase_rag.main.settings") +def test_rich_status_bar_wraps_config_when_narrow( + mock_settings: MagicMock, + _columns: MagicMock, + _git: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/x") + + rendered = main_mod._rich_status_bar().plain + assert "\n" in rendered + + +def test_git_state_returns_none_without_target_repo() -> None: + main_mod.app_context.session.target_repo = None + assert main_mod._git_state() is None + + +def test_git_state_uses_target_repo_cwd( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + target = tmp_path / "target-repo" + target.mkdir() + main_mod.app_context.session.target_repo = target + + captured: dict[str, object] = {} + + class _FakeCompleted: + stdout = "## feature/x\n M something.py\n" + + def fake_run(cmd, **kwargs): # noqa: ANN001, ANN003 + captured["cmd"] = cmd + captured["cwd"] = kwargs.get("cwd") + return _FakeCompleted() + + monkeypatch.setattr(main_mod.subprocess, "run", fake_run) + + result = main_mod._git_state() + assert result is not None + branch, is_dirty = result + + assert captured["cwd"] == target + assert branch == "feature/x" + assert is_dirty is True + + +def test_git_state_returns_none_when_target_missing(tmp_path: Path) -> None: + main_mod.app_context.session.target_repo = tmp_path / "does-not-exist" + assert main_mod._git_state() is None + + +@patch("codebase_rag.main._git_state", return_value=("feature/x", True)) +@patch("codebase_rag.main._terminal_columns", return_value=400) +@patch("codebase_rag.main.settings") +def test_branch_appears_after_repo_when_inline( + mock_settings: MagicMock, + 
_columns: MagicMock, + _git: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/target") + + rendered = main_mod._rich_status_bar().plain + + repo_label = f"{cs.STATUS_BAR_CONFIG_LABEL_REPO}:/tmp/target" + assert repo_label in rendered + assert "feature/x" in rendered + assert rendered.index(repo_label) < rendered.index("feature/x") + mode_label = main_mod._permission_mode_label() + assert rendered.index(mode_label) < rendered.index("feature/x") + + +@patch("codebase_rag.main._git_state", return_value=("feature/x", False)) +@patch("codebase_rag.main._terminal_columns", return_value=400) +@patch("codebase_rag.main.settings") +def test_status_bar_html_places_branch_after_repo_when_inline( + mock_settings: MagicMock, + _columns: MagicMock, + _git: MagicMock, +) -> None: + mock_settings.active_orchestrator_config.model_id = "anthropic:claude-opus-4-7" + mock_settings.active_cypher_config.model_id = "anthropic:claude-opus-4-7" + main_mod.app_context.session.target_repo = Path("/tmp/target") + + html = main_mod._status_bar_label() + rendered = str(html.value) if hasattr(html, "value") else str(html) + + repo_idx = rendered.index(f"{cs.STATUS_BAR_CONFIG_LABEL_REPO}:") + branch_idx = rendered.index("feature/x") + assert repo_idx < branch_idx diff --git a/codebase_rag/tests/test_stdlib_extractor.py b/codebase_rag/tests/test_stdlib_extractor.py index bd09b0244..723650741 100644 --- a/codebase_rag/tests/test_stdlib_extractor.py +++ b/codebase_rag/tests/test_stdlib_extractor.py @@ -306,7 +306,7 @@ def test_js_stdlib_lowercase_entity_without_node( "fs.readFile", cs.SupportedLanguage.JS ) - assert result == "fs.readFile" + assert result == "fs" def test_ts_uses_js_extraction_uppercase(self, extractor: StdlibExtractor) -> None: with patch.object(se, "_is_tool_available", return_value=False): @@ 
-314,11 +314,11 @@ def test_ts_uses_js_extraction_uppercase(self, extractor: StdlibExtractor) -> No assert result == "path" - def test_ts_lowercase_returns_unchanged(self, extractor: StdlibExtractor) -> None: + def test_ts_lowercase_strips_entity(self, extractor: StdlibExtractor) -> None: with patch.object(se, "_is_tool_available", return_value=False): result = extractor.extract_module_path("path.join", cs.SupportedLanguage.TS) - assert result == "path.join" + assert result == "path" class TestEdgeCases: @@ -704,7 +704,7 @@ def test_js_extractor_fallback_on_entity_not_found( "fs.nonexistent", cs.SupportedLanguage.JS ) - assert result == "fs.nonexistent" + assert result == "fs" def test_js_extractor_fallback_on_json_decode_error( self, extractor: StdlibExtractor @@ -719,7 +719,7 @@ def test_js_extractor_fallback_on_json_decode_error( ): result = extractor.extract_module_path("path.join", cs.SupportedLanguage.JS) - assert result == "path.join" + assert result == "path" def test_js_extractor_fallback_on_timeout(self, extractor: StdlibExtractor) -> None: import subprocess @@ -732,4 +732,4 @@ def test_js_extractor_fallback_on_timeout(self, extractor: StdlibExtractor) -> N "http.createServer", cs.SupportedLanguage.JS ) - assert result == "http.createServer" + assert result == "http" diff --git a/codebase_rag/tests/test_structure_processor.py b/codebase_rag/tests/test_structure_processor.py index 51c23fe60..50c74ea2c 100644 --- a/codebase_rag/tests/test_structure_processor.py +++ b/codebase_rag/tests/test_structure_processor.py @@ -511,3 +511,22 @@ def test_multiple_package_indicators( ] qualified_names = {c[0][1]["qualified_name"] for c in package_calls} assert qualified_names == {"multi_lang.pypkg", "multi_lang.rustpkg"} + + +class TestStructureProcessorSlots: + def test_has_slots(self) -> None: + assert hasattr(StructureProcessor, "__slots__") + + def test_no_instance_dict(self, processor: StructureProcessor) -> None: + assert not hasattr(processor, "__dict__") + + def 
test_rejects_arbitrary_attribute(self, processor: StructureProcessor) -> None: + with pytest.raises(AttributeError): + processor.nonexistent_attr = 42 + + def test_slot_attributes_accessible(self, processor: StructureProcessor) -> None: + assert hasattr(processor, "ingestor") + assert hasattr(processor, "repo_path") + assert hasattr(processor, "project_name") + assert hasattr(processor, "queries") + assert hasattr(processor, "structural_elements") diff --git a/codebase_rag/tests/test_token_utils.py b/codebase_rag/tests/test_token_utils.py new file mode 100644 index 000000000..bbd116c13 --- /dev/null +++ b/codebase_rag/tests/test_token_utils.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from codebase_rag.types_defs import ResultRow +from codebase_rag.utils.token_utils import count_tokens, truncate_results_by_tokens + + +class TestCountTokens: + def test_empty_string(self) -> None: + assert count_tokens("") == 0 + + def test_simple_string(self) -> None: + tokens = count_tokens("hello world") + assert tokens > 0 + + def test_longer_string_has_more_tokens(self) -> None: + short = count_tokens("hello") + long = count_tokens("hello world this is a longer string with more tokens") + assert long > short + + +class TestTruncateResultsByTokens: + def test_empty_results(self) -> None: + results, tokens, truncated = truncate_results_by_tokens([], max_tokens=1000) + assert results == [] + assert tokens == 0 + assert truncated is False + + def test_results_within_limit(self) -> None: + rows: list[ResultRow] = [ + {"name": "foo", "count": 1}, + {"name": "bar", "count": 2}, + ] + results, tokens, truncated = truncate_results_by_tokens(rows, max_tokens=10000) + assert len(results) == 2 + assert tokens > 0 + assert truncated is False + + def test_results_exceed_limit(self) -> None: + rows: list[ResultRow] = [ + {"name": f"function_{i}", "path": f"src/module_{i}/file_{i}.py"} + for i in range(100) + ] + results, tokens, truncated = truncate_results_by_tokens(rows, 
max_tokens=200) + assert len(results) < 100 + assert len(results) > 0 + assert tokens <= 200 + assert truncated is True + + def test_single_large_row_still_included(self) -> None: + rows: list[ResultRow] = [ + {"content": "x" * 5000}, + ] + results, tokens, truncated = truncate_results_by_tokens(rows, max_tokens=10) + assert len(results) == 1 + assert truncated is False + + def test_preserves_row_order(self) -> None: + rows: list[ResultRow] = [ + {"name": "first"}, + {"name": "second"}, + {"name": "third"}, + ] + results, _, _ = truncate_results_by_tokens(rows, max_tokens=10000) + assert [r["name"] for r in results] == ["first", "second", "third"] + + def test_token_count_accuracy(self) -> None: + rows: list[ResultRow] = [ + {"name": "hello world"}, + ] + results, tokens, _ = truncate_results_by_tokens(rows, max_tokens=10000) + assert tokens == count_tokens('{"name": "hello world"}') diff --git a/codebase_rag/tests/test_truthiness_dispatch_resolution.py b/codebase_rag/tests/test_truthiness_dispatch_resolution.py new file mode 100644 index 000000000..9226bcb14 --- /dev/null +++ b/codebase_rag/tests/test_truthiness_dispatch_resolution.py @@ -0,0 +1,123 @@ +# (H) L3 finding from the evals/ harness: `if self.function_registry:` tests an object +# (H) for truthiness, which at runtime calls __bool__ if defined else __len__. cgr only +# (H) extracted explicit calls, missing FunctionRegistryTrie.__len__. These edges are +# (H) emitted only when the tested operand is a first-party object defining the dunder. 
+from __future__ import annotations + +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +PROJECT = "proj" + +FILES = { + "pkg/__init__.py": "", + "pkg/sized.py": ("class Sized:\n def __len__(self):\n return 0\n"), + "pkg/flag.py": ( + "class Flag:\n" + " def __bool__(self):\n return True\n\n" + " def __len__(self):\n return 0\n" + ), + "pkg/user.py": ( + "from .sized import Sized\n" + "from .flag import Flag\n\n\n" + "class User:\n" + " def __init__(self, sized: Sized, flag: Flag) -> None:\n" + " self._sized = sized\n" + " self._flag = flag\n\n" + " def _record(self):\n" + " return None\n\n" + " def check(self):\n" + " self._record()\n" + " if self._sized:\n" + " return 1\n" + " return 0\n\n" + " def combined(self, other):\n" + " self._record()\n" + " if self._sized and other:\n" + " return 1\n" + " return 0\n\n" + " def truthy_flag(self):\n" + " self._record()\n" + " if self._flag:\n" + " return 1\n" + " return 0\n" + ), +} + + +class _Capture: + def __init__(self) -> None: + self.rels: list[tuple[PropertyValue, str, PropertyValue]] = [] + + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + self.rels.append((from_spec[2], str(rel_type), to_spec[2])) + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _calls(tmp_path: Path) -> set[tuple[PropertyValue, PropertyValue]]: + for rel, content in FILES.items(): + p = 
tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + parsers, queries = load_parsers() + cap = _Capture() + GraphUpdater( + ingestor=cap, + repo_path=tmp_path, + parsers=parsers, + queries=queries, + project_name=PROJECT, + ).run(force=True) + return { + (frm, to) for (frm, rel, to) in cap.rels if rel == cs.RelationshipType.CALLS + } + + +class TestTruthinessDispatchResolution: + def test_if_truthiness_dispatches_to_len(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.check", + "proj.pkg.sized.Sized.__len__", + ) in calls, calls + + def test_boolean_operator_operand_dispatches_to_len(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.combined", + "proj.pkg.sized.Sized.__len__", + ) in calls, calls + + def test_bool_takes_precedence_over_len(self, tmp_path: Path) -> None: + calls = _calls(tmp_path) + assert ( + "proj.pkg.user.User.truthy_flag", + "proj.pkg.flag.Flag.__bool__", + ) in calls, calls + assert ( + "proj.pkg.user.User.truthy_flag", + "proj.pkg.flag.Flag.__len__", + ) not in calls, calls diff --git a/codebase_rag/tests/test_ts_closure_containment.py b/codebase_rag/tests/test_ts_closure_containment.py new file mode 100644 index 000000000..7fa1a3dac --- /dev/null +++ b/codebase_rag/tests/test_ts_closure_containment.py @@ -0,0 +1,43 @@ +# (H) A function declared inside an anonymous callback must be DEFINEd by that +# (H) callback (its lexical parent), not hoisted to the nearest named ancestor. +# (H) The child's qn omits anonymous scopes, so deriving the DEFINES parent by +# (H) trimming the child qn skipped the callback; the parent is now recomputed +# (H) from the enclosing function node itself. 
+from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import RelationshipType +from codebase_rag.tests.conftest import create_and_run_updater, get_relationships + +_TS = """\ +export function driver(client) { + test("x", function (assert) { + function inner(fn) { + return 1; + } + return inner; + }); +} +""" + + +def test_function_in_anonymous_callback_defined_by_callback( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "ts_closure" + project.mkdir() + (project / "m.ts").write_text(_TS, encoding="utf-8") + create_and_run_updater(project, mock_ingestor, skip_if_missing="typescript") + + # (H) (parent_qn, child_qn) for DEFINES edges into `inner`. + parents = { + call[0][0][2] + for call in get_relationships(mock_ingestor, RelationshipType.DEFINES.value) + if str(call[0][2][2]).endswith(".inner") + } + assert parents, "no DEFINES edge into inner" + # (H) The parent must be the anonymous callback, not the named driver. 
+ assert all("anonymous" in p for p in parents), parents + assert "ts_closure.m.driver" not in parents, parents diff --git a/codebase_rag/tests/test_type_inference_iterative.py b/codebase_rag/tests/test_type_inference_iterative.py index 76d0febeb..bf2fd80e0 100644 --- a/codebase_rag/tests/test_type_inference_iterative.py +++ b/codebase_rag/tests/test_type_inference_iterative.py @@ -3,7 +3,7 @@ from collections import defaultdict from pathlib import Path from typing import Any -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -88,15 +88,16 @@ def test_analyze_self_assignments_handles_deep_tree_without_recursion_error() -> engine = _make_engine() py_engine = engine.python_type_inference - py_engine._infer_type_from_expression = MagicMock(return_value="MockType") # type: ignore[method-assign] + mock_infer = MagicMock(return_value="MockType") root = _build_deep_assignment_chain(depth=1500) local_types: dict[str, Any] = {} - py_engine._analyze_self_assignments(root, local_types, "proj.module") # ty: ignore[invalid-argument-type] # (H) NodeStub not Node + with patch.object(type(py_engine), "_infer_type_from_expression", mock_infer): + py_engine._analyze_self_assignments(root, local_types, "proj.module") # ty: ignore[invalid-argument-type] # (H) NodeStub not Node assert local_types, "Expected at least one inferred instance variable" - assert py_engine._infer_type_from_expression.call_count == 1500 # type: ignore[attr-defined] + assert mock_infer.call_count == 1500 def test_find_return_statements_handles_deep_tree_without_recursion_error() -> None: @@ -162,86 +163,95 @@ def test_dispatches_to_python_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"var1": "str"} - engine.python_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", 
cs.SupportedLanguage.PYTHON - ) + with patch.object( + PythonTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.PYTHON + ) assert result == expected - engine.python_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") def test_dispatches_to_js_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"jsVar": "number"} - engine.js_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.JS - ) + with patch.object( + JsTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.JS + ) assert result == expected - engine.js_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" + mock_method.assert_called_once_with( + mock_node, "proj.module", cs.SupportedLanguage.JS ) def test_dispatches_to_ts_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"tsVar": "string"} - engine.js_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.TS - ) + with patch.object( + JsTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.TS + ) assert result == expected - engine.js_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" + 
mock_method.assert_called_once_with( + mock_node, "proj.module", cs.SupportedLanguage.TS ) def test_dispatches_to_java_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"javaVar": "String"} - engine.java_type_inference.build_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.JAVA - ) + with patch.object( + JavaTypeInferenceEngine, + "build_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.JAVA + ) assert result == expected - engine.java_type_inference.build_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") def test_dispatches_to_lua_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"luaVar": "table"} - engine.lua_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.LUA - ) + with patch.object( + LuaTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.LUA + ) assert result == expected - engine.lua_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") @pytest.mark.parametrize( "language", @@ -250,7 +260,6 @@ def test_dispatches_to_lua_engine( cs.SupportedLanguage.GO, cs.SupportedLanguage.SCALA, cs.SupportedLanguage.CPP, - cs.SupportedLanguage.CSHARP, cs.SupportedLanguage.PHP, ], ) @@ -320,13 +329,16 @@ def test_delegates_to_java_engine(self) -> None: engine = 
_make_engine() mock_node = MagicMock() expected = {"javaVar": "String", "count": "int"} - engine.java_type_inference.build_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine._build_java_variable_type_map(mock_node, "com.example.Module") + with patch.object( + JavaTypeInferenceEngine, + "build_variable_type_map", + mock_method, + ): + result = engine._build_java_variable_type_map( + mock_node, "com.example.Module" + ) assert result == expected - engine.java_type_inference.build_variable_type_map.assert_called_once_with( - mock_node, "com.example.Module" - ) + mock_method.assert_called_once_with(mock_node, "com.example.Module") diff --git a/codebase_rag/tests/test_typescript_containment_oracle.py b/codebase_rag/tests/test_typescript_containment_oracle.py new file mode 100644 index 000000000..15e5b4c81 --- /dev/null +++ b/codebase_rag/tests/test_typescript_containment_oracle.py @@ -0,0 +1,66 @@ +# (H) Covers TypeScript containment-edge validation: cgr's DEFINES (file module +# (H) -> every named type, even nested) and DEFINES_METHOD (class/namespace -> +# (H) method) edges are graded against the independent TypeScript-compiler-API +# (H) oracle, joined on (kind, file, line). Exercises a class method, a top-level +# (H) function, and a namespace (class + function as methods of the namespace). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_ts_graph +from evals.oracles import run_typescript_oracle, typescript_available +from evals.score import score_edge_types + +TS_SRC = """\ +export interface Shape { area(): number; } + +export enum Color { Red, Green } + +export class Point implements Shape { + x: number = 0; + area(): number { return 1.0; } +} + +export function free(): number { return 1; } + +export namespace geo { + export class Widget { build(): number { return 1; } } + export function helper(): number { return 2; } +} +""" + + +def _require_ts() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.TS not in load_parsers()[0]: + pytest.skip("typescript parser not available") + + +def test_cgr_matches_tsc_oracle_on_containment_edges(tmp_path: Path) -> None: + _require_ts() + project = tmp_path / "ts_edge" + project.mkdir() + (project / "lib.ts").write_text(TS_SRC, encoding="utf-8") + + cgr = extract_cgr_ts_graph(project, project.name) + oracle = run_typescript_oracle(project) + + result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.DEFINES.value, + cs.RelationshipType.DEFINES_METHOD.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_typescript_implements_edges.py b/codebase_rag/tests/test_typescript_implements_edges.py new file mode 100644 index 000000000..dc75804d0 --- /dev/null +++ b/codebase_rag/tests/test_typescript_implements_edges.py @@ -0,0 +1,42 @@ +# (H) TypeScript class `implements` was dropped: 
cgr captured `extends` +# (H) (-> INHERITS) via class_heritage but never the `implements_clause`, so a +# (H) class implementing interfaces produced no IMPLEMENTS edges. +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import RelationshipType +from codebase_rag.tests.conftest import create_and_run_updater, get_relationships + +_TS = """\ +export interface Shape {} +export interface Drawable {} +export class Base {} +export class Circle extends Base implements Shape, Drawable {} +""" + + +def _pairs(mock_ingestor: MagicMock, rel: str) -> set[tuple[str, str]]: + return { + (call[0][0][2], call[0][2][2]) for call in get_relationships(mock_ingestor, rel) + } + + +def test_typescript_class_implements_edges( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "ts_impl" + project.mkdir() + (project / "lib.ts").write_text(_TS, encoding="utf-8") + create_and_run_updater(project, mock_ingestor, skip_if_missing="typescript") + + inherits = _pairs(mock_ingestor, RelationshipType.INHERITS.value) + implements = _pairs(mock_ingestor, RelationshipType.IMPLEMENTS.value) + base = "ts_impl.lib" + + # (H) extends still works. + assert (f"{base}.Circle", f"{base}.Base") in inherits, inherits + # (H) implements must now produce IMPLEMENTS to each interface. 
+ assert (f"{base}.Circle", f"{base}.Shape") in implements, implements + assert (f"{base}.Circle", f"{base}.Drawable") in implements, implements diff --git a/codebase_rag/tests/test_typescript_inheritance_oracle.py b/codebase_rag/tests/test_typescript_inheritance_oracle.py new file mode 100644 index 000000000..414433e69 --- /dev/null +++ b/codebase_rag/tests/test_typescript_inheritance_oracle.py @@ -0,0 +1,54 @@ +# (H) Covers TypeScript inheritance-edge validation: cgr's INHERITS (class & +# (H) interface extends) and IMPLEMENTS (class implements) edges are graded +# (H) against the TypeScript-compiler-API oracle, by (source node, base name). +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_ts_graph +from evals.oracles import run_typescript_oracle, typescript_available +from evals.score import score_name_edge_types + +TS_SRC = """\ +export interface Shape {} +export interface Drawable {} +export interface Big extends Shape, Drawable {} +export class Base {} +export class Circle extends Base implements Shape, Drawable {} +""" + + +def _require_ts() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.TS not in load_parsers()[0]: + pytest.skip("typescript parser not available") + + +def test_cgr_matches_tsc_oracle_on_inheritance_edges(tmp_path: Path) -> None: + _require_ts() + project = tmp_path / "ts_inh_edge" + project.mkdir() + (project / "lib.ts").write_text(TS_SRC, encoding="utf-8") + + cgr = extract_cgr_ts_graph(project, project.name) + oracle = run_typescript_oracle(project) + + result = score_name_edge_types(cgr, oracle, ec.INHERITANCE_NAME_EDGE_TYPES) + by_label = {row["label"]: row for row in result.rows} + for label in ( + cs.RelationshipType.INHERITS.value, + 
cs.RelationshipType.IMPLEMENTS.value, + ): + row = by_label.get(label) + assert row is not None, (label, by_label, result.diff) + assert row["precision"] == 1.0 and row["recall"] == 1.0, ( + label, + row, + result.diff, + ) diff --git a/codebase_rag/tests/test_typescript_namespace_qn.py b/codebase_rag/tests/test_typescript_namespace_qn.py new file mode 100644 index 000000000..3d0f4ba43 --- /dev/null +++ b/codebase_rag/tests/test_typescript_namespace_qn.py @@ -0,0 +1,41 @@ +# (H) A class declared inside a TypeScript `namespace` must carry the namespace +# (H) in its qualified name (proj...geo.Widget), like a nested function does. +# (H) The class FQN scope walk listed the wrong node type ("namespace_definition" +# (H) instead of the grammar's "internal_module"), so it skipped the namespace +# (H) and produced an unscoped qn that collides with a top-level same-named type. +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.constants import KEY_QUALIFIED_NAME, NodeLabel +from codebase_rag.tests.conftest import create_and_run_updater, get_nodes + +_TS = """\ +export namespace geo { + export class Widget { + build(): number { return 1; } + } +} + +export class Widget { + other(): number { return 2; } +} +""" + + +def test_typescript_namespace_class_qn_includes_namespace( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project = temp_repo / "ts_ns" + project.mkdir() + (project / "lib.ts").write_text(_TS, encoding="utf-8") + create_and_run_updater(project, mock_ingestor, skip_if_missing="typescript") + + class_qns = { + str(node[0][1].get(KEY_QUALIFIED_NAME)) + for node in get_nodes(mock_ingestor, NodeLabel.CLASS) + } + # (H) The namespaced class and the top-level class must be distinct nodes. 
+ assert "ts_ns.lib.geo.Widget" in class_qns, class_qns + assert "ts_ns.lib.Widget" in class_qns, class_qns diff --git a/codebase_rag/tests/test_typescript_span_oracle.py b/codebase_rag/tests/test_typescript_span_oracle.py new file mode 100644 index 000000000..de1076ff7 --- /dev/null +++ b/codebase_rag/tests/test_typescript_span_oracle.py @@ -0,0 +1,82 @@ +# (H) Covers TypeScript node SPAN (end_line) validation: cgr's end_line for each +# (H) node is graded against the TS-compiler-API oracle (which emits each node's +# (H) full-span end line), joined on (kind, file, start). Exercises a class with a +# (H) multi-line method signature, an interface, an enum, a type alias, a +# (H) namespace, and a multi-line arrow function so spans are not trivially single +# (H) line. +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_ts_graph +from evals.oracles import run_typescript_oracle, typescript_available +from evals.score import score_span + +TS_SRC = """\ +export class Widget { + area( + scale: number, + ): number { + return scale; + } +} + +export interface Shape { + area(): number; +} + +export enum Color { + Red, + Green, +} + +export type Pair = { + a: number; + b: number; +}; + +export namespace geo { + export function dist(): number { + return 1; + } +} + +export function standalone(): number { + const cb = (v: number) => { + return v + 1; + }; + return cb(2); +} +""" + + +def _require_ts() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.TS not in load_parsers()[0]: + pytest.skip("typescript parser not available") + + +def test_cgr_matches_tsc_oracle_on_node_spans(tmp_path: Path) -> None: + _require_ts() + project = tmp_path / "ts_span_test" + project.mkdir() + (project / 
"main.ts").write_text(TS_SRC, encoding="utf-8") + + cgr = extract_cgr_ts_graph(project, project.name) + oracle = run_typescript_oracle(project) + + result = score_span(cgr, oracle, ec.TS_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + aggregate = by_label.get(ec.AGGREGATE_LABEL) + assert aggregate is not None, (by_label, result.diff) + assert aggregate["precision"] == 1.0 and aggregate["recall"] == 1.0, ( + aggregate, + result.diff, + ) + assert aggregate["tp"] >= 5, aggregate diff --git a/codebase_rag/tests/test_typescript_structure_oracle.py b/codebase_rag/tests/test_typescript_structure_oracle.py new file mode 100644 index 000000000..bdb4f8972 --- /dev/null +++ b/codebase_rag/tests/test_typescript_structure_oracle.py @@ -0,0 +1,61 @@ +# (H) Covers the TypeScript structure oracle harness (evals/oracles/ts_oracle + +# (H) evals/ts_l1.py): the TS-compiler-API oracle is authoritative ground truth, +# (H) and cgr's captured TypeScript nodes are graded against it on +# (H) (kind, file, start_line). 
+from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from evals import constants as ec +from evals.cgr_graph import extract_cgr_ts_nodes +from evals.oracles import run_typescript_oracle, typescript_available +from evals.score import score_node_kinds +from evals.types_defs import GraphData + +TS_SRC = """\ +export interface Shape { area(): number; } +export type Meters = number; +export enum Color { Red, Green, Blue } + +export class Point implements Shape { + x: number; + constructor(x: number) { this.x = x; } + area(): number { return this.x; } +} + +export function freeFn(a: number): number { return a + 1; } +export const arrow = (b: number): number => b * 2; +[1, 2].forEach((n) => freeFn(n)); +""" + + +def _require_ts() -> None: + if not typescript_available(): + pytest.skip("node/npm toolchain not available") + if cs.SupportedLanguage.TS not in load_parsers()[0]: + pytest.skip("typescript parser not available") + + +def test_cgr_matches_tsc_oracle_on_typescript_structure(tmp_path: Path) -> None: + _require_ts() + project = tmp_path / "ts_oracle_test" + project.mkdir() + (project / "app.ts").write_text(TS_SRC, encoding="utf-8") + + cgr = GraphData( + nodes=extract_cgr_ts_nodes(project, project.name), + edges=set(), + name_edges=set(), + ) + oracle = run_typescript_oracle(project) + + result = score_node_kinds(cgr, oracle, ec.TS_SCORED_NODE_KINDS) + by_label = {row["label"]: row for row in result.rows} + for label in ("Class", "Interface", "Enum", "Type", "Function", "Method"): + row = by_label.get(label) + assert row is not None, (label, by_label) + assert row["precision"] == 1.0 and row["recall"] == 1.0, (label, row) diff --git a/codebase_rag/tests/test_unixcoder_unit.py b/codebase_rag/tests/test_unixcoder_unit.py index bf8a807c7..fffc29e25 100644 --- a/codebase_rag/tests/test_unixcoder_unit.py +++ 
b/codebase_rag/tests/test_unixcoder_unit.py @@ -1,8 +1,11 @@ from __future__ import annotations +from unittest.mock import MagicMock + import torch +from torch import nn -from codebase_rag.unixcoder import Beam +from codebase_rag.unixcoder import Beam, UniXcoder class TestBeamInit: @@ -170,6 +173,38 @@ def test_handles_no_eos(self) -> None: assert len(result[0]) == 3 +class TestForwardAttentionMask: + def _make_uninitialized(self, pad_id: int) -> UniXcoder: + instance = UniXcoder.__new__(UniXcoder) + nn.Module.__init__(instance) + instance.config = MagicMock() + instance.config.pad_token_id = pad_id + return instance + + def test_attention_mask_is_4d(self) -> None: + instance = self._make_uninitialized(pad_id=1) + captured: dict[str, torch.Size] = {} + + def fake_model( + source_ids: torch.Tensor, attention_mask: torch.Tensor + ) -> tuple[torch.Tensor]: + captured["shape"] = attention_mask.shape + batch, seq = source_ids.shape + return (torch.zeros(batch, seq, 8),) + + instance.model = MagicMock(side_effect=fake_model) + + source_ids = torch.tensor([[2, 3, 4, 5, 1], [2, 3, 1, 1, 1]]) + instance.forward(source_ids) + + assert "shape" in captured + assert len(captured["shape"]) == 4 + assert captured["shape"][0] == 2 + assert captured["shape"][1] == 1 + assert captured["shape"][2] == 5 + assert captured["shape"][3] == 5 + + class TestBeamGetHyp: def test_constructs_hypothesis_path(self) -> None: beam = Beam(size=2, eos=2, device=torch.device("cpu")) diff --git a/codebase_rag/tests/test_vector_store.py b/codebase_rag/tests/test_vector_store.py index c4b0c0bad..57ccd3c36 100644 --- a/codebase_rag/tests/test_vector_store.py +++ b/codebase_rag/tests/test_vector_store.py @@ -78,6 +78,56 @@ def integration_client( pass +@pytest.mark.skipif(not has_qdrant_client(), reason="qdrant-client not installed") +def test_get_qdrant_client_uses_url_when_set(reset_global_client: None) -> None: + import codebase_rag.vector_store as vs + + with patch.object(vs.settings, "QDRANT_URL", 
"http://localhost:6333"): + with patch("codebase_rag.vector_store.QdrantClient") as mock_client_cls: + instance = MagicMock() + instance.collection_exists.return_value = True + mock_client_cls.return_value = instance + vs.get_qdrant_client() + + mock_client_cls.assert_called_once_with(url="http://localhost:6333") + + +@pytest.mark.skipif(not has_qdrant_client(), reason="qdrant-client not installed") +def test_get_qdrant_client_uses_path_when_url_unset( + reset_global_client: None, +) -> None: + import codebase_rag.vector_store as vs + + with patch.object(vs.settings, "QDRANT_URL", None): + with patch.object(vs.settings, "QDRANT_DB_PATH", "/tmp/qd"): + with patch("codebase_rag.vector_store.QdrantClient") as mock_client_cls: + instance = MagicMock() + instance.collection_exists.return_value = True + mock_client_cls.return_value = instance + vs.get_qdrant_client() + + mock_client_cls.assert_called_once_with(path="/tmp/qd") + + +@pytest.mark.skipif(not has_qdrant_client(), reason="qdrant-client not installed") +def test_get_qdrant_client_logs_and_reraises_on_lock_error( + reset_global_client: None, +) -> None: + import codebase_rag.vector_store as vs + + with patch.object(vs.settings, "QDRANT_URL", None): + with patch.object(vs.settings, "QDRANT_DB_PATH", "/tmp/qd_locked"): + with patch("codebase_rag.vector_store.QdrantClient") as mock_client_cls: + mock_client_cls.side_effect = RuntimeError( + "Storage folder is already accessed by another instance" + ) + with patch("codebase_rag.vector_store.logger") as mock_logger: + with pytest.raises(RuntimeError): + vs.get_qdrant_client() + + mock_logger.error.assert_called_once() + + @pytest.mark.skipif(not has_qdrant_client(), reason="qdrant-client not installed") def test_store_embedding_calls_upsert( mock_qdrant_client: MagicMock, reset_global_client: None diff --git a/codebase_rag/tests/test_vector_store_batch.py b/codebase_rag/tests/test_vector_store_batch.py new file mode 100644 index 000000000..597ebd2d2 --- /dev/null +++ 
b/codebase_rag/tests/test_vector_store_batch.py @@ -0,0 +1,225 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag.utils.dependencies import has_qdrant_client + +pytestmark = pytest.mark.skipif( + not has_qdrant_client(), reason="qdrant-client not installed" +) + +_PATCH_CLIENT = "codebase_rag.vector_store.get_qdrant_client" +_PATCH_SLEEP = "codebase_rag.vector_store.time.sleep" + + +class TestUpsertWithRetry: + def test_succeeds_on_first_attempt(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_point = MagicMock() + + with patch(_PATCH_CLIENT, return_value=mock_client): + _upsert_with_retry([mock_point]) + + mock_client.upsert.assert_called_once() + + def test_retries_on_failure_then_succeeds(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_client.upsert.side_effect = [ + ConnectionError("timeout"), + None, + ] + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP) as mock_sleep, + ): + _upsert_with_retry([MagicMock()]) + + assert mock_client.upsert.call_count == 2 + mock_sleep.assert_called_once() + + def test_raises_after_exhausting_retries(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_client.upsert.side_effect = ConnectionError("timeout") + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP), + pytest.raises(ConnectionError, match="timeout"), + ): + _upsert_with_retry([MagicMock()]) + + def test_exponential_backoff_delays(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_client.upsert.side_effect = [ + ConnectionError("fail"), + ConnectionError("fail"), + None, + ] + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP) as mock_sleep, + ): + _upsert_with_retry([MagicMock()]) + + delays = [c.args[0] for 
c in mock_sleep.call_args_list] + assert delays[1] > delays[0] + + +class TestStoreEmbeddingBatch: + def test_returns_count_on_success(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + mock_client = MagicMock() + points = [ + (1, [0.1] * 768, "mod.func1"), + (2, [0.2] * 768, "mod.func2"), + ] + + with patch(_PATCH_CLIENT, return_value=mock_client): + result = store_embedding_batch(points) + + assert result == 2 + + def test_returns_zero_on_empty(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + result = store_embedding_batch([]) + assert result == 0 + + def test_returns_zero_on_failure(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + mock_client = MagicMock() + mock_client.upsert.side_effect = Exception("fail") + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP), + ): + result = store_embedding_batch([(1, [0.1] * 768, "mod.func")]) + + assert result == 0 + + def test_builds_correct_point_structs(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + mock_client = MagicMock() + embedding = [0.5] * 768 + points = [(42, embedding, "pkg.module.fn")] + + with patch(_PATCH_CLIENT, return_value=mock_client): + store_embedding_batch(points) + + call_kwargs = mock_client.upsert.call_args[1] + stored_points = call_kwargs["points"] + assert len(stored_points) == 1 + assert stored_points[0].id == 42 + assert stored_points[0].vector == embedding + assert stored_points[0].payload["node_id"] == 42 + assert stored_points[0].payload["qualified_name"] == "pkg.module.fn" + + +class TestDeleteProjectEmbeddings: + def test_deletes_given_ids(self) -> None: + from codebase_rag.vector_store import delete_project_embeddings + + mock_client = MagicMock() + node_ids = [1, 2, 3] + + with patch(_PATCH_CLIENT, return_value=mock_client): + delete_project_embeddings("myproject", node_ids) + + mock_client.delete.assert_called_once() + call_kwargs = 
mock_client.delete.call_args[1] + assert call_kwargs["points_selector"] == [1, 2, 3] + + def test_noop_on_empty_ids(self) -> None: + from codebase_rag.vector_store import delete_project_embeddings + + mock_client = MagicMock() + + with patch(_PATCH_CLIENT, return_value=mock_client): + delete_project_embeddings("myproject", []) + + mock_client.delete.assert_not_called() + + def test_handles_exception_gracefully(self) -> None: + from codebase_rag.vector_store import delete_project_embeddings + + mock_client = MagicMock() + mock_client.delete.side_effect = Exception("connection lost") + + with patch(_PATCH_CLIENT, return_value=mock_client): + delete_project_embeddings("myproject", [1, 2]) + + +class TestVerifyStoredIds: + def test_returns_found_ids(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + mock_client = MagicMock() + mock_point_1 = MagicMock() + mock_point_1.id = 1 + mock_point_2 = MagicMock() + mock_point_2.id = 3 + mock_client.retrieve.return_value = [mock_point_1, mock_point_2] + + with patch(_PATCH_CLIENT, return_value=mock_client): + result = verify_stored_ids({1, 2, 3}) + + assert result == {1, 3} + + def test_returns_empty_for_empty_input(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + result = verify_stored_ids(set()) + assert result == set() + + def test_raises_on_exception(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + mock_client = MagicMock() + mock_client.retrieve.side_effect = Exception("fail") + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + pytest.raises(Exception, match="fail"), + ): + verify_stored_ids({1, 2}) + + def test_batches_large_id_sets(self) -> None: + from codebase_rag.vector_store import _RETRIEVE_BATCH_SIZE, verify_stored_ids + + mock_client = MagicMock() + mock_client.retrieve.return_value = [] + + large_id_set = set(range(_RETRIEVE_BATCH_SIZE + 100)) + + with patch(_PATCH_CLIENT, return_value=mock_client): + 
verify_stored_ids(large_id_set) + + assert mock_client.retrieve.call_count == 2 + + def test_retrieve_called_with_correct_params(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + mock_client = MagicMock() + mock_client.retrieve.return_value = [] + + with patch(_PATCH_CLIENT, return_value=mock_client): + verify_stored_ids({10, 20}) + + call_kwargs = mock_client.retrieve.call_args[1] + assert call_kwargs["with_payload"] is False + assert call_kwargs["with_vectors"] is False + assert set(call_kwargs["ids"]) == {10, 20} diff --git a/codebase_rag/tests/test_workspaces.py b/codebase_rag/tests/test_workspaces.py new file mode 100644 index 000000000..a4078d1ed --- /dev/null +++ b/codebase_rag/tests/test_workspaces.py @@ -0,0 +1,251 @@ +from __future__ import annotations + +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag.cli import app +from codebase_rag.workspaces import ( + WorkspaceError, + add_repo, + create_workspace, + delete_workspace, + list_workspaces, + load_workspace, + remove_repo, +) +from codebase_rag.workspaces.models import WorkspaceConfig + +runner = CliRunner() + + +@pytest.fixture(autouse=True) +def _temp_home( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> Generator[Path, None, None]: + from codebase_rag.config import settings + + monkeypatch.setattr(settings, "CGR_HOME", tmp_path / "cgr-home") + yield tmp_path / "cgr-home" + + +class TestStorage: + def test_create_then_load(self, _temp_home: Path) -> None: + config, _ = create_workspace("alpha", description="testing") + assert config.name == "alpha" + loaded = load_workspace("alpha") + assert loaded.name == "alpha" + assert loaded.description == "testing" + assert loaded.repos == [] + + def test_create_duplicate_raises(self, _temp_home: Path) -> None: + create_workspace("dup") + with pytest.raises(WorkspaceError): + 
create_workspace("dup") + + def test_create_with_force_overwrites(self, _temp_home: Path) -> None: + create_workspace("over", description="first") + config, _ = create_workspace("over", description="second", overwrite=True) + assert config.description == "second" + + def test_load_missing_raises(self, _temp_home: Path) -> None: + with pytest.raises(WorkspaceError): + load_workspace("nope") + + def test_list_empty(self, _temp_home: Path) -> None: + assert list_workspaces() == [] + + def test_list_sorted(self, _temp_home: Path) -> None: + create_workspace("b") + create_workspace("a") + create_workspace("c") + assert list_workspaces() == ["a", "b", "c"] + + def test_delete(self, _temp_home: Path) -> None: + create_workspace("kill") + delete_workspace("kill") + with pytest.raises(WorkspaceError): + load_workspace("kill") + + def test_delete_missing_raises(self, _temp_home: Path) -> None: + with pytest.raises(WorkspaceError): + delete_workspace("nope") + + def test_add_repo_derives_project_name( + self, tmp_path: Path, _temp_home: Path + ) -> None: + repo_dir = tmp_path / "some_repo" + repo_dir.mkdir() + create_workspace("mono") + config, repo = add_repo("mono", str(repo_dir)) + assert repo.path == str(repo_dir.resolve()) + assert repo.project_name.startswith("some_repo__") + assert config.repos[0].project_name == repo.project_name + + def test_add_repo_with_explicit_project_name( + self, tmp_path: Path, _temp_home: Path + ) -> None: + repo_dir = tmp_path / "second_repo" + repo_dir.mkdir() + create_workspace("mono") + _, repo = add_repo("mono", str(repo_dir), project_name="custom_name") + assert repo.project_name == "custom_name" + + def test_add_repo_missing_path(self, tmp_path: Path, _temp_home: Path) -> None: + create_workspace("mono") + with pytest.raises(WorkspaceError): + add_repo("mono", str(tmp_path / "does_not_exist")) + + def test_add_repo_duplicate(self, tmp_path: Path, _temp_home: Path) -> None: + repo_dir = tmp_path / "dup_repo" + repo_dir.mkdir() + 
create_workspace("mono") + add_repo("mono", str(repo_dir)) + with pytest.raises(WorkspaceError): + add_repo("mono", str(repo_dir)) + + def test_remove_repo(self, tmp_path: Path, _temp_home: Path) -> None: + repo_dir = tmp_path / "rem_repo" + repo_dir.mkdir() + create_workspace("mono") + add_repo("mono", str(repo_dir)) + config, _ = remove_repo("mono", str(repo_dir)) + assert config.repos == [] + + def test_remove_repo_not_in_workspace( + self, tmp_path: Path, _temp_home: Path + ) -> None: + repo_dir = tmp_path / "missing_repo" + repo_dir.mkdir() + create_workspace("mono") + with pytest.raises(WorkspaceError): + remove_repo("mono", str(repo_dir)) + + +class TestCli: + def test_workspace_list_empty(self, _temp_home: Path) -> None: + result = runner.invoke(app, ["workspace", "list"]) + assert result.exit_code == 0, result.output + assert "no workspaces" in result.output.lower() + + def test_workspace_create_list_show_delete( + self, tmp_path: Path, _temp_home: Path + ) -> None: + result = runner.invoke(app, ["workspace", "create", "mono"]) + assert result.exit_code == 0, result.output + + result = runner.invoke(app, ["workspace", "list"]) + assert "mono" in result.output + + result = runner.invoke(app, ["workspace", "show", "mono"]) + assert "mono" in result.output + + result = runner.invoke(app, ["workspace", "delete", "mono"]) + assert result.exit_code == 0, result.output + + result = runner.invoke(app, ["workspace", "list"]) + assert "no workspaces" in result.output.lower() + + def test_workspace_add_remove_repo_via_cli( + self, tmp_path: Path, _temp_home: Path + ) -> None: + repo_dir = tmp_path / "the_repo" + repo_dir.mkdir() + + runner.invoke(app, ["workspace", "create", "mono"]) + result = runner.invoke(app, ["workspace", "add-repo", "mono", str(repo_dir)]) + assert result.exit_code == 0, result.output + assert str(repo_dir.resolve()) in result.output + + result = runner.invoke(app, ["workspace", "show", "mono"]) + assert str(repo_dir.resolve()) in result.output 
+ + result = runner.invoke(app, ["workspace", "remove-repo", "mono", str(repo_dir)]) + assert result.exit_code == 0, result.output + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +@pytest.fixture +def mock_validate_models() -> Generator[None, None, None]: + with patch("codebase_rag.cli._update_and_validate_models"): + yield + + +def test_start_with_workspace_passes_all_projects( + mock_memgraph_connect: MagicMock, + mock_validate_models: None, + tmp_path: Path, + _temp_home: Path, +) -> None: + repo_a = tmp_path / "repo_a" + repo_b = tmp_path / "repo_b" + repo_a.mkdir() + repo_b.mkdir() + + create_workspace("mono") + add_repo("mono", str(repo_a), project_name="proj_a") + add_repo("mono", str(repo_b), project_name="proj_b") + + with ( + patch("codebase_rag.cli._run_graph_sync") as mock_sync, + patch("codebase_rag.cli.main_single_query") as mock_single, + ): + result = runner.invoke( + app, + [ + "start", + "--repo-path", + str(repo_a), + "--workspace", + "mono", + "--ask-agent", + "hi", + ], + ) + assert result.exit_code == 0, result.output + assert mock_sync.call_count == 2 + project_names_synced = [c.kwargs["project_name"] for c in mock_sync.call_args_list] + assert set(project_names_synced) == {"proj_a", "proj_b"} + mock_single.assert_called_once() + assert mock_single.call_args.kwargs["active_projects"] == ["proj_a", "proj_b"] + + +def test_start_with_unknown_workspace_errors( + mock_memgraph_connect: MagicMock, + mock_validate_models: None, + tmp_path: Path, + _temp_home: Path, +) -> None: + result = runner.invoke( + app, + [ + "start", + "--repo-path", + str(tmp_path), + "--workspace", + "doesnotexist", + "--ask-agent", + "hi", + "--no-sync", + ], + ) + 
assert result.exit_code != 0 + + +def test_workspace_model_project_names() -> None: + config = WorkspaceConfig( + name="x", + repos=[], + ) + assert config.project_names() == [] diff --git a/codebase_rag/tool_errors.py b/codebase_rag/tool_errors.py index 25540a976..50be918c6 100644 --- a/codebase_rag/tool_errors.py +++ b/codebase_rag/tool_errors.py @@ -6,34 +6,20 @@ # (H) File operation errors FILE_NOT_FOUND = "File not found." FILE_NOT_FOUND_OR_DIR = "File not found or is a directory: {path}" -BINARY_FILE = "File '{path}' is a binary file. Use the 'analyze_document' tool for this file type." +BINARY_FILE = "File '{path}' is a binary file. Ask the user to attach it inline if they want it analyzed." UNICODE_DECODE = ( "File '{path}' could not be read as text. It may be a binary file. " - "If it is a document (e.g., PDF), use the 'analyze_document' tool." + "If it is a document (e.g., PDF), ask the user to attach it inline." ) -# (H) Document analyzer errors -DOCUMENT_UNSUPPORTED = ( - "Error: Document analysis is not supported for the current LLM provider." -) -DOC_FILE_NOT_FOUND = "Error: File not found at '{path}'." -DOC_SECURITY_RISK = "Error: Security risk: file path {path} is outside the project root" -DOC_ACCESS_OUTSIDE_ROOT = ( - "Error: Security risk: Attempted to access file outside of project root: {path}" -) -DOC_API_VALIDATION = "Error: API validation failed: {error}" -DOC_API_ERROR = "Error: API error: {error}" -DOC_IMAGE_PROCESS = ( - "Error: Unable to process the image file. " - "The image may be corrupted or in an unsupported format." -) -DOC_ANALYSIS_FAILED = "Error: An error occurred during analysis: {error}" -DOC_DURING_ANALYSIS = "Error: Document analysis failed: {error}" - # (H) Directory errors DIRECTORY_INVALID = "Error: '{path}' is not a valid directory." DIRECTORY_EMPTY = "Error: The directory '{path}' is empty." DIRECTORY_LIST_FAILED = "Error: Could not list contents of '{path}'." 
+DIRECTORY_PATH_OUTSIDE_ROOT = ( + "Error: '{path}' is outside the project root ({root}). " + "Use a relative path from the project root, or the full absolute path within it." +) # (H) Shell command errors COMMAND_NOT_ALLOWED = "Command '{cmd}' is not in the allowlist.{suggestion} Available commands: {available}" @@ -69,3 +55,4 @@ # (H) CLI validation errors INVALID_POSITIVE_INT = "{value!r} is not a valid positive integer" +INVALID_NON_NEGATIVE_FLOAT = "Value must be non-negative, got {value}" diff --git a/codebase_rag/tools/code_retrieval.py b/codebase_rag/tools/code_retrieval.py index 2e6331dcd..bd04cce0a 100644 --- a/codebase_rag/tools/code_retrieval.py +++ b/codebase_rag/tools/code_retrieval.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio from pathlib import Path from loguru import logger @@ -15,6 +16,8 @@ class CodeRetriever: + __slots__ = ("project_root", "ingestor") + def __init__(self, project_root: str, ingestor: QueryProtocol): self.project_root = Path(project_root).resolve() self.ingestor = ingestor @@ -25,7 +28,9 @@ async def find_code_snippet(self, qualified_name: str) -> CodeSnippet: params = {"qn": qualified_name} try: - results = self.ingestor.fetch_all(CYPHER_FIND_BY_QUALIFIED_NAME, params) + results = await asyncio.to_thread( + self.ingestor.fetch_all, CYPHER_FIND_BY_QUALIFIED_NAME, params + ) if not results: return CodeSnippet( diff --git a/codebase_rag/tools/codebase_query.py b/codebase_rag/tools/codebase_query.py index 690a979bb..33c6c9cca 100644 --- a/codebase_rag/tools/codebase_query.py +++ b/codebase_rag/tools/codebase_query.py @@ -1,5 +1,7 @@ from __future__ import annotations +import asyncio + from loguru import logger from pydantic_ai import Tool from rich.console import Console @@ -8,16 +10,20 @@ from .. import exceptions as ex from .. 
import logs as ls +from ..config import settings from ..constants import ( QUERY_NOT_AVAILABLE, QUERY_RESULTS_PANEL_TITLE, QUERY_SUMMARY_DB_ERROR, QUERY_SUMMARY_SUCCESS, + QUERY_SUMMARY_TIMEOUT, QUERY_SUMMARY_TRANSLATION_FAILED, + QUERY_SUMMARY_TRUNCATED, ) from ..schemas import QueryGraphData from ..services import QueryProtocol from ..services.llm import CypherGenerator +from ..utils.token_utils import truncate_results_by_tokens from . import tool_descriptions as td @@ -27,7 +33,7 @@ def create_query_tool( console: Console | None = None, ) -> Tool: if console is None: - console = Console(width=None, force_terminal=True) + console = Console(width=None, stderr=True, force_terminal=True) async def query_codebase_knowledge_graph( natural_language_query: str, @@ -37,7 +43,20 @@ async def query_codebase_knowledge_graph( try: cypher_query = await cypher_gen.generate(natural_language_query) - results = ingestor.fetch_all(cypher_query) + results = await asyncio.wait_for( + asyncio.to_thread(ingestor.fetch_all, cypher_query), + timeout=settings.QUERY_TIMEOUT_S, + ) + + total_count = len(results) + if total_count > settings.QUERY_RESULT_ROW_CAP: + results = results[: settings.QUERY_RESULT_ROW_CAP] + + results, tokens_used, was_truncated = truncate_results_by_tokens( + results, + max_tokens=settings.QUERY_RESULT_MAX_TOKENS, + original_total=total_count, + ) if results: table = Table( @@ -69,7 +88,15 @@ async def query_codebase_knowledge_graph( ) ) - summary = QUERY_SUMMARY_SUCCESS.format(count=len(results)) + if was_truncated or total_count > len(results): + summary = QUERY_SUMMARY_TRUNCATED.format( + kept=len(results), + total=total_count, + tokens=tokens_used, + max_tokens=settings.QUERY_RESULT_MAX_TOKENS, + ) + else: + summary = QUERY_SUMMARY_SUCCESS.format(count=len(results)) return QueryGraphData( query_used=cypher_query, results=results, summary=summary ) @@ -79,6 +106,17 @@ async def query_codebase_knowledge_graph( results=[], 
summary=QUERY_SUMMARY_TRANSLATION_FAILED.format(error=e), ) + except TimeoutError: + logger.warning( + ls.TOOL_QUERY_TIMEOUT.format( + timeout=settings.QUERY_TIMEOUT_S, query=cypher_query + ) + ) + return QueryGraphData( + query_used=cypher_query, + results=[], + summary=QUERY_SUMMARY_TIMEOUT.format(timeout=settings.QUERY_TIMEOUT_S), + ) except Exception as e: logger.exception(ls.TOOL_QUERY_ERROR.format(error=e)) return QueryGraphData( diff --git a/codebase_rag/tools/directory_lister.py b/codebase_rag/tools/directory_lister.py index 01136a193..92afcb920 100644 --- a/codebase_rag/tools/directory_lister.py +++ b/codebase_rag/tools/directory_lister.py @@ -13,11 +13,19 @@ class DirectoryLister: + __slots__ = ("project_root",) + def __init__(self, project_root: str): self.project_root = Path(project_root).resolve() def list_directory_contents(self, directory_path: str) -> str: - target_path = self._get_safe_path(directory_path) + try: + target_path = self._get_safe_path(directory_path) + except PermissionError: + return te.DIRECTORY_PATH_OUTSIDE_ROOT.format( + path=directory_path, root=self.project_root + ) + logger.info(ls.DIR_LISTING.format(path=target_path)) try: diff --git a/codebase_rag/tools/document_analyzer.py b/codebase_rag/tools/document_analyzer.py deleted file mode 100644 index 2a5475954..000000000 --- a/codebase_rag/tools/document_analyzer.py +++ /dev/null @@ -1,167 +0,0 @@ -from __future__ import annotations - -import mimetypes -import shutil -import uuid -from pathlib import Path -from typing import NoReturn - -from google import genai -from google.genai import types -from google.genai.errors import ClientError -from loguru import logger -from pydantic_ai import Tool - -from .. import constants as cs -from .. import exceptions as ex -from .. import logs as ls -from .. import tool_errors as te -from ..config import settings -from . 
import tool_descriptions as td - - -class _NotSupportedClient: - def __getattr__(self, name: str) -> NoReturn: - raise NotImplementedError(ex.DOC_UNSUPPORTED_PROVIDER) - - -class DocumentAnalyzer: - def __init__(self, project_root: str) -> None: - self.project_root = Path(project_root).resolve() - - orchestrator_config = settings.active_orchestrator_config - orchestrator_provider = orchestrator_config.provider - - if orchestrator_provider == cs.Provider.GOOGLE: - if orchestrator_config.provider_type == cs.GoogleProviderType.VERTEX: - self.client = genai.Client( - project=orchestrator_config.project_id, - location=orchestrator_config.region, - ) - else: - self.client = genai.Client(api_key=orchestrator_config.api_key) - else: - self.client = _NotSupportedClient() - - logger.info(ls.DOC_ANALYZER_INIT.format(root=self.project_root)) - - def _resolve_absolute_path(self, file_path: str) -> Path | str: - source_path = Path(file_path) - if not source_path.is_file(): - return te.DOC_FILE_NOT_FOUND.format(path=file_path) - - tmp_dir = self.project_root / cs.TMP_DIR - tmp_dir.mkdir(exist_ok=True) - - tmp_file = tmp_dir / f"{uuid.uuid4()}-{source_path.name}" - shutil.copy2(source_path, tmp_file) - logger.info(ls.DOC_COPIED.format(path=tmp_file)) - return tmp_file - - def _resolve_relative_path(self, file_path: str) -> Path | str: - full_path = (self.project_root / file_path).resolve() - try: - full_path.relative_to(self.project_root.resolve()) - except ValueError: - return te.DOC_SECURITY_RISK.format(path=file_path) - - if not str(full_path).startswith(str(self.project_root.resolve())): - return te.DOC_SECURITY_RISK.format(path=file_path) - - return full_path - - def _resolve_file_path(self, file_path: str) -> Path | str: - if Path(file_path).is_absolute(): - return self._resolve_absolute_path(file_path) - return self._resolve_relative_path(file_path) - - def _extract_response_text(self, response: types.GenerateContentResponse) -> str: - if hasattr(response, "text") and 
response.text: - return str(response.text) - - if hasattr(response, "candidates") and response.candidates: - for candidate in response.candidates: - if hasattr(candidate, "content") and candidate.content: - parts = candidate.content.parts - if parts and hasattr(parts[0], "text"): - return str(parts[0].text) - return cs.MSG_DOC_NO_CANDIDATES - - logger.warning(ls.DOC_NO_TEXT.format(response=response)) - return cs.MSG_DOC_NO_CONTENT - - def _handle_analyze_error(self, error: Exception, file_path: str) -> str: - if isinstance(error, ValueError): - if "does not start with" in str(error): - err_msg = te.DOC_ACCESS_OUTSIDE_ROOT.format(path=file_path) - logger.error(err_msg) - return err_msg - logger.error(ls.DOC_ANALYZER_API_ERR.format(error=error)) - return te.DOC_API_VALIDATION.format(error=error) - - if isinstance(error, ClientError): - logger.error(ls.DOC_API_ERROR.format(path=file_path, error=error)) - if "Unable to process input image" in str(error): - return te.DOC_IMAGE_PROCESS - return te.DOC_API_ERROR.format(error=error) - - logger.exception(ls.DOC_FAILED.format(path=file_path, error=error)) - return te.DOC_ANALYSIS_FAILED.format(error=error) - - def analyze(self, file_path: str, question: str) -> str: - logger.info(ls.TOOL_DOC_ANALYZE.format(path=file_path, question=question)) - if isinstance(self.client, _NotSupportedClient): - return te.DOCUMENT_UNSUPPORTED - - try: - resolved = self._resolve_file_path(file_path) - if isinstance(resolved, str): - return resolved - full_path = resolved - - if not full_path.is_file(): - return te.DOC_FILE_NOT_FOUND.format(path=file_path) - - mime_type, _ = mimetypes.guess_type(full_path) - if not mime_type: - mime_type = cs.MIME_TYPE_DEFAULT - - file_bytes = full_path.read_bytes() - - prompt_parts = [ - types.Part.from_bytes(data=file_bytes, mime_type=mime_type), - cs.DOC_PROMPT_PREFIX.format(question=question), - ] - - orchestrator_config = settings.active_orchestrator_config - response = self.client.models.generate_content( 
- model=orchestrator_config.model_id, contents=prompt_parts - ) - - logger.success(ls.DOC_SUCCESS.format(path=file_path)) - return self._extract_response_text(response) - - except Exception as e: - return self._handle_analyze_error(e, file_path) - - -def create_document_analyzer_tool(analyzer: DocumentAnalyzer) -> Tool: - def analyze_document(file_path: str, question: str) -> str: - try: - result = analyzer.analyze(file_path, question) - preview = result[:100] if result else "None" - logger.debug( - ls.DOC_RESULT.format(type=type(result).__name__, preview=preview) - ) - return result - except Exception as e: - logger.exception(ls.DOC_EXCEPTION.format(error=e)) - if str(e).startswith("Error:") or str(e).startswith("API error:"): - return str(e) - return te.DOC_DURING_ANALYSIS.format(error=e) - - return Tool( - function=analyze_document, - name=td.AgenticToolName.ANALYZE_DOCUMENT, - description=td.ANALYZE_DOCUMENT, - ) diff --git a/codebase_rag/tools/file_editor.py b/codebase_rag/tools/file_editor.py index 650da823e..bc79ce8e0 100644 --- a/codebase_rag/tools/file_editor.py +++ b/codebase_rag/tools/file_editor.py @@ -20,6 +20,8 @@ class FileEditor: + __slots__ = ("project_root", "dmp", "parsers") + def __init__(self, project_root: str = ".") -> None: self.project_root = Path(project_root).resolve() self.dmp = diff_match_patch.diff_match_patch() @@ -218,7 +220,7 @@ def replace_code_block( if target_block not in original_content: logger.error(ls.EDITOR_BLOCK_NOT_FOUND.format(path=file_path)) - logger.debug(ls.EDITOR_LOOKING_FOR.format(block=repr(target_block))) + logger.debug(ls.EDITOR_LOOKING_FOR, block=repr(target_block)) return False modified_content = original_content.replace( diff --git a/codebase_rag/tools/file_reader.py b/codebase_rag/tools/file_reader.py index 1b5f8618b..ae471ee93 100644 --- a/codebase_rag/tools/file_reader.py +++ b/codebase_rag/tools/file_reader.py @@ -14,6 +14,8 @@ class FileReader: + __slots__ = ("project_root",) + def __init__(self, 
project_root: str = "."): self.project_root = Path(project_root).resolve() logger.info(ls.FILE_READER_INIT.format(root=self.project_root)) diff --git a/codebase_rag/tools/file_writer.py b/codebase_rag/tools/file_writer.py index 4f3110b3b..ca709778a 100644 --- a/codebase_rag/tools/file_writer.py +++ b/codebase_rag/tools/file_writer.py @@ -14,6 +14,8 @@ class FileWriter: + __slots__ = ("project_root",) + def __init__(self, project_root: str = "."): self.project_root = Path(project_root).resolve() logger.info(ls.FILE_WRITER_INIT.format(root=self.project_root)) diff --git a/codebase_rag/tools/health_checker.py b/codebase_rag/tools/health_checker.py index 2b94f2c6f..36640b5e1 100644 --- a/codebase_rag/tools/health_checker.py +++ b/codebase_rag/tools/health_checker.py @@ -12,6 +12,8 @@ class HealthChecker: + __slots__ = ("results",) + def __init__(self): self.results: list[HealthCheckResult] = [] diff --git a/codebase_rag/tools/semantic_search.py b/codebase_rag/tools/semantic_search.py index e7aa9c5b2..d647ce20e 100644 --- a/codebase_rag/tools/semantic_search.py +++ b/codebase_rag/tools/semantic_search.py @@ -139,7 +139,11 @@ async def semantic_search_functions(query: str, top_k: int = 5) -> str: return response - return Tool(semantic_search_functions, name=td.AgenticToolName.SEMANTIC_SEARCH) + return Tool( + semantic_search_functions, + name=td.AgenticToolName.SEMANTIC_SEARCH, + description=td.SEMANTIC_SEARCH, + ) def create_get_function_source_tool() -> Tool: @@ -153,4 +157,8 @@ async def get_function_source_by_id(node_id: int) -> str: return cs.MSG_SEMANTIC_SOURCE_FORMAT.format(id=node_id, code=source_code) - return Tool(get_function_source_by_id, name=td.AgenticToolName.GET_FUNCTION_SOURCE) + return Tool( + get_function_source_by_id, + name=td.AgenticToolName.GET_FUNCTION_SOURCE, + description=td.GET_FUNCTION_SOURCE, + ) diff --git a/codebase_rag/tools/shell_command.py b/codebase_rag/tools/shell_command.py index 2a4d3aff0..45021bf96 100644 --- 
a/codebase_rag/tools/shell_command.py +++ b/codebase_rag/tools/shell_command.py @@ -7,6 +7,7 @@ import shutil import sys import time +from collections.abc import Callable from pathlib import Path from loguru import logger @@ -58,6 +59,8 @@ def _has_subshell(command: str) -> str | None: class CommandGroup: + __slots__ = ("commands", "operator") + def __init__(self, commands: list[str], operator: str | None = None): self.commands = commands self.operator = operator @@ -152,12 +155,12 @@ def _is_dangerous_rm_path(cmd_parts: list[str], project_root: Path) -> tuple[boo resolved_str = str(resolved) if resolved == resolved.parent: return True, "rm targeting root directory" - parts = resolved.parts - if len(parts) >= 2 and parts[1] in cs.SHELL_SYSTEM_DIRECTORIES: - return True, f"rm targeting system directory: {resolved_str}" try: resolved.relative_to(project_root) except ValueError: + parts = resolved.parts + if len(parts) >= 2 and parts[1] in cs.SHELL_SYSTEM_DIRECTORIES: + return True, f"rm targeting system directory: {resolved_str}" return True, f"rm targeting path outside project: {resolved_str}" return False, "" @@ -194,7 +197,9 @@ def _is_dangerous_command(cmd_parts: list[str], full_segment: str) -> tuple[bool return False, "" -def _validate_segment(segment: str, available_commands: str) -> str | None: +def _validate_segment( + segment: str, available_commands: str, bypass_allowlist: bool = False +) -> str | None: try: cmd_parts = shlex.split(segment) except ValueError: @@ -205,7 +210,7 @@ def _validate_segment(segment: str, available_commands: str) -> str | None: base_cmd = cmd_parts[0] - if base_cmd not in settings.SHELL_COMMAND_ALLOWLIST: + if not bypass_allowlist and base_cmd not in settings.SHELL_COMMAND_ALLOWLIST: suggestion = cs.GREP_SUGGESTION if base_cmd == cs.SHELL_CMD_GREP else "" return te.COMMAND_NOT_ALLOWED.format( cmd=base_cmd, suggestion=suggestion, available=available_commands @@ -263,9 +268,17 @@ def _requires_approval(command: str) -> bool: class 
ShellCommander: - def __init__(self, project_root: str = ".", timeout: int = 30): + __slots__ = ("project_root", "timeout", "is_yolo") + + def __init__( + self, + project_root: str = ".", + timeout: int = 30, + is_yolo: Callable[[], bool] | None = None, + ): self.project_root = Path(project_root).resolve() self.timeout = timeout + self.is_yolo = is_yolo or (lambda: False) logger.info(ls.SHELL_COMMANDER_INIT.format(root=self.project_root)) async def _execute_pipeline(self, segments: list[str]) -> tuple[int, bytes, bytes]: @@ -352,9 +365,12 @@ async def execute(self, command: str) -> ShellCommandResult: ) available_commands = ", ".join(sorted(settings.SHELL_COMMAND_ALLOWLIST)) + bypass_allowlist = self.is_yolo() for group in groups: for segment in group.commands: - if err_msg := _validate_segment(segment, available_commands): + if err_msg := _validate_segment( + segment, available_commands, bypass_allowlist=bypass_allowlist + ): logger.error(err_msg) return ShellCommandResult( return_code=cs.SHELL_RETURN_CODE_ERROR, @@ -437,7 +453,11 @@ def create_shell_command_tool(shell_commander: ShellCommander) -> Tool: async def run_shell_command( ctx: RunContext[None], command: str ) -> ShellCommandResult: - if _requires_approval(command) and not ctx.tool_call_approved: + if ( + not shell_commander.is_yolo() + and _requires_approval(command) + and not ctx.tool_call_approved + ): raise ApprovalRequired(metadata={"command": command}) return await shell_commander.execute(command) diff --git a/codebase_rag/tools/tool_descriptions.py b/codebase_rag/tools/tool_descriptions.py index 008c60bef..df1d99812 100644 --- a/codebase_rag/tools/tool_descriptions.py +++ b/codebase_rag/tools/tool_descriptions.py @@ -11,17 +11,12 @@ class AgenticToolName(StrEnum): CREATE_FILE = "create_file" REPLACE_CODE = "replace_code" LIST_DIRECTORY = "list_directory" - ANALYZE_DOCUMENT = "analyze_document" EXECUTE_SHELL = "execute_shell" SEMANTIC_SEARCH = "semantic_search" GET_FUNCTION_SOURCE = 
"get_function_source" GET_CODE_SNIPPET = "get_code_snippet" -ANALYZE_DOCUMENT = ( - "Analyzes documents (PDFs, images) to answer questions about their content." -) - CODEBASE_QUERY = ( "Query the codebase knowledge graph using natural language questions. " "Ask in plain English about classes, functions, methods, dependencies, or code structure. " @@ -60,7 +55,7 @@ class AgenticToolName(StrEnum): FILE_READER = ( "Reads the content of text-based files. " - "For documents like PDFs or images, use the 'analyze_document' tool instead." + "Images and PDFs the user references are attached inline; read them directly." ) FILE_EDITOR = ( @@ -88,13 +83,19 @@ class AgenticToolName(StrEnum): ) MCP_INDEX_REPOSITORY = ( + "WARNING: Clears all data for the current project including its embeddings. " "Parse and ingest the repository into the Memgraph knowledge graph. " - "This builds a comprehensive graph of functions, classes, dependencies, and relationships. " - "Note: This preserves other projects - only the current project is re-indexed." + "Use update_repository for incremental updates. Only use when explicitly requested." +) + +MCP_UPDATE_REPOSITORY = ( + "Update the repository in the Memgraph knowledge graph without clearing existing data. " + "Use this for incremental updates." ) MCP_QUERY_CODE_GRAPH = ( "Query the codebase knowledge graph using natural language. " + "Use semantic_search unless you know the exact names of classes/functions you are searching for. " "Ask questions like 'What functions call UserService.create_user?' or " "'Show me all classes that implement the Repository interface'." ) @@ -117,6 +118,12 @@ class AgenticToolName(StrEnum): MCP_LIST_DIRECTORY = "List contents of a directory in the project." +MCP_SEMANTIC_SEARCH = ( + "Performs a semantic search for functions based on a natural language query " + "describing their purpose, returning a list of potential matches with similarity scores. " + "Requires the 'semantic' extra to be installed." 
+) + MCP_PARAM_PROJECT_NAME = "Name of the project to delete (e.g., 'my-project')" MCP_PARAM_CONFIRM = "Must be true to confirm the wipe operation" MCP_PARAM_NATURAL_LANGUAGE_QUERY = "Your question in plain English about the codebase" @@ -130,6 +137,16 @@ class AgenticToolName(StrEnum): MCP_PARAM_LIMIT = "Maximum number of lines to read (optional)" MCP_PARAM_CONTENT = "Content to write to the file" MCP_PARAM_DIRECTORY_PATH = "Relative path to directory from project root (default: '.')" +MCP_PARAM_TOP_K = "Max number of results to return (optional, default: 5)" +MCP_PARAM_QUESTION = ( + "A question about the codebase, architecture, functionality, or code relationships" +) + +MCP_ASK_AGENT = ( + "Ask the Code Graph RAG agent a question about the codebase. " + "Uses the full RAG pipeline to analyze the code graph and provide a detailed answer. " + "Use this for general questions about architecture, functionality, and code relationships." +) MCP_TOOLS: dict[MCPToolName, str] = { @@ -137,12 +154,15 @@ class AgenticToolName(StrEnum): MCPToolName.DELETE_PROJECT: MCP_DELETE_PROJECT, MCPToolName.WIPE_DATABASE: MCP_WIPE_DATABASE, MCPToolName.INDEX_REPOSITORY: MCP_INDEX_REPOSITORY, + MCPToolName.UPDATE_REPOSITORY: MCP_UPDATE_REPOSITORY, MCPToolName.QUERY_CODE_GRAPH: MCP_QUERY_CODE_GRAPH, MCPToolName.GET_CODE_SNIPPET: MCP_GET_CODE_SNIPPET, MCPToolName.SURGICAL_REPLACE_CODE: MCP_SURGICAL_REPLACE_CODE, MCPToolName.READ_FILE: MCP_READ_FILE, MCPToolName.WRITE_FILE: MCP_WRITE_FILE, MCPToolName.LIST_DIRECTORY: MCP_LIST_DIRECTORY, + MCPToolName.SEMANTIC_SEARCH: MCP_SEMANTIC_SEARCH, + MCPToolName.ASK_AGENT: MCP_ASK_AGENT, } AGENTIC_TOOLS: dict[AgenticToolName, str] = { @@ -151,7 +171,6 @@ class AgenticToolName(StrEnum): AgenticToolName.CREATE_FILE: FILE_WRITER, AgenticToolName.REPLACE_CODE: FILE_EDITOR, AgenticToolName.LIST_DIRECTORY: DIRECTORY_LISTER, - AgenticToolName.ANALYZE_DOCUMENT: ANALYZE_DOCUMENT, AgenticToolName.EXECUTE_SHELL: SHELL_COMMAND, AgenticToolName.SEMANTIC_SEARCH: 
SEMANTIC_SEARCH, AgenticToolName.GET_FUNCTION_SOURCE: GET_FUNCTION_SOURCE, diff --git a/codebase_rag/types_defs.py b/codebase_rag/types_defs.py index fb293147b..07eab14ca 100644 --- a/codebase_rag/types_defs.py +++ b/codebase_rag/types_defs.py @@ -95,6 +95,30 @@ def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]: def find_ending_with(self, suffix: str) -> list[QualifiedName]: ... + def register_unique_qn( + self, natural_qn: QualifiedName, start_line: int + ) -> QualifiedName: ... + + def variants(self, qualified_name: QualifiedName) -> list[QualifiedName]: ... + + def mark_property(self, qualified_name: QualifiedName) -> None: ... + + def is_property(self, qualified_name: QualifiedName) -> bool: ... + + def property_names(self) -> set[str]: ... + + def mark_abstract(self, qualified_name: QualifiedName) -> None: ... + + def is_abstract(self, qualified_name: QualifiedName) -> bool: ... + + def mark_callable_params( + self, qualified_name: QualifiedName, params: dict[str, int] + ) -> None: ... + + def callable_params( + self, qualified_name: QualifiedName + ) -> dict[str, int] | None: ... + class ASTCacheProtocol(Protocol): def __setitem__(self, key: Path, value: tuple[Node, SupportedLanguage]) -> None: ... @@ -186,6 +210,11 @@ class GraphSummary(TypedDict): metadata: GraphMetadata +class QueryJsonOutput(TypedDict): + query: str + response: str + + class EmbeddingQueryResult(TypedDict): node_id: int qualified_name: str @@ -256,7 +285,13 @@ class AgentLoopUI(NamedTuple): panel_title: str -ORANGE_STYLE = Style.from_dict({"": "#ff8c00"}) +ORANGE_STYLE = Style.from_dict( + { + "": "#ff8c00", + "bottom-toolbar": "noreverse fg:#888888", + "bottom-toolbar.text": "noreverse fg:#888888", + } +) OPTIMIZATION_LOOP_UI = AgentLoopUI( status_message="[bold green]Agent is analyzing codebase... 
(Press Ctrl+C to cancel)[/bold green]", @@ -285,7 +320,6 @@ class LanguageImport(NamedTuple): class ToolNames(NamedTuple): query_graph: str read_file: str - analyze_document: str semantic_search: str create_file: str edit_file: str @@ -350,7 +384,7 @@ class FunctionNodeProps(TypedDict, total=False): class MCPInputSchemaProperty(TypedDict, total=False): type: str description: str - default: str + default: str | int MCPInputSchemaProperties = dict[str, MCPInputSchemaProperty] @@ -387,6 +421,14 @@ class CodeSnippetResultDict(TypedDict, total=False): error: str +class DeadCodeRow(TypedDict): + label: str + name: str + qualified_name: str + start_line: int + end_line: int + + class ListProjectsSuccessResult(TypedDict): projects: list[str] count: int @@ -439,36 +481,47 @@ class RelationshipSchema(NamedTuple): NODE_SCHEMAS: tuple[NodeSchema, ...] = ( NodeSchema(NodeLabel.PROJECT, "{name: string}"), NodeSchema( - NodeLabel.PACKAGE, "{qualified_name: string, name: string, path: string}" + NodeLabel.PACKAGE, + "{qualified_name: string, name: string, path: string, absolute_path: string}", + ), + NodeSchema(NodeLabel.FOLDER, "{path: string, name: string, absolute_path: string}"), + NodeSchema( + NodeLabel.FILE, + "{path: string, name: string, extension: string, absolute_path: string}", ), - NodeSchema(NodeLabel.FOLDER, "{path: string, name: string}"), - NodeSchema(NodeLabel.FILE, "{path: string, name: string, extension: string}"), NodeSchema( - NodeLabel.MODULE, "{qualified_name: string, name: string, path: string}" + NodeLabel.MODULE, + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), NodeSchema( NodeLabel.CLASS, - "{qualified_name: string, name: string, decorators: list[string]}", + "{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}", ), NodeSchema( NodeLabel.FUNCTION, - "{qualified_name: string, name: string, decorators: list[string]}", + "{qualified_name: string, name: string, decorators: 
list[string], path: string, absolute_path: string}", ), NodeSchema( NodeLabel.METHOD, - "{qualified_name: string, name: string, decorators: list[string]}", + "{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}", + ), + NodeSchema( + NodeLabel.INTERFACE, + "{qualified_name: string, name: string, path: string, absolute_path: string}", + ), + NodeSchema( + NodeLabel.ENUM, + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), - NodeSchema(NodeLabel.INTERFACE, "{qualified_name: string, name: string}"), - NodeSchema(NodeLabel.ENUM, "{qualified_name: string, name: string}"), NodeSchema(NodeLabel.TYPE, "{qualified_name: string, name: string}"), NodeSchema(NodeLabel.UNION, "{qualified_name: string, name: string}"), NodeSchema( NodeLabel.MODULE_INTERFACE, - "{qualified_name: string, name: string, path: string}", + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), NodeSchema( NodeLabel.MODULE_IMPLEMENTATION, - "{qualified_name: string, name: string, path: string, implements_module: string}", + "{qualified_name: string, name: string, path: string, absolute_path: string, implements_module: string}", ), NodeSchema(NodeLabel.EXTERNAL_PACKAGE, "{name: string, version_spec: string}"), ) @@ -555,4 +608,9 @@ class RelationshipSchema(NamedTuple): RelationshipType.CALLS, (NodeLabel.FUNCTION, NodeLabel.METHOD), ), + RelationshipSchema( + (NodeLabel.MODULE, NodeLabel.FUNCTION, NodeLabel.METHOD), + RelationshipType.INSTANTIATES, + (NodeLabel.CLASS,), + ), ) diff --git a/codebase_rag/unixcoder.py b/codebase_rag/unixcoder.py index 6738fb677..e0d235c85 100644 --- a/codebase_rag/unixcoder.py +++ b/codebase_rag/unixcoder.py @@ -98,9 +98,8 @@ def forward(self, source_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor] pad_id = self.config.pad_token_id assert pad_id is not None mask = source_ids.ne(pad_id) - token_embeddings = self.model( - source_ids, 
attention_mask=mask.unsqueeze(1) * mask.unsqueeze(2) - )[0] + attention_mask = (mask.unsqueeze(1) * mask.unsqueeze(2)).unsqueeze(1) + token_embeddings = self.model(source_ids, attention_mask=attention_mask)[0] sentence_embeddings = (token_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum( -1 ).unsqueeze(-1) @@ -190,6 +189,17 @@ def generate( class Beam: + __slots__ = ( + "_eos", + "device", + "eosTop", + "finished", + "nextYs", + "prevKs", + "scores", + "size", + ) + def __init__(self, size: int, eos: int, device: torch.device) -> None: self.size = size self.device = device diff --git a/codebase_rag/utils/fqn_resolver.py b/codebase_rag/utils/fqn_resolver.py index 470c6cc8f..ba3fe9dcd 100644 --- a/codebase_rag/utils/fqn_resolver.py +++ b/codebase_rag/utils/fqn_resolver.py @@ -40,7 +40,7 @@ def resolve_fqn_from_ast( return SEPARATOR_DOT.join(full_parts) except Exception as e: - logger.debug(ls.FQN_RESOLVE_FAILED.format(path=file_path, error=e)) + logger.debug(ls.FQN_RESOLVE_FAILED, path=file_path, error=e) return None @@ -73,7 +73,7 @@ def walk(node: Node) -> str | None: return walk(root_node) except Exception as e: - logger.debug(ls.FQN_FIND_FAILED.format(fqn=target_fqn, path=file_path, error=e)) + logger.debug(ls.FQN_FIND_FAILED, fqn=target_fqn, path=file_path, error=e) return None @@ -102,6 +102,6 @@ def walk(node: Node) -> None: walk(root_node) except Exception as e: - logger.debug(ls.FQN_EXTRACT_FAILED.format(path=file_path, error=e)) + logger.debug(ls.FQN_EXTRACT_FAILED, path=file_path, error=e) return functions diff --git a/codebase_rag/utils/path_utils.py b/codebase_rag/utils/path_utils.py index 5c9bbf5b5..fc5a4258d 100644 --- a/codebase_rag/utils/path_utils.py +++ b/codebase_rag/utils/path_utils.py @@ -1,19 +1,79 @@ +import hashlib +import re +from functools import lru_cache from pathlib import Path from .. 
import constants as cs +_PROJECT_NAME_INVALID_CHARS = re.compile(r"[^A-Za-z0-9_-]+") +_PROJECT_NAME_DIGEST_LEN = 8 +_PROJECT_NAME_FALLBACK_BASE = "repo" + + +def derive_project_name(repo_path: Path) -> str: + resolved = repo_path.resolve() + digest = hashlib.sha256(str(resolved).encode("utf-8")).hexdigest()[ + :_PROJECT_NAME_DIGEST_LEN + ] + base = _PROJECT_NAME_INVALID_CHARS.sub("_", resolved.name).strip("_") + if not base: + base = _PROJECT_NAME_FALLBACK_BASE + return f"{base}__{digest}" + + +def resolve_repo_path(repo_path: str | None, target_default: str) -> Path: + if repo_path: + return Path(repo_path).resolve() + if target_default and target_default != ".": + return Path(target_default).resolve() + return Path.cwd().resolve() + + +@lru_cache(maxsize=4096) +def cached_relative_path(file_path: Path, repo_path: Path) -> Path: + return file_path.relative_to(repo_path) + + +@lru_cache(maxsize=4096) +def cached_resolve_posix(file_path: Path) -> str: + return file_path.resolve().as_posix() + def should_skip_path( path: Path, repo_path: Path, exclude_paths: frozenset[str] | None = None, unignore_paths: frozenset[str] | None = None, + is_file: bool | None = None, ) -> bool: - if path.is_file() and path.suffix in cs.IGNORE_SUFFIXES: + _is_file = path.is_file() if is_file is None else is_file + if _is_file and path.suffix in cs.IGNORE_SUFFIXES: return True - rel_path = path.relative_to(repo_path) + rel_path = cached_relative_path(path, repo_path) rel_path_str = rel_path.as_posix() - dir_parts = rel_path.parent.parts if path.is_file() else rel_path.parts + dir_parts = rel_path.parent.parts if _is_file else rel_path.parts + if exclude_paths and ( + not exclude_paths.isdisjoint(dir_parts) + or rel_path_str in exclude_paths + or any(rel_path_str.startswith(f"{p}/") for p in exclude_paths) + ): + return True + if unignore_paths and any( + rel_path_str == p or rel_path_str.startswith(f"{p}/") for p in unignore_paths + ): + return False + return not 
# Heading element that renders left-justified instead of using the base
# Heading's own rendering.
class LeftAlignedHeading(Heading):
    def __rich_console__(
        self, console: Console, options: ConsoleOptions
    ) -> RenderResult:
        """Yield the renderables for this heading, left-justified.

        ``h1`` is wrapped in a heavy-box panel; ``h2`` is preceded by an empty
        text line; every other level is emitted as the bare text.
        """
        heading = self.text
        heading.justify = "left"
        if self.tag == "h1":
            yield Panel(heading, box=box.HEAVY, style="markdown.h1.border")
            return
        if self.tag == "h2":
            # Blank line that separates an h2 from what precedes it.
            yield Text("")
        yield heading


# Markdown renderer identical to the stock one except that "heading_open"
# tokens are rendered by LeftAlignedHeading.
class LeftAlignedMarkdown(Markdown):
    elements: ClassVar[dict[str, type[MarkdownElement]]] = {
        **Markdown.elements,
        "heading_open": LeftAlignedHeading,
    }
text.splitlines(keepends=True) + + if not lines: + return None + + if start_line > len(lines) or end_line > len(lines): + logger.warning( + ls.SOURCE_RANGE_EXCEEDS.format( + start=start_line, + end=end_line, + length=len(lines), + path=file_path, ) + ) + end_line = min(end_line, len(lines)) + if start_line > len(lines): return None - extracted_lines = lines[start_line - 1 : end_line] - return "".join(extracted_lines).strip() + extracted_lines = lines[start_line - 1 : end_line] + return "".join(extracted_lines).strip() except Exception as e: logger.warning(ls.SOURCE_EXTRACT_FAILED.format(path=file_path, error=e)) @@ -56,7 +62,7 @@ def extract_source_with_fallback( if ast_result := ast_extractor(qualified_name, file_path): return str(ast_result) except Exception as e: - logger.debug(ls.SOURCE_AST_FAILED.format(name=qualified_name, error=e)) + logger.debug(ls.SOURCE_AST_FAILED, name=qualified_name, error=e) return extract_source_lines(file_path, start_line, end_line, encoding) diff --git a/codebase_rag/utils/token_utils.py b/codebase_rag/utils/token_utils.py new file mode 100644 index 000000000..031262d06 --- /dev/null +++ b/codebase_rag/utils/token_utils.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import json +from functools import cache + +import tiktoken +from loguru import logger + +from .. import constants as cs +from .. 
@cache
def _get_encoding() -> tiktoken.Encoding:
    """Return the tiktoken encoding named by ``cs.TIKTOKEN_ENCODING``.

    Cached so the encoding is constructed once per process.
    """
    return tiktoken.get_encoding(cs.TIKTOKEN_ENCODING)


def count_tokens(text: str) -> int:
    """Return the number of tokens ``text`` encodes to."""
    # With its default arguments, ``encode`` raises ValueError when the text
    # contains a special-token marker such as "<|endoftext|>". Rows built from
    # source code can legitimately contain such markers, so treat them as
    # ordinary text instead of failing the whole count.
    return len(_get_encoding().encode(text, disallowed_special=()))


def truncate_results_by_tokens(
    results: list[ResultRow],
    max_tokens: int,
    original_total: int | None = None,
) -> tuple[list[ResultRow], int, bool]:
    """Keep the longest prefix of ``results`` that fits in ``max_tokens``.

    Each row is measured by the token count of its JSON serialization
    (non-JSON values are stringified via ``default=str``).

    Args:
        results: Rows to filter, in priority order.
        max_tokens: Token budget for the kept rows.
        original_total: Row count to report in the truncation log message;
            defaults to ``len(results)``.

    Returns:
        ``(kept_rows, tokens_used, truncated)``. The first row is always kept,
        even when it alone exceeds the budget, so a non-empty input never
        yields an empty result. For empty input the input list itself is
        returned with ``0`` and ``False``.
    """
    if not results:
        return results, 0, False

    kept: list[ResultRow] = []
    total_tokens = 0
    total_for_log = len(results) if original_total is None else original_total

    for row in results:
        row_tokens = count_tokens(json.dumps(row, default=str))

        # ``and kept`` is what guarantees the first row is never dropped.
        if total_tokens + row_tokens > max_tokens and kept:
            logger.warning(
                ls.QUERY_RESULTS_TRUNCATED.format(
                    kept=len(kept),
                    total=total_for_log,
                    tokens=total_tokens,
                    max_tokens=max_tokens,
                )
            )
            return kept, total_tokens, True

        kept.append(row)
        total_tokens += row_tokens

    return kept, total_tokens, False
def _upsert_with_retry(points: list[PointStruct]) -> None:
    """Upsert ``points`` into the configured collection, retrying on failure.

    Waits ``QDRANT_RETRY_BASE_DELAY * 2 ** (attempt - 1)`` seconds between
    attempts and logs a warning for every failed attempt that will be retried.

    Raises:
        Exception: whatever the final attempt raised, once all attempts are
            exhausted.
    """
    client = get_qdrant_client()
    # Always make at least one attempt. With a configured value of 0 or less
    # the loop below would not execute at all, and the function would return
    # normally as though the points had been stored.
    max_attempts = max(1, settings.QDRANT_UPSERT_RETRIES)
    base_delay = settings.QDRANT_RETRY_BASE_DELAY
    for attempt in range(1, max_attempts + 1):
        try:
            client.upsert(
                collection_name=settings.QDRANT_COLLECTION_NAME,
                points=points,
            )
            return
        except Exception as e:
            if attempt == max_attempts:
                raise
            delay = base_delay * (2 ** (attempt - 1))
            logger.warning(
                ls.EMBEDDING_STORE_RETRY.format(
                    attempt=attempt, max_attempts=max_attempts, delay=delay, error=e
                )
            )
            time.sleep(delay)
def delete_project_embeddings(project_name: str, node_ids: Sequence[int]) -> None:
    """Delete the points with the given ids from the configured collection.

    Best effort: any exception is logged as a warning and not re-raised.
    Does nothing when ``node_ids`` is empty.
    """
    if not node_ids:
        return
    try:
        start_message = ls.QDRANT_DELETE_PROJECT.format(
            count=len(node_ids), project=project_name
        )
        logger.info(start_message)
        qdrant = get_qdrant_client()
        qdrant.delete(
            collection_name=settings.QDRANT_COLLECTION_NAME,
            points_selector=list(node_ids),
        )
        done_message = ls.QDRANT_DELETE_PROJECT_DONE.format(project=project_name)
        logger.info(done_message)
    except Exception as e:
        failure_message = ls.QDRANT_DELETE_PROJECT_FAILED.format(
            project=project_name, error=e
        )
        logger.warning(failure_message)


def verify_stored_ids(expected_ids: set[int]) -> set[int]:
    """Return the subset of ``expected_ids`` that the collection contains.

    Ids are looked up in chunks of ``_RETRIEVE_BATCH_SIZE``; payloads and
    vectors are not fetched. Only integer point ids are reported.
    """
    if not expected_ids:
        return set()
    qdrant = get_qdrant_client()
    present: set[int] = set()
    pending = list(expected_ids)
    for offset in range(0, len(pending), _RETRIEVE_BATCH_SIZE):
        chunk = pending[offset : offset + _RETRIEVE_BATCH_SIZE]
        records = qdrant.retrieve(
            collection_name=settings.QDRANT_COLLECTION_NAME,
            ids=chunk,
            with_payload=False,
            with_vectors=False,
        )
        for record in records:
            if isinstance(record.id, int):
                present.add(record.id)
    return present
store_embedding_batch( + points: Sequence[tuple[int, list[float], str]], + ) -> int: + return 0 + + def delete_project_embeddings(project_name: str, node_ids: Sequence[int]) -> None: + pass + + def verify_stored_ids(expected_ids: set[int]) -> set[int]: + return set() + def search_embeddings( query_embedding: list[float], top_k: int | None = None ) -> list[tuple[int, float]]: diff --git a/codebase_rag/workspaces/__init__.py b/codebase_rag/workspaces/__init__.py new file mode 100644 index 000000000..e93eec119 --- /dev/null +++ b/codebase_rag/workspaces/__init__.py @@ -0,0 +1,28 @@ +from .models import WorkspaceConfig, WorkspaceRepo +from .storage import ( + WorkspaceError, + add_repo, + create_workspace, + delete_workspace, + list_workspaces, + load_workspace, + remove_repo, + save_workspace, + workspace_path, + workspaces_dir, +) + +__all__ = [ + "WorkspaceConfig", + "WorkspaceError", + "WorkspaceRepo", + "add_repo", + "create_workspace", + "delete_workspace", + "list_workspaces", + "load_workspace", + "remove_repo", + "save_workspace", + "workspace_path", + "workspaces_dir", +] diff --git a/codebase_rag/workspaces/cli.py b/codebase_rag/workspaces/cli.py new file mode 100644 index 000000000..1726744fb --- /dev/null +++ b/codebase_rag/workspaces/cli.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import sys + +import click +from loguru import logger + +from .. import cli_help as ch +from . import constants as wcs +from . 
@click.group(help=ch.CMD_WORKSPACE_GROUP)
def cli() -> None:
    # Group entry point only; the subcommands below do the work.
    return None


@cli.command("list", help=ch.CMD_WORKSPACE_LIST)
def list_cmd() -> None:
    # Print one workspace name per line, or a placeholder message when none exist.
    workspace_names = st.list_workspaces()
    if workspace_names:
        for workspace_name in workspace_names:
            click.echo(workspace_name)
    else:
        click.echo(ch.MSG_NO_WORKSPACES)


@cli.command("create", help=ch.CMD_WORKSPACE_CREATE)
@click.argument("name")
@click.option("--description", "-d", default="", help=ch.HELP_WORKSPACE_DESCRIPTION)
@click.option("--force", is_flag=True, help=ch.HELP_WORKSPACE_FORCE)
def create_cmd(name: str, description: str, force: bool) -> None:
    # Create the workspace file; --force allows replacing an existing one.
    try:
        _, saved_path = st.create_workspace(
            name, description=description, overwrite=force
        )
    except WorkspaceError as err:
        message = str(err)
        logger.error(message)
        click.secho(message, fg="red", err=True)
        sys.exit(1)
    else:
        click.echo(wcs.MSG_WORKSPACE_CREATED.format(name=name, path=saved_path))


@cli.command("delete", help=ch.CMD_WORKSPACE_DELETE)
@click.argument("name")
def delete_cmd(name: str) -> None:
    # Remove the workspace file and report where it was.
    try:
        removed_path = st.delete_workspace(name)
    except WorkspaceError as err:
        message = str(err)
        logger.error(message)
        click.secho(message, fg="red", err=True)
        sys.exit(1)
    else:
        click.echo(wcs.MSG_WORKSPACE_DELETED.format(name=name, path=removed_path))


@cli.command("show", help=ch.CMD_WORKSPACE_SHOW)
@click.argument("name")
def show_cmd(name: str) -> None:
    # Print the workspace name, optional description, and its repositories.
    try:
        workspace = st.load_workspace(name)
    except WorkspaceError as err:
        message = str(err)
        logger.error(message)
        click.secho(message, fg="red", err=True)
        sys.exit(1)
    else:
        click.echo(f"name: {workspace.name}")
        if workspace.description:
            click.echo(f"description: {workspace.description}")
        click.echo(f"repos: {len(workspace.repos)}")
        for entry in workspace.repos:
            click.echo(f"  - {entry.path} ({entry.project_name})")


@cli.command("add-repo", help=ch.CMD_WORKSPACE_ADD_REPO)
@click.argument("name")
@click.argument("repo_path")
@click.option(
    "--project-name", "-p", default=None, help=ch.HELP_WORKSPACE_REPO_PROJECT_NAME
)
def add_repo_cmd(name: str, repo_path: str, project_name: str | None) -> None:
    # Register a repository in the workspace, optionally under an explicit project name.
    try:
        _, added = st.add_repo(name, repo_path, project_name=project_name)
    except WorkspaceError as err:
        message = str(err)
        logger.error(message)
        click.secho(message, fg="red", err=True)
        sys.exit(1)
    else:
        click.echo(
            wcs.MSG_WORKSPACE_ADDED_REPO.format(
                path=added.path, project_name=added.project_name
            )
        )


@cli.command("remove-repo", help=ch.CMD_WORKSPACE_REMOVE_REPO)
@click.argument("name")
@click.argument("repo_path")
def remove_repo_cmd(name: str, repo_path: str) -> None:
    # Drop a repository from the workspace and report the removed path.
    try:
        _, removed = st.remove_repo(name, repo_path)
    except WorkspaceError as err:
        message = str(err)
        logger.error(message)
        click.secho(message, fg="red", err=True)
        sys.exit(1)
    else:
        click.echo(wcs.MSG_WORKSPACE_REMOVED_REPO.format(path=removed.path))
# One repository entry of a workspace: where it lives and the project name it
# is associated with.
class WorkspaceRepo(BaseModel):
    path: str
    project_name: str

    def repo_path(self) -> Path:
        """Return ``path`` with ``~`` expanded, resolved to an absolute path."""
        expanded = Path(self.path).expanduser()
        return expanded.resolve()


# A named workspace: an optional description plus its repository entries.
class WorkspaceConfig(BaseModel):
    name: str
    description: str = ""
    repos: list[WorkspaceRepo] = Field(default_factory=list)

    def project_names(self) -> list[str]:
        """Return the project name of every repository, in stored order."""
        return [entry.project_name for entry in self.repos]

    def find_repo(self, path: str) -> WorkspaceRepo | None:
        """Return the first repository whose resolved path equals ``path``.

        ``path`` is expanded and resolved the same way as each entry before
        comparison. Returns ``None`` when no entry matches.
        """
        wanted = Path(path).expanduser().resolve()
        matches = (entry for entry in self.repos if entry.repo_path() == wanted)
        return next(matches, None)
import constants as cs +from .models import WorkspaceConfig, WorkspaceRepo + + +class WorkspaceError(RuntimeError): + pass + + +def workspaces_dir(home: Path | None = None) -> Path: + base = (home or settings.CGR_HOME).expanduser() + return base / cs.WORKSPACES_SUBDIR + + +def workspace_path(name: str, home: Path | None = None) -> Path: + return workspaces_dir(home) / f"{name}{cs.WORKSPACE_EXTENSION}" + + +def list_workspaces(home: Path | None = None) -> list[str]: + root = workspaces_dir(home) + if not root.exists(): + return [] + return sorted(p.stem for p in root.glob(f"*{cs.WORKSPACE_EXTENSION}")) + + +def load_workspace(name: str, home: Path | None = None) -> WorkspaceConfig: + path = workspace_path(name, home) + if not path.exists(): + raise WorkspaceError(cs.ERR_WORKSPACE_NOT_FOUND.format(name=name, path=path)) + try: + with path.open("rb") as f: + data = tomllib.load(f) + except tomllib.TOMLDecodeError as e: + raise WorkspaceError( + cs.ERR_WORKSPACE_INVALID_TOML.format(name=name, error=e) + ) from e + body = data.get("workspace", data) + try: + return WorkspaceConfig.model_validate(body) + except ValidationError as e: + raise WorkspaceError( + cs.ERR_WORKSPACE_INVALID_SCHEMA.format(name=name, error=e) + ) from e + + +def save_workspace(config: WorkspaceConfig, home: Path | None = None) -> Path: + path = workspace_path(config.name, home) + path.parent.mkdir(parents=True, exist_ok=True) + payload = {"workspace": config.model_dump()} + with path.open("w", encoding="utf-8") as f: + toml.dump(payload, f) + return path + + +def create_workspace( + name: str, + description: str = "", + repos: list[WorkspaceRepo] | None = None, + home: Path | None = None, + overwrite: bool = False, +) -> tuple[WorkspaceConfig, Path]: + path = workspace_path(name, home) + if path.exists() and not overwrite: + raise WorkspaceError( + cs.ERR_WORKSPACE_ALREADY_EXISTS.format(name=name, path=path) + ) + config = WorkspaceConfig(name=name, description=description, repos=repos or []) + 
saved = save_workspace(config, home=home) + return config, saved + + +def delete_workspace(name: str, home: Path | None = None) -> Path: + path = workspace_path(name, home) + if not path.exists(): + raise WorkspaceError(cs.ERR_WORKSPACE_NOT_FOUND.format(name=name, path=path)) + path.unlink() + return path + + +def add_repo( + name: str, + repo_path: str, + project_name: str | None = None, + home: Path | None = None, +) -> tuple[WorkspaceConfig, WorkspaceRepo]: + resolved = Path(repo_path).expanduser().resolve() + if not resolved.exists(): + raise WorkspaceError(cs.ERR_WORKSPACE_REPO_PATH_MISSING.format(path=resolved)) + config = load_workspace(name, home=home) + if config.find_repo(str(resolved)) is not None: + raise WorkspaceError( + cs.ERR_WORKSPACE_REPO_DUPLICATE.format(path=resolved, name=name) + ) + repo = WorkspaceRepo( + path=str(resolved), + project_name=(project_name or derive_project_name(resolved)), + ) + config.repos.append(repo) + save_workspace(config, home=home) + return config, repo + + +def remove_repo( + name: str, repo_path: str, home: Path | None = None +) -> tuple[WorkspaceConfig, WorkspaceRepo]: + config = load_workspace(name, home=home) + found = config.find_repo(repo_path) + if found is None: + raise WorkspaceError( + cs.ERR_WORKSPACE_REPO_NOT_IN_WORKSPACE.format( + path=Path(repo_path).expanduser().resolve(), name=name + ) + ) + config.repos = [r for r in config.repos if r is not found] + save_workspace(config, home=home) + return config, found diff --git a/codec/schema.proto b/codec/schema.proto index fcd28e6c2..06832c97f 100644 --- a/codec/schema.proto +++ b/codec/schema.proto @@ -102,6 +102,10 @@ message GraphCodeIndex { ExternalPackage external_package = 9; ModuleImplementation module_implementation = 10; ModuleInterface module_interface = 11; + Interface interface_node = 12; + Enum enum_node = 13; + Type type_node = 14; + Union union_node = 15; } } @@ -123,6 +127,8 @@ message GraphCodeIndex { DEPENDS_ON_EXTERNAL = 11; IMPLEMENTS_MODULE 
= 12; IMPLEMENTS = 13; + EXPORTS = 14; + EXPORTS_MODULE = 15; } RelationshipType type = 1; @@ -232,3 +238,35 @@ message GraphCodeIndex { repeated string decorators = 6; bool is_exported = 7; } + + message Interface { + // Primary Key + string qualified_name = 1; + + string name = 2; + string path = 3; + string absolute_path = 4; + } + + message Enum { + // Primary Key + string qualified_name = 1; + + string name = 2; + string path = 3; + string absolute_path = 4; + } + + message Type { + // Primary Key + string qualified_name = 1; + + string name = 2; + } + + message Union { + // Primary Key + string qualified_name = 1; + + string name = 2; + } diff --git a/codec/schema_pb2.py b/codec/schema_pb2.py index 5dd666f71..fcae069dd 100644 --- a/codec/schema_pb2.py +++ b/codec/schema_pb2.py @@ -1,61 +1,62 @@ +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! -# NO CHECKED-IN PROTOBUF GENCODE # source: codec/schema.proto -# Protobuf Python Version: 6.33.1 """Generated protocol buffer code.""" - +from google.protobuf.internal import builder as _builder from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import runtime_version as _runtime_version -from google.protobuf import struct_pb2 as _struct_pb2 from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -_runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, 6, 33, 1, "", "codec/schema.proto" -) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x12\x63odec/schema.proto\x12\x0cgraphcode.v1\x1a\x1cgoogle/protobuf/struct.proto"f\n\x0eGraphCodeIndex\x12!\n\x05nodes\x18\x01 \x03(\x0b\x32\x12.graphcode.v1.Node\x12\x31\n\rrelationships\x18\x02 \x03(\x0b\x32\x1a.graphcode.v1.Relationship"\x93\x04\n\x04Node\x12(\n\x07project\x18\x01 
\x01(\x0b\x32\x15.graphcode.v1.ProjectH\x00\x12(\n\x07package\x18\x02 \x01(\x0b\x32\x15.graphcode.v1.PackageH\x00\x12&\n\x06\x66older\x18\x03 \x01(\x0b\x32\x14.graphcode.v1.FolderH\x00\x12&\n\x06module\x18\x04 \x01(\x0b\x32\x14.graphcode.v1.ModuleH\x00\x12)\n\nclass_node\x18\x05 \x01(\x0b\x32\x13.graphcode.v1.ClassH\x00\x12*\n\x08\x66unction\x18\x06 \x01(\x0b\x32\x16.graphcode.v1.FunctionH\x00\x12&\n\x06method\x18\x07 \x01(\x0b\x32\x14.graphcode.v1.MethodH\x00\x12"\n\x04\x66ile\x18\x08 \x01(\x0b\x32\x12.graphcode.v1.FileH\x00\x12\x39\n\x10\x65xternal_package\x18\t \x01(\x0b\x32\x1d.graphcode.v1.ExternalPackageH\x00\x12\x43\n\x15module_implementation\x18\n \x01(\x0b\x32".graphcode.v1.ModuleImplementationH\x00\x12\x39\n\x10module_interface\x18\x0b \x01(\x0b\x32\x1d.graphcode.v1.ModuleInterfaceH\x00\x42\t\n\x07payload"\xe9\x03\n\x0cRelationship\x12\x39\n\x04type\x18\x01 \x01(\x0e\x32+.graphcode.v1.Relationship.RelationshipType\x12\x11\n\tsource_id\x18\x02 \x01(\t\x12\x11\n\ttarget_id\x18\x03 \x01(\t\x12+\n\nproperties\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x14\n\x0csource_label\x18\x05 \x01(\t\x12\x14\n\x0ctarget_label\x18\x06 \x01(\t"\x9e\x02\n\x10RelationshipType\x12!\n\x1dRELATIONSHIP_TYPE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x43ONTAINS_PACKAGE\x10\x01\x12\x13\n\x0f\x43ONTAINS_FOLDER\x10\x02\x12\x11\n\rCONTAINS_FILE\x10\x03\x12\x13\n\x0f\x43ONTAINS_MODULE\x10\x04\x12\x0b\n\x07\x44\x45\x46INES\x10\x05\x12\x12\n\x0e\x44\x45\x46INES_METHOD\x10\x06\x12\x0b\n\x07IMPORTS\x10\x07\x12\x0c\n\x08INHERITS\x10\x08\x12\r\n\tOVERRIDES\x10\t\x12\t\n\x05\x43\x41LLS\x10\n\x12\x17\n\x13\x44\x45PENDS_ON_EXTERNAL\x10\x0b\x12\x15\n\x11IMPLEMENTS_MODULE\x10\x0c\x12\x0e\n\nIMPLEMENTS\x10\r"\x17\n\x07Project\x12\x0c\n\x04name\x18\x01 \x01(\t"=\n\x07Package\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t"$\n\x06\x46older\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 
\x01(\t"5\n\x04\x46ile\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\textension\x18\x03 \x01(\t"<\n\x06Module\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t"e\n\x14ModuleImplementation\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x19\n\x11implements_module\x18\x04 \x01(\t"E\n\x0fModuleInterface\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t"\x1f\n\x0f\x45xternalPackage\x12\x0c\n\x04name\x18\x01 \x01(\t"\x92\x01\n\x08\x46unction\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08"{\n\x06Method\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t"\x8f\x01\n\x05\x43lass\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08\x62\x06proto3' -) +from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12\x63odec/schema.proto\x12\x0cgraphcode.v1\x1a\x1cgoogle/protobuf/struct.proto\"f\n\x0eGraphCodeIndex\x12!\n\x05nodes\x18\x01 \x03(\x0b\x32\x12.graphcode.v1.Node\x12\x31\n\rrelationships\x18\x02 \x03(\x0b\x32\x1a.graphcode.v1.Relationship\"\xc3\x05\n\x04Node\x12(\n\x07project\x18\x01 
\x01(\x0b\x32\x15.graphcode.v1.ProjectH\x00\x12(\n\x07package\x18\x02 \x01(\x0b\x32\x15.graphcode.v1.PackageH\x00\x12&\n\x06\x66older\x18\x03 \x01(\x0b\x32\x14.graphcode.v1.FolderH\x00\x12&\n\x06module\x18\x04 \x01(\x0b\x32\x14.graphcode.v1.ModuleH\x00\x12)\n\nclass_node\x18\x05 \x01(\x0b\x32\x13.graphcode.v1.ClassH\x00\x12*\n\x08\x66unction\x18\x06 \x01(\x0b\x32\x16.graphcode.v1.FunctionH\x00\x12&\n\x06method\x18\x07 \x01(\x0b\x32\x14.graphcode.v1.MethodH\x00\x12\"\n\x04\x66ile\x18\x08 \x01(\x0b\x32\x12.graphcode.v1.FileH\x00\x12\x39\n\x10\x65xternal_package\x18\t \x01(\x0b\x32\x1d.graphcode.v1.ExternalPackageH\x00\x12\x43\n\x15module_implementation\x18\n \x01(\x0b\x32\".graphcode.v1.ModuleImplementationH\x00\x12\x39\n\x10module_interface\x18\x0b \x01(\x0b\x32\x1d.graphcode.v1.ModuleInterfaceH\x00\x12\x31\n\x0einterface_node\x18\x0c \x01(\x0b\x32\x17.graphcode.v1.InterfaceH\x00\x12\'\n\tenum_node\x18\r \x01(\x0b\x32\x12.graphcode.v1.EnumH\x00\x12\'\n\ttype_node\x18\x0e \x01(\x0b\x32\x12.graphcode.v1.TypeH\x00\x12)\n\nunion_node\x18\x0f \x01(\x0b\x32\x13.graphcode.v1.UnionH\x00\x42\t\n\x07payload\"\x8a\x04\n\x0cRelationship\x12\x39\n\x04type\x18\x01 \x01(\x0e\x32+.graphcode.v1.Relationship.RelationshipType\x12\x11\n\tsource_id\x18\x02 \x01(\t\x12\x11\n\ttarget_id\x18\x03 \x01(\t\x12+\n\nproperties\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x14\n\x0csource_label\x18\x05 \x01(\t\x12\x14\n\x0ctarget_label\x18\x06 
\x01(\t\"\xbf\x02\n\x10RelationshipType\x12!\n\x1dRELATIONSHIP_TYPE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x43ONTAINS_PACKAGE\x10\x01\x12\x13\n\x0f\x43ONTAINS_FOLDER\x10\x02\x12\x11\n\rCONTAINS_FILE\x10\x03\x12\x13\n\x0f\x43ONTAINS_MODULE\x10\x04\x12\x0b\n\x07\x44\x45\x46INES\x10\x05\x12\x12\n\x0e\x44\x45\x46INES_METHOD\x10\x06\x12\x0b\n\x07IMPORTS\x10\x07\x12\x0c\n\x08INHERITS\x10\x08\x12\r\n\tOVERRIDES\x10\t\x12\t\n\x05\x43\x41LLS\x10\n\x12\x17\n\x13\x44\x45PENDS_ON_EXTERNAL\x10\x0b\x12\x15\n\x11IMPLEMENTS_MODULE\x10\x0c\x12\x0e\n\nIMPLEMENTS\x10\r\x12\x0b\n\x07\x45XPORTS\x10\x0e\x12\x12\n\x0e\x45XPORTS_MODULE\x10\x0f\"\x17\n\x07Project\x12\x0c\n\x04name\x18\x01 \x01(\t\"=\n\x07Package\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\"$\n\x06\x46older\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"5\n\x04\x46ile\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\textension\x18\x03 \x01(\t\"<\n\x06Module\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\"e\n\x14ModuleImplementation\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x19\n\x11implements_module\x18\x04 \x01(\t\"E\n\x0fModuleInterface\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\"\x1f\n\x0f\x45xternalPackage\x12\x0c\n\x04name\x18\x01 \x01(\t\"\x92\x01\n\x08\x46unction\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08\"{\n\x06Method\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 
\x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\"\x8f\x01\n\x05\x43lass\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08\"V\n\tInterface\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x15\n\rabsolute_path\x18\x04 \x01(\t\"Q\n\x04\x45num\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x15\n\rabsolute_path\x18\x04 \x01(\t\",\n\x04Type\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"-\n\x05Union\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\tb\x06proto3') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'codec.schema_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "codec.schema_pb2", _globals) -if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals["_GRAPHCODEINDEX"]._serialized_start = 66 - _globals["_GRAPHCODEINDEX"]._serialized_end = 168 - _globals["_NODE"]._serialized_start = 171 - _globals["_NODE"]._serialized_end = 702 - _globals["_RELATIONSHIP"]._serialized_start = 705 - _globals["_RELATIONSHIP"]._serialized_end = 1194 - _globals["_RELATIONSHIP_RELATIONSHIPTYPE"]._serialized_start = 908 - _globals["_RELATIONSHIP_RELATIONSHIPTYPE"]._serialized_end = 1194 - _globals["_PROJECT"]._serialized_start = 1196 - _globals["_PROJECT"]._serialized_end = 1219 - _globals["_PACKAGE"]._serialized_start = 1221 - _globals["_PACKAGE"]._serialized_end = 1282 - 
_globals["_FOLDER"]._serialized_start = 1284 - _globals["_FOLDER"]._serialized_end = 1320 - _globals["_FILE"]._serialized_start = 1322 - _globals["_FILE"]._serialized_end = 1375 - _globals["_MODULE"]._serialized_start = 1377 - _globals["_MODULE"]._serialized_end = 1437 - _globals["_MODULEIMPLEMENTATION"]._serialized_start = 1439 - _globals["_MODULEIMPLEMENTATION"]._serialized_end = 1540 - _globals["_MODULEINTERFACE"]._serialized_start = 1542 - _globals["_MODULEINTERFACE"]._serialized_end = 1611 - _globals["_EXTERNALPACKAGE"]._serialized_start = 1613 - _globals["_EXTERNALPACKAGE"]._serialized_end = 1644 - _globals["_FUNCTION"]._serialized_start = 1647 - _globals["_FUNCTION"]._serialized_end = 1793 - _globals["_METHOD"]._serialized_start = 1795 - _globals["_METHOD"]._serialized_end = 1918 - _globals["_CLASS"]._serialized_start = 1921 - _globals["_CLASS"]._serialized_end = 2064 + DESCRIPTOR._options = None + _GRAPHCODEINDEX._serialized_start=66 + _GRAPHCODEINDEX._serialized_end=168 + _NODE._serialized_start=171 + _NODE._serialized_end=878 + _RELATIONSHIP._serialized_start=881 + _RELATIONSHIP._serialized_end=1403 + _RELATIONSHIP_RELATIONSHIPTYPE._serialized_start=1084 + _RELATIONSHIP_RELATIONSHIPTYPE._serialized_end=1403 + _PROJECT._serialized_start=1405 + _PROJECT._serialized_end=1428 + _PACKAGE._serialized_start=1430 + _PACKAGE._serialized_end=1491 + _FOLDER._serialized_start=1493 + _FOLDER._serialized_end=1529 + _FILE._serialized_start=1531 + _FILE._serialized_end=1584 + _MODULE._serialized_start=1586 + _MODULE._serialized_end=1646 + _MODULEIMPLEMENTATION._serialized_start=1648 + _MODULEIMPLEMENTATION._serialized_end=1749 + _MODULEINTERFACE._serialized_start=1751 + _MODULEINTERFACE._serialized_end=1820 + _EXTERNALPACKAGE._serialized_start=1822 + _EXTERNALPACKAGE._serialized_end=1853 + _FUNCTION._serialized_start=1856 + _FUNCTION._serialized_end=2002 + _METHOD._serialized_start=2004 + _METHOD._serialized_end=2127 + _CLASS._serialized_start=2130 + 
_CLASS._serialized_end=2273 + _INTERFACE._serialized_start=2275 + _INTERFACE._serialized_end=2361 + _ENUM._serialized_start=2363 + _ENUM._serialized_end=2444 + _TYPE._serialized_start=2446 + _TYPE._serialized_end=2490 + _UNION._serialized_start=2492 + _UNION._serialized_end=2537 # @@protoc_insertion_point(module_scope) diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index 88b9b13ab..000000000 --- a/docker-compose.yaml +++ /dev/null @@ -1,12 +0,0 @@ -services: - memgraph: - image: memgraph/memgraph-mage - ports: - - "${MEMGRAPH_PORT:-7687}:7687" - - "${MEMGRAPH_HTTP_PORT:-7444}:7444" - lab: - image: memgraph/lab - ports: - - "${LAB_PORT:-3000}:3000" - environment: - QUICK_CONNECT_MG_HOST: memgraph diff --git a/TODO.md b/docs/TODO.md similarity index 100% rename from TODO.md rename to docs/TODO.md diff --git a/docs/advanced/adding-languages.md b/docs/advanced/adding-languages.md new file mode 100644 index 000000000..ed69ef6c9 --- /dev/null +++ b/docs/advanced/adding-languages.md @@ -0,0 +1,104 @@ +--- +description: "Add support for new programming languages to Code-Graph-RAG using Tree-sitter grammars." +--- + +# Adding Languages + +Code-Graph-RAG makes it easy to add support for any language that has a Tree-sitter grammar. The system automatically handles grammar compilation and integration. + +!!! warning + While you can add languages yourself, we recommend waiting for official full support to ensure optimal parsing quality, comprehensive feature coverage, and robust integration. [Submit a language request](https://codeberg.org/vitali87/code-graph-rag/issues) if you need a specific language supported. 
+ +## Quick Start + +Use the built-in language management tool: + +```bash +cgr language add-grammar <language-name> +``` + +Examples: + +```bash +cgr language add-grammar c-sharp +cgr language add-grammar php +cgr language add-grammar ruby +cgr language add-grammar kotlin +``` + +## Custom Grammar Repositories + +For languages hosted outside the standard tree-sitter organization: + +```bash +cgr language add-grammar --grammar-url https://github.com/custom/tree-sitter-mylang +``` + +## What Happens Automatically + +When you add a language, the tool automatically: + +1. **Downloads the Grammar**: Clones the tree-sitter grammar repository as a git submodule +2. **Detects Configuration**: Auto-extracts language metadata from `tree-sitter.json` +3. **Analyzes Node Types**: Automatically identifies AST node types for functions/methods, classes/structs, modules/files, and function calls +4. **Compiles Bindings**: Builds Python bindings from the grammar source +5. **Updates Configuration**: Adds the language to `codebase_rag/language_config.py` +6. **Enables Parsing**: Makes the language immediately available for codebase analysis + +## Example: Adding C# Support + +```bash +$ cgr language add-grammar c-sharp +Using default tree-sitter URL: https://github.com/tree-sitter/tree-sitter-c-sharp +Adding submodule from https://github.com/tree-sitter/tree-sitter-c-sharp... +Successfully added submodule at grammars/tree-sitter-c-sharp +Auto-detected language: c-sharp +Auto-detected file extensions: ['cs'] +Auto-detected node types: +Functions: ['destructor_declaration', 'method_declaration', 'constructor_declaration'] +Classes: ['struct_declaration', 'enum_declaration', 'interface_declaration', 'class_declaration'] +Modules: ['compilation_unit', 'file_scoped_namespace_declaration', 'namespace_declaration'] +Calls: ['invocation_expression'] + +Language 'c-sharp' has been added to the configuration! 
+Updated codebase_rag/language_config.py +``` + +## Managing Languages + +```bash +cgr language list-languages + +cgr language remove-language <language-name> +``` + +## Language Configuration + +Each language is defined in `codebase_rag/language_config.py`: + +```python +"language-name": LanguageConfig( + name="language-name", + file_extensions=[".ext1", ".ext2"], + function_node_types=["function_declaration", "method_declaration"], + class_node_types=["class_declaration", "struct_declaration"], + module_node_types=["compilation_unit", "source_file"], + call_node_types=["call_expression", "method_invocation"], +), +``` + +## Troubleshooting + +**Grammar not found**: Use a custom URL if the automatic URL doesn't work: + +```bash +cgr language add-grammar --grammar-url https://github.com/custom/tree-sitter-mylang +``` + +**Version incompatibility**: If you get "Incompatible Language version" errors: + +```bash +uv add tree-sitter@latest +``` + +**Missing node types**: The tool automatically detects common node patterns, but you can manually adjust the configuration in `language_config.py` if needed. diff --git a/docs/advanced/building-binaries.md b/docs/advanced/building-binaries.md new file mode 100644 index 000000000..b250d52c7 --- /dev/null +++ b/docs/advanced/building-binaries.md @@ -0,0 +1,15 @@ +--- +description: "Build a standalone binary of Code-Graph-RAG using PyInstaller." +--- + +# Building Binaries + +You can build a standalone binary of Code-Graph-RAG using the `build_binary.py` script. This uses PyInstaller to package the application and its dependencies into a single executable. + +## Build + +```bash +python build_binary.py +``` + +The resulting binary will be located in the `dist` directory. 
diff --git a/docs/advanced/ignore-patterns.md b/docs/advanced/ignore-patterns.md new file mode 100644 index 000000000..a17ad4b70 --- /dev/null +++ b/docs/advanced/ignore-patterns.md @@ -0,0 +1,28 @@ +--- +description: "Configure .cgrignore to exclude directories from Code-Graph-RAG analysis." +--- + +# Ignore Patterns + +You can specify additional directories to exclude from analysis by creating a `.cgrignore` file in your repository root. + +## Format + +``` +# Comments start with # +vendor +.custom_cache +my_build_output +``` + +## Rules + +- One directory name per line +- Lines starting with `#` are comments +- Blank lines are ignored +- Patterns are exact directory name matches (not globs) +- Patterns from `.cgrignore` are merged with `--exclude` flags and auto-detected directories + +## Default Exclusions + +Code-Graph-RAG automatically excludes common non-source directories such as `.git`, `node_modules`, `__pycache__`, `dist`, `build`, and similar. diff --git a/docs/advanced/troubleshooting.md b/docs/advanced/troubleshooting.md new file mode 100644 index 000000000..22a2dd27c --- /dev/null +++ b/docs/advanced/troubleshooting.md @@ -0,0 +1,46 @@ +--- +description: "Troubleshoot common Code-Graph-RAG issues with Memgraph, Ollama, and model configuration." +--- + +# Troubleshooting + +## Check Memgraph Connection + +- Ensure Docker containers are running: `docker compose ps` +- Verify Memgraph is accessible on port 7687 + +## View Database in Memgraph Lab + +- Open [http://localhost:3000](http://localhost:3000) +- Connect to `memgraph:7687` + +## Local Model Issues (Ollama) + +- Verify Ollama is running: `ollama list` +- Check if models are downloaded: `ollama pull llama3` +- Test Ollama API: `curl http://localhost:11434/v1/models` +- Check Ollama logs: `ollama logs` + +## General Checklist + +1. Check the logs for error details +2. Verify Memgraph connection +3. Ensure all environment variables are set +4. Review the graph schema matches your expectations +5. 
Run `cgr doctor` to validate your setup + +## Language Grammar Issues + +**Grammar not found**: Use a custom URL: + +```bash +cgr language add-grammar --grammar-url https://github.com/custom/tree-sitter-mylang +``` + +**Version incompatibility**: Update tree-sitter: + +```bash +uv add tree-sitter@latest +``` + +**Missing node types**: Manually adjust the configuration in `codebase_rag/language_config.py`. diff --git a/docs/architecture/graph-schema.md b/docs/architecture/graph-schema.md new file mode 100644 index 000000000..9e240007d --- /dev/null +++ b/docs/architecture/graph-schema.md @@ -0,0 +1,173 @@ +--- +description: "Knowledge graph schema with node types, relationships, and language-specific AST mappings." +--- + +# Graph Schema + +The knowledge graph uses a unified schema across all supported languages. + +## Node Types + +| Label | Properties | +|-------|------------| +| Project | `{name: string}` | +| Package | `{qualified_name: string, name: string, path: string}` | +| Folder | `{path: string, name: string}` | +| File | `{path: string, name: string, extension: string}` | +| Module | `{qualified_name: string, name: string, path: string}` | +| Class | `{qualified_name: string, name: string, decorators: list[string]}` | +| Function | `{qualified_name: string, name: string, decorators: list[string]}` | +| Method | `{qualified_name: string, name: string, decorators: list[string]}` | +| Interface | `{qualified_name: string, name: string}` | +| Enum | `{qualified_name: string, name: string}` | +| Type | `{qualified_name: string, name: string}` | +| Union | `{qualified_name: string, name: string}` | +| ModuleInterface | `{qualified_name: string, name: string, path: string}` | +| ModuleImplementation | `{qualified_name: string, name: string, path: string, implements_module: string}` | +| ExternalPackage | `{name: string, version_spec: string}` | + +## Relationships + +| Source | Relationship | Target | +|--------|-------------|--------| +| Project, Package, 
Folder | CONTAINS_PACKAGE | Package | +| Project, Package, Folder | CONTAINS_FOLDER | Folder | +| Project, Package, Folder | CONTAINS_FILE | File | +| Project, Package, Folder | CONTAINS_MODULE | Module | +| Module, Function, Method | DEFINES | Class, Function | +| Class | DEFINES_METHOD | Method | +| Module | IMPORTS | Module | +| Module | EXPORTS | Class, Function | +| Module | EXPORTS_MODULE | ModuleInterface | +| Module | IMPLEMENTS_MODULE | ModuleImplementation | +| Class | INHERITS | Class | +| Class | IMPLEMENTS | Interface | +| Method | OVERRIDES | Method | +| ModuleImplementation | IMPLEMENTS | ModuleInterface | +| Project | DEPENDS_ON_EXTERNAL | ExternalPackage | +| Function, Method | CALLS | Function, Method | + +## Nested Definitions + +A function or class defined inside another function or method (a closure or a function-local class) is attached by `DEFINES` to its **enclosing scope**, not flattened onto the Module. So `DEFINES` can originate from a `Function` or `Method` as well as a `Module`. A top-level function or class is still defined by its `Module`. + +Methods of classes defined inside function bodies are captured only when `CGR_CAPTURE_LOCAL_DEFINITIONS` is enabled (see [Configuration](../getting-started/configuration.md)); the function-local *classes* themselves are captured by default. + +## Qualified Name Uniqueness + +`qualified_name` uniquely identifies each `Function`, `Method`, and `Class` node. When the same qualified name is defined more than once in a module, every definition is kept as a distinct node. This happens with the `if has_x(): ... else: ...` import-fallback idiom, `typing.overload`, and `try/except ImportError` fallbacks. + +The first definition keeps the plain dotted qualified name; each later definition is suffixed with `@<number>` (for example `pkg.module.store_embedding@161`) so both survive instead of one overwriting the other. The `name` property stays the plain name on every variant. 
+ +A `CALLS` edge to a name that has more than one definition links to every variant, since each is a runtime-possible target. + +## Language-Specific AST Mappings + +### C++ + +- `class_specifier` +- `declaration` +- `enum_specifier` +- `field_declaration` +- `function_definition` +- `lambda_expression` +- `struct_specifier` +- `template_declaration` +- `union_specifier` + +### Java + +- `annotation_type_declaration` +- `class_declaration` +- `constructor_declaration` +- `enum_declaration` +- `interface_declaration` +- `method_declaration` +- `record_declaration` + +### JavaScript + +- `arrow_function` +- `class` +- `class_declaration` +- `function_declaration` +- `function_expression` +- `generator_function_declaration` +- `method_definition` + +### Lua + +- `function_declaration` +- `function_definition` + +### Python + +- `class_definition` +- `function_definition` + +### Rust + +- `closure_expression` +- `enum_item` +- `function_item` +- `function_signature_item` +- `impl_item` +- `struct_item` +- `trait_item` +- `type_item` +- `union_item` + +### TypeScript + +- `abstract_class_declaration` +- `arrow_function` +- `class` +- `class_declaration` +- `enum_declaration` +- `function_declaration` +- `function_expression` +- `function_signature` +- `generator_function_declaration` +- `interface_declaration` +- `internal_module` +- `method_definition` +- `type_alias_declaration` + +### C# + +- `anonymous_method_expression` +- `class_declaration` +- `constructor_declaration` +- `destructor_declaration` +- `enum_declaration` +- `function_pointer_type` +- `interface_declaration` +- `lambda_expression` +- `local_function_statement` +- `method_declaration` +- `struct_declaration` + +### Go + +- `function_declaration` +- `method_declaration` +- `type_declaration` + +### PHP + +- `anonymous_function` +- `arrow_function` +- `class_declaration` +- `enum_declaration` +- `function_definition` +- `function_static_declaration` +- `interface_declaration` +- `trait_declaration` + 
+### Scala + +- `class_definition` +- `function_declaration` +- `function_definition` +- `object_definition` +- `trait_definition` diff --git a/docs/architecture/language-support.md b/docs/architecture/language-support.md new file mode 100644 index 000000000..b5a85c488 --- /dev/null +++ b/docs/architecture/language-support.md @@ -0,0 +1,34 @@ +--- +description: "Supported programming languages and their feature coverage in Code-Graph-RAG." +--- + +# Language Support + +Code-Graph-RAG uses Tree-sitter for language-agnostic AST parsing with a unified graph schema across all languages. + +## Support Matrix + +| Language | Status | Extensions | Functions | Classes/Structs | Modules | Package Detection | Additional Features | +|----------|--------|------------|-----------|-----------------|---------|-------------------|---------------------| +| C++ | Fully Supported | .cpp, .h, .hpp, .cc, .cxx, .hxx, .hh, .ixx, .cppm, .ccm | Yes | Yes | Yes | Yes | Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces | +| Java | Fully Supported | .java | Yes | Yes | Yes | No | Generics, annotations, modern features (records/sealed classes), concurrency, reflection | +| JavaScript | Fully Supported | .js, .jsx | Yes | Yes | Yes | No | ES6 modules, CommonJS, prototype methods, object methods, arrow functions | +| Lua | Fully Supported | .lua | Yes | No | Yes | No | Local/global functions, metatables, closures, coroutines | +| Python | Fully Supported | .py | Yes | Yes | Yes | Yes | Type inference, decorators, nested functions | +| Rust | Fully Supported | .rs | Yes | Yes | Yes | Yes | impl blocks, associated functions | +| TypeScript | Fully Supported | .ts, .tsx | Yes | Yes | Yes | No | Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules | +| C# | In Development | .cs | Yes | Yes | Yes | No | Classes, interfaces, generics (planned) | +| Go | In Development | .go | Yes | Yes | Yes | No | Methods, type declarations | +| PHP | Fully 
Supported | .php | Yes | Yes | Yes | No | Classes, interfaces, traits, enums, namespaces, PHP 8 attributes | +| Scala | In Development | .scala, .sc | Yes | Yes | Yes | No | Case classes, objects | + +## Language-Agnostic Design + +All languages share a unified graph schema, meaning queries work the same way regardless of language. You can query across languages in the same knowledge graph when analyzing polyglot repositories. + +## Adding New Languages + +Code-Graph-RAG makes it easy to add support for any language that has a Tree-sitter grammar. See the [Adding Languages](../advanced/adding-languages.md) guide. + +!!! tip + While you can add languages yourself, we recommend waiting for official full support for optimal parsing quality and comprehensive feature coverage. [Submit a language request](https://codeberg.org/vitali87/code-graph-rag/issues) if you need a specific language supported. diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md new file mode 100644 index 000000000..5181f9d87 --- /dev/null +++ b/docs/architecture/overview.md @@ -0,0 +1,51 @@ +--- +description: "Architecture overview of Code-Graph-RAG's two-component system for codebase analysis." +--- + +# Architecture Overview + +Code-Graph-RAG consists of two main components that work together to analyze and query codebases. + +## Components + +### 1. Multi-Language Parser + +A Tree-sitter based parsing system that analyzes codebases and ingests data into Memgraph. + +- Uses Tree-sitter for robust, language-agnostic AST parsing +- Extracts functions, classes, methods, modules, and their relationships +- Supports 11 programming languages with a unified graph schema +- Handles complex patterns like nested functions, class hierarchies, and cross-module calls + +### 2. RAG System (`codebase_rag/`) + +An interactive CLI for querying the stored knowledge graph. 
+ +- Translates natural language questions into Cypher queries +- Retrieves source code snippets for found elements +- Supports AI-powered code editing with AST-based targeting +- Provides code optimization with interactive approval workflow + +## Data Flow + +``` +Source Code → Tree-sitter Parser → AST Analysis → Memgraph Knowledge Graph + ↓ +User Query → AI Model (Cypher Gen) → Cypher Query → Graph Results → Response +``` + +## Key Dependencies + +| Dependency | Purpose | +|-----------|---------| +| `tree-sitter` | Language-agnostic AST parsing | +| `pymgclient` | Memgraph database adapter | +| `pydantic-ai` | Agent framework for LLM integration | +| `pydantic-settings` | Settings management | +| `mcp` | Model Context Protocol SDK | +| `typer` | CLI framework | +| `rich` | Terminal rendering | +| `prompt-toolkit` | Interactive command line | +| `diff-match-patch` | Code patching | +| `watchdog` | Filesystem events monitoring | +| `huggingface-hub` | UniXcoder model download | diff --git a/docs/assets/demo.gif b/docs/assets/demo.gif new file mode 100644 index 000000000..0260a2f83 Binary files /dev/null and b/docs/assets/demo.gif differ diff --git a/docs/assets/favicon.png b/docs/assets/favicon.png new file mode 100644 index 000000000..7ea975f2d Binary files /dev/null and b/docs/assets/favicon.png differ diff --git a/docs/assets/logo-dark-any.png b/docs/assets/logo-dark-any.png new file mode 100644 index 000000000..56508a2d7 Binary files /dev/null and b/docs/assets/logo-dark-any.png differ diff --git a/docs/assets/logo-icon.png b/docs/assets/logo-icon.png new file mode 100644 index 000000000..5449b7e03 Binary files /dev/null and b/docs/assets/logo-icon.png differ diff --git a/docs/assets/logo-light-any.png b/docs/assets/logo-light-any.png new file mode 100644 index 000000000..89be19120 Binary files /dev/null and b/docs/assets/logo-light-any.png differ diff --git a/docs/claude-code-setup.md b/docs/claude-code-setup.md index e4ede4397..e7250c4eb 100644 --- 
a/docs/claude-code-setup.md +++ b/docs/claude-code-setup.md @@ -47,7 +47,7 @@ claude mcp add --transport stdio code-graph-rag \ ```bash # 1. Install code-graph-rag -git clone https://github.com/vitali87/code-graph-rag.git +git clone https://codeberg.org/vitali87/code-graph-rag.git cd code-graph-rag uv sync diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..bf7373fd2 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,105 @@ +--- +description: "Contribution guidelines for Code-Graph-RAG including setup, code standards, and PR process." +--- + +# Contributing + +Thank you for your interest in contributing to Code-Graph-RAG! + +## Getting Started + +1. **Browse Issues**: Check out the [issue tracker](https://codeberg.org/vitali87/code-graph-rag/issues) to find tasks that need work. Look for `good first issue` and `help wanted` labels. +2. **Pick an Issue**: Choose an issue that interests you and matches your skill level +3. **Comment on the Issue**: Let us know you're working on it to avoid duplicate effort +4. **Fork the Repository**: Create your own fork to work on +5. **Create a Branch**: Use a descriptive branch name like `feat/add-feature` or `fix/bug-description` + +## Development Setup + +```bash +git clone https://github.com/YOUR-USERNAME/code-graph-rag.git +cd code-graph-rag +make dev +``` + +This installs all dependencies and sets up pre-commit hooks automatically. + +## Pre-commit Hooks + +All commits must pass pre-commit checks. Do not skip hooks with `--no-verify`. 
+ +```bash +pre-commit install +pre-commit autoupdate +``` + +## Running Checks Locally + +```bash +make lint # Lint check +make format # Format check +make typecheck # Type check +make test-parallel # Unit tests in parallel +make test-integration # Integration tests (requires Docker) +``` + +Or run everything at once: + +```bash +make check # Runs lint + typecheck + test +make pre-commit # Runs ALL pre-commit checks (mirrors CI) +``` + +## Pull Request Guidelines + +- Keep PRs focused on a single issue or feature +- Write clear, descriptive commit messages using Conventional Commits format +- Include tests for new functionality +- Update documentation when necessary +- Be responsive to feedback during code review + +### CI Pipeline + +All pull requests are validated by CI, which runs in parallel: + +1. **Lint & Format**: `ruff check` and `ruff format --check` +2. **Type Check**: `ty check` on production code +3. **Unit Tests**: Parallel execution with `pytest-xdist` and coverage reporting +4. **Integration Tests**: Full stack testing with Memgraph +5. **PR Title Validation**: Conventional Commits format check + +### Automated Code Review + +This project uses automated code review bots (**Greptile** and **Gemini Code Assist**). Before requesting a human review, address all bot comments by either implementing suggestions or replying with a clear justification for why a suggestion doesn't apply. + +## Technical Requirements + +- **PydanticAI Only**: Do not introduce other agentic frameworks (LangChain, CrewAI, AutoGen, etc.) 
+- **Heavy Pydantic Usage**: Use Pydantic models for data validation, serialization, and configuration +- **Package Management**: Use `uv` for all dependency management +- **Code Quality**: Use `ruff` for linting and formatting +- **Type Safety**: Use type hints everywhere and run `uv run ty check` + +## Development Tools + +| Tool | Purpose | +|------|---------| +| `uv` | Package manager and dependency resolver | +| `ruff` | Code linting and formatting | +| `ty` | Static type checking (from Astral) | +| `pytest` | Testing framework | +| `ripgrep` (`rg`) | Shell command text searching | + +## Comment Policy + +No inline comments are allowed unless they: + +1. Appear before any code at the top of the file +2. Contain the `(H)` marker (intentional, human-written comment) +3. Are type annotations (`type:`, `noqa`, `pyright`, `ty:`) + +## Questions? + +- Open a discussion on GitHub +- Comment on the relevant issue +- Reach out to the maintainers diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md new file mode 100644 index 000000000..1a72298fe --- /dev/null +++ b/docs/getting-started/configuration.md @@ -0,0 +1,128 @@ +--- +description: "Configure Code-Graph-RAG with provider settings, environment variables, and model options." +--- + +# Configuration + +Configuration is managed through environment variables in the `.env` file. The provider-explicit configuration supports mixing different providers for orchestrator and cypher models. 
+ +## Provider Examples + +### All Ollama (Local Models) + +```bash +ORCHESTRATOR_PROVIDER=ollama +ORCHESTRATOR_MODEL=llama3.2 +ORCHESTRATOR_ENDPOINT=http://localhost:11434/v1 + +CYPHER_PROVIDER=ollama +CYPHER_MODEL=codellama +CYPHER_ENDPOINT=http://localhost:11434/v1 +``` + +### All OpenAI Models + +```bash +ORCHESTRATOR_PROVIDER=openai +ORCHESTRATOR_MODEL=gpt-4o +ORCHESTRATOR_API_KEY=sk-your-openai-key + +CYPHER_PROVIDER=openai +CYPHER_MODEL=gpt-4o-mini +CYPHER_API_KEY=sk-your-openai-key +``` + +### All Google Models + +```bash +ORCHESTRATOR_PROVIDER=google +ORCHESTRATOR_MODEL=gemini-2.5-pro +ORCHESTRATOR_API_KEY=your-google-api-key + +CYPHER_PROVIDER=google +CYPHER_MODEL=gemini-2.5-flash +CYPHER_API_KEY=your-google-api-key +``` + +Get your Google API key from [Google AI Studio](https://aistudio.google.com/app/apikey). + +### Mixed Providers + +```bash +ORCHESTRATOR_PROVIDER=google +ORCHESTRATOR_MODEL=gemini-2.5-pro +ORCHESTRATOR_API_KEY=your-google-api-key + +CYPHER_PROVIDER=ollama +CYPHER_MODEL=codellama +CYPHER_ENDPOINT=http://localhost:11434/v1 +``` + +## Orchestrator Model Settings + +| Variable | Description | +|----------|-------------| +| `ORCHESTRATOR_PROVIDER` | Provider name (`google`, `openai`, `ollama`) | +| `ORCHESTRATOR_MODEL` | Model ID (e.g., `gemini-2.5-pro`, `gpt-4o`, `llama3.2`) | +| `ORCHESTRATOR_API_KEY` | API key for the provider (if required) | +| `ORCHESTRATOR_ENDPOINT` | Custom endpoint URL (if required) | +| `ORCHESTRATOR_PROJECT_ID` | Google Cloud project ID (for Vertex AI) | +| `ORCHESTRATOR_REGION` | Google Cloud region (default: `us-central1`) | +| `ORCHESTRATOR_PROVIDER_TYPE` | Google provider type (`gla` or `vertex`) | +| `ORCHESTRATOR_THINKING_BUDGET` | Thinking budget for reasoning models | +| `ORCHESTRATOR_SERVICE_ACCOUNT_FILE` | Path to service account file (for Vertex AI) | + +## Cypher Model Settings + +| Variable | Description | +|----------|-------------| +| `CYPHER_PROVIDER` | Provider name (`google`, `openai`, `ollama`) 
| +| `CYPHER_MODEL` | Model ID (e.g., `gemini-2.5-flash`, `gpt-4o-mini`, `codellama`) | +| `CYPHER_API_KEY` | API key for the provider (if required) | +| `CYPHER_ENDPOINT` | Custom endpoint URL (if required) | +| `CYPHER_PROJECT_ID` | Google Cloud project ID (for Vertex AI) | +| `CYPHER_REGION` | Google Cloud region (default: `us-central1`) | +| `CYPHER_PROVIDER_TYPE` | Google provider type (`gla` or `vertex`) | +| `CYPHER_THINKING_BUDGET` | Thinking budget for reasoning models | +| `CYPHER_SERVICE_ACCOUNT_FILE` | Path to service account file (for Vertex AI) | + +## System Settings + +| Variable | Default | Description | +|----------|---------|-------------| +| `MEMGRAPH_HOST` | `localhost` | Memgraph hostname | +| `MEMGRAPH_PORT` | `7687` | Memgraph port | +| `MEMGRAPH_HTTP_PORT` | `7444` | Memgraph HTTP port | +| `LAB_PORT` | `3000` | Memgraph Lab port | +| `MEMGRAPH_BATCH_SIZE` | `1000` | Batch size for Memgraph operations | +| `TARGET_REPO_PATH` | `.` | Default repository path | +| `CGR_CAPTURE_LOCAL_DEFINITIONS` | `false` | Capture classes/methods defined inside function bodies (function-local definitions). Off by default to keep the graph free of throwaway helpers and test mocks; enable for exhaustive structure capture. | +| `LOCAL_MODEL_ENDPOINT` | `http://localhost:11434/v1` | Fallback endpoint for Ollama | + +## Setting Up Ollama + +```bash +curl -fsSL https://ollama.ai/install.sh | sh + +ollama pull llama3.2 +# Or try other models: +# ollama pull llama3 +# ollama pull mistral +# ollama pull codellama +``` + +Ollama automatically starts serving on `localhost:11434`. + +!!! note + Local models provide privacy and no API costs, but may have lower accuracy compared to cloud models like Gemini or GPT-4o. 
+ +## Programmatic Configuration + +You can also configure providers programmatically via the Python SDK: + +```python +from cgr import settings + +settings.set_orchestrator("openai", "gpt-4o", api_key="sk-...") +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 000000000..5509f83ec --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,115 @@ +--- +description: "Install Code-Graph-RAG and set up Memgraph for multi-language codebase analysis." +--- + +# Installation + +## Prerequisites + +- Python 3.12+ +- Docker & Docker Compose (for Memgraph) +- **cmake** (required for building pymgclient dependency) +- **ripgrep** (`rg`) (required for shell command text searching) +- **For cloud models**: Google Gemini API key, OpenAI API key, or both +- **For local models**: Ollama installed and running +- `uv` package manager (recommended) or `pip` + +### Installing cmake and ripgrep + +=== "macOS" + + ```bash + brew install cmake ripgrep + ``` + +=== "Ubuntu/Debian" + + ```bash + sudo apt-get update + sudo apt-get install cmake ripgrep + ``` + +=== "CentOS/RHEL" + + ```bash + sudo yum install cmake + sudo dnf install ripgrep + ``` + + ripgrep may need to be installed from EPEL or via `cargo install ripgrep`. 
+ +## Install from PyPI + +```bash +pip install code-graph-rag +``` + +With all Tree-sitter grammars (Python, JS, TS, Rust, Go, Java, Scala, C++, Lua): + +```bash +pip install 'code-graph-rag[treesitter-full]' +``` + +With semantic code search (UniXcoder embeddings): + +```bash +pip install 'code-graph-rag[semantic]' +``` + +With both full language support and semantic search: + +```bash +pip install 'code-graph-rag[treesitter-full,semantic]' +``` + +## Install from Source + +```bash +git clone https://codeberg.org/vitali87/code-graph-rag.git +cd code-graph-rag +``` + +For basic Python support: + +```bash +uv sync +``` + +For full multi-language support: + +```bash +uv sync --extra treesitter-full +``` + +For development (including tests and pre-commit hooks): + +```bash +make dev +``` + +This installs all dependencies and sets up pre-commit hooks automatically. + +## Start Memgraph + +```bash +docker compose up -d +``` + +This starts the Memgraph database on port 7687 and Memgraph Lab on port 3000. + +## Set Up Environment Variables + +```bash +cp .env.example .env +# Edit .env with your configuration +``` + +See the [Configuration](configuration.md) guide for all available options. + +## Verify Your Setup + +```bash +cgr doctor +``` + +This checks that all required dependencies and services are available. diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md new file mode 100644 index 000000000..97100cc9b --- /dev/null +++ b/docs/getting-started/quickstart.md @@ -0,0 +1,103 @@ +--- +description: "Parse, query, and export your codebase in 5 minutes with Code-Graph-RAG." +--- + +# Quick Start + +Get from zero to querying your codebase in 5 minutes. + +## Step 1: Parse a Repository + +Parse and ingest a multi-language repository into the knowledge graph. 
+ +**For the first repository (clean start):** + +```bash +cgr start --repo-path /path/to/repo1 --update-graph --clean +``` + +**For additional repositories (preserve existing data):** + +```bash +cgr start --repo-path /path/to/repo2 --update-graph +cgr start --repo-path /path/to/repo3 --update-graph +``` + +**Control Memgraph batch flushing:** + +```bash +cgr start --repo-path /path/to/repo --update-graph --batch-size 5000 +``` + +The system automatically detects and processes files for all supported languages. + +## Step 2: Query the Codebase + +Start the interactive RAG CLI: + +```bash +cgr start --repo-path /path/to/your/repo +``` + +**Specify custom models:** + +```bash +cgr start --repo-path /path/to/your/repo \ + --orchestrator ollama:llama3.2 \ + --cypher ollama:codellama +``` + +```bash +cgr start --repo-path /path/to/your/repo \ + --orchestrator google:gemini-2.0-flash-thinking-exp-01-21 \ + --cypher google:gemini-2.5-flash-lite-preview-06-17 +``` + +**Example queries:** + +- "Show me all classes that contain 'user' in their name" +- "Find functions related to database operations" +- "What methods does the User class have?" 
+- "Show me functions that handle authentication" +- "List all TypeScript components" +- "Find Rust structs and their methods" +- "Add logging to all database connection functions" +- "Refactor the User class to use dependency injection" + +## Step 3: Export Graph Data + +**Export during graph update:** + +```bash +cgr start --repo-path /path/to/repo --update-graph --clean -o my_graph.json +``` + +**Export existing graph without updating:** + +```bash +cgr export -o my_graph.json +``` + +**Work with exported data in Python:** + +```python +from codebase_rag.graph_loader import load_graph + +graph = load_graph("my_graph.json") +summary = graph.summary() +print(f"Total nodes: {summary['total_nodes']}") +print(f"Total relationships: {summary['total_relationships']}") + +functions = graph.find_nodes_by_label("Function") +for func in functions[:5]: + relationships = graph.get_relationships_for_node(func.node_id) + print(f"Function {func.properties['name']} has {len(relationships)} relationships") +``` + +## What Next? + +- [CLI Reference](../guide/cli-reference.md) for all available commands +- [Interactive Querying](../guide/interactive-querying.md) for query examples +- [Code Optimization](../guide/code-optimization.md) for AI-powered improvements +- [MCP Server](../guide/mcp-server.md) for Claude Code integration +- [Python SDK](../sdk/overview.md) for programmatic access diff --git a/docs/guide/cli-reference.md b/docs/guide/cli-reference.md new file mode 100644 index 000000000..6c5842703 --- /dev/null +++ b/docs/guide/cli-reference.md @@ -0,0 +1,111 @@ +--- +description: "Complete CLI reference for Code-Graph-RAG commands and Makefile targets." +--- + +# CLI Reference + +The `cgr` command is the main entry point for Code-Graph-RAG. + +## Core Commands + +### `cgr start` + +Parse a repository and/or start the interactive query CLI. 
+ +```bash +cgr start --repo-path /path/to/repo [OPTIONS] +``` + +| Option | Description | +|--------|-------------| +| `--repo-path` | Path to repository (defaults to current directory) | +| `--update-graph` | Parse and ingest the repository into the knowledge graph | +| `--clean` | Clear existing data before ingesting | +| `--batch-size` | Override Memgraph flush batch size | +| `--orchestrator` | Specify provider:model for main operations (e.g., `google:gemini-2.5-pro`, `ollama:llama3.2`) | +| `--cypher` | Specify provider:model for graph queries (e.g., `google:gemini-2.5-flash`, `ollama:codellama`) | +| `-o` | Export graph to JSON file during update | + +### `cgr export` + +Export the knowledge graph to JSON. + +```bash +cgr export -o my_graph.json +``` + +### `cgr optimize` + +AI-powered codebase optimization. + +```bash +cgr optimize <language> --repo-path /path/to/repo [OPTIONS] +``` + +| Option | Description | +|--------|-------------| +| `--repo-path` | Path to repository | +| `--orchestrator` | Specify provider:model for operations | +| `--batch-size` | Override Memgraph flush batch size | +| `--reference-document` | Path to reference documentation for guided optimization | + +Supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `cpp` + +### `cgr mcp-server` + +Start the MCP server for Claude Code integration. + +```bash +cgr mcp-server +``` + +### `cgr index` + +Index a repository to protobuf for offline use. + +```bash +cgr index -o ./index-output --repo-path ./my-project +``` + +### `cgr doctor` + +Check that all required dependencies and services are available. + +```bash +cgr doctor +``` + +### `cgr language` + +Manage language support. 
+ +```bash +cgr language add-grammar <language-name> +cgr language add-grammar --grammar-url <url> +cgr language list-languages +cgr language remove-language <language-name> +``` + +## Makefile Commands + +| Command | Description | +|---------|-------------| +| `make help` | Show help message | +| `make all` | Install everything for full development environment | +| `make install` | Install project dependencies with full language support | +| `make python` | Install project dependencies for Python only | +| `make dev` | Setup development environment (install deps + pre-commit hooks) | +| `make test` | Run unit tests only (fast, no Docker) | +| `make test-parallel` | Run unit tests in parallel (fast, no Docker) | +| `make test-integration` | Run integration tests (requires Docker) | +| `make test-all` | Run all tests including integration and e2e (requires Docker) | +| `make test-parallel-all` | Run all tests in parallel (requires Docker) | +| `make clean` | Clean up build artifacts and cache | +| `make build-grammars` | Build grammar submodules | +| `make watch` | Watch repository for changes and update graph in real-time | +| `make readme` | Regenerate README.md from codebase | +| `make lint` | Run ruff check | +| `make format` | Run ruff format | +| `make typecheck` | Run type checking with ty | +| `make check` | Run all checks: lint, typecheck, test | +| `make pre-commit` | Run all pre-commit checks locally | diff --git a/docs/guide/code-optimization.md b/docs/guide/code-optimization.md new file mode 100644 index 000000000..77b7e6698 --- /dev/null +++ b/docs/guide/code-optimization.md @@ -0,0 +1,91 @@ +--- +description: "AI-powered codebase optimization with language-specific best practices and interactive approval." +--- + +# Code Optimization + +Code-Graph-RAG provides AI-powered codebase optimization with best practices guidance and an interactive approval workflow. 
+ +## Basic Usage + +```bash +cgr optimize python --repo-path /path/to/your/repo +``` + +## With Reference Documentation + +Guide the optimization process using your own coding standards: + +```bash +cgr optimize python \ + --repo-path /path/to/your/repo \ + --reference-document /path/to/best_practices.md +``` + +```bash +cgr optimize java \ + --reference-document ./ARCHITECTURE.md +``` + +```bash +cgr optimize rust \ + --reference-document ./docs/performance_guide.md +``` + +The agent incorporates guidance from your reference documents when suggesting optimizations, ensuring they align with your project's standards and architectural decisions. + +## Using Specific Models + +```bash +cgr optimize javascript \ + --repo-path /path/to/frontend \ + --orchestrator google:gemini-2.0-flash-thinking-exp-01-21 +``` + +```bash +cgr optimize javascript --repo-path /path/to/frontend \ + --batch-size 5000 +``` + +## Supported Languages + +All supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `cpp` + +## How It Works + +1. **Analysis Phase**: The agent analyzes your codebase structure using the knowledge graph +2. **Pattern Recognition**: Identifies common anti-patterns, performance issues, and improvement opportunities +3. **Best Practices Application**: Applies language-specific best practices and patterns +4. **Interactive Approval**: Presents each optimization suggestion for your approval before implementation +5. **Guided Implementation**: Implements approved changes with detailed explanations + +## Example Session + +``` +Starting python optimization session... +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ The agent will analyze your python codebase and propose specific ┃ +┃ optimizations. You'll be asked to approve each suggestion before ┃ +┃ implementation. Type 'exit' or 'quit' to end the session. 
┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + +Analyzing codebase structure... +Found 23 Python modules with potential optimizations + +Optimization Suggestion #1: + File: src/data_processor.py + Issue: Using list comprehension in a loop can be optimized + Suggestion: Replace with generator expression for memory efficiency + + [y/n] Do you approve this optimization? +``` + +## CLI Options + +| Option | Description | +|--------|-------------| +| `--orchestrator` | Specify provider:model for main operations | +| `--cypher` | Specify provider:model for graph queries | +| `--repo-path` | Path to repository (defaults to current directory) | +| `--batch-size` | Override Memgraph flush batch size | +| `--reference-document` | Path to reference documentation | diff --git a/docs/guide/graph-export.md b/docs/guide/graph-export.md new file mode 100644 index 000000000..814321dd0 --- /dev/null +++ b/docs/guide/graph-export.md @@ -0,0 +1,63 @@ +--- +description: "Export the Code-Graph-RAG knowledge graph to JSON for programmatic analysis and integration." +--- + +# Graph Export + +Export the entire knowledge graph to JSON for programmatic access and integration with other tools. 
+ +## Export Commands + +**Export during graph update:** + +```bash +cgr start --repo-path /path/to/repo --update-graph --clean -o my_graph.json +``` + +**Export existing graph without updating:** + +```bash +cgr export -o my_graph.json +``` + +**Adjust Memgraph batching during export:** + +```bash +cgr export -o my_graph.json --batch-size 5000 +``` + +## Working with Exported Data + +```python +from codebase_rag.graph_loader import load_graph + +graph = load_graph("my_graph.json") + +summary = graph.summary() +print(f"Total nodes: {summary['total_nodes']}") +print(f"Total relationships: {summary['total_relationships']}") + +functions = graph.find_nodes_by_label("Function") +classes = graph.find_nodes_by_label("Class") + +for func in functions[:5]: + relationships = graph.get_relationships_for_node(func.node_id) + print(f"Function {func.properties['name']} has {len(relationships)} relationships") +``` + +## Example Analysis Script + +```bash +python examples/graph_export_example.py my_graph.json +``` + +## Use Cases + +Exported graph data is useful for: + +- Integration with other tools +- Custom analysis scripts +- Building documentation generators +- Creating code metrics dashboards + +See the [Python SDK](../sdk/overview.md) for more programmatic access patterns. diff --git a/docs/guide/interactive-querying.md b/docs/guide/interactive-querying.md new file mode 100644 index 000000000..5f3dd983b --- /dev/null +++ b/docs/guide/interactive-querying.md @@ -0,0 +1,89 @@ +--- +description: "Query your codebase with natural language using Code-Graph-RAG's interactive CLI." +--- + +# Interactive Querying + +Code-Graph-RAG lets you ask questions about your codebase in plain English. The system translates your questions into Cypher queries, executes them against the knowledge graph, and returns relevant results with source code snippets. 
+ +## Starting the CLI + +```bash +cgr start --repo-path /path/to/your/repo +``` + +## Example Queries + +### Finding Code Elements + +- "Show me all classes that contain 'user' in their name" +- "Find functions related to database operations" +- "What methods does the User class have?" +- "Show me functions that handle authentication" +- "List all TypeScript components" +- "Find Rust structs and their methods" +- "Show me Go interfaces and implementations" + +### Analyzing Relationships + +- "Find all functions that call each other" +- "What classes are in the user module?" +- "Show me functions with the longest call chains" +- "What functions call UserService.create_user?" +- "Show me all classes that implement the Repository interface" + +### C++ Specific Queries + +- "Find all C++ operator overloads in the Matrix class" +- "Show me C++ template functions with their specializations" +- "List all C++ namespaces and their contained classes" +- "Find C++ lambda expressions used in algorithms" + +### Code Editing Queries + +- "Add logging to all database connection functions" +- "Refactor the User class to use dependency injection" +- "Convert these Python functions to async/await pattern" +- "Add error handling to authentication methods" +- "Optimize this function for better performance" + +## Semantic Code Search + +Search for functions by describing what they do, rather than by exact names: + +- "error handling functions" +- "authentication code" +- "database connection setup" + +Semantic search uses UniXcoder embeddings and requires the `semantic` extra: + +```bash +pip install 'code-graph-rag[semantic]' +``` + +## Agentic Tools + +The interactive agent has access to these tools: + +| Tool | Description | +|------|-------------| +| `query_graph` | Query the knowledge graph using natural language | +| `read_file` | Read the content of text-based files | +| `create_file` | Create a new file with content | +| `replace_code` | Surgically replace specific code blocks 
| +| `list_directory` | List directory contents | +| `analyze_document` | Analyze documents (PDFs, images) | +| `execute_shell` | Execute shell commands from allowlist | +| `semantic_search` | Semantic function search by description | +| `get_function_source` | Retrieve source code by node ID | +| `get_code_snippet` | Retrieve source code by qualified name | + +## Intelligent File Editing + +The agent uses AST-based function targeting with Tree-sitter for precise code modifications: + +- **Visual diff preview** before changes +- **Surgical patching** that only modifies target code blocks +- **Multi-language support** across all supported languages +- **Security sandbox** preventing edits outside project directory +- **Smart function matching** with qualified names and line numbers diff --git a/docs/guide/mcp-server.md b/docs/guide/mcp-server.md new file mode 100644 index 000000000..ea9048c09 --- /dev/null +++ b/docs/guide/mcp-server.md @@ -0,0 +1,140 @@ +--- +description: "Integrate Code-Graph-RAG with Claude Code as an MCP server for natural language codebase analysis." +--- + +# MCP Server (Claude Code Integration) + +Code-Graph-RAG can run as an MCP (Model Context Protocol) server, enabling seamless integration with Claude Code and other MCP clients. 
+ +## Quick Setup + +**If installed via pip** (and `code-graph-rag` is on your PATH): + +```bash +claude mcp add --transport stdio code-graph-rag \ + --env TARGET_REPO_PATH=/absolute/path/to/your/project \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- code-graph-rag mcp-server +``` + +**If installed from source:** + +```bash +claude mcp add --transport stdio code-graph-rag \ + --env TARGET_REPO_PATH=/absolute/path/to/your/project \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- uv run --directory /path/to/code-graph-rag code-graph-rag mcp-server +``` + +### Using Current Directory + +```bash +cd /path/to/your/project + +claude mcp add --transport stdio code-graph-rag \ + --env TARGET_REPO_PATH="$(pwd)" \ + --env CYPHER_PROVIDER=google \ + --env CYPHER_MODEL=gemini-2.0-flash \ + --env CYPHER_API_KEY=your-google-api-key \ + -- uv run --directory /absolute/path/to/code-graph-rag code-graph-rag mcp-server +``` + +## Prerequisites + +```bash +git clone https://codeberg.org/vitali87/code-graph-rag.git +cd code-graph-rag +uv sync + +docker run -p 7687:7687 -p 7444:7444 memgraph/memgraph-platform +``` + +## Available Tools + +| Tool | Description | +|------|-------------| +| `list_projects` | List all indexed projects in the knowledge graph database | +| `delete_project` | Delete a specific project from the knowledge graph database | +| `wipe_database` | Completely wipe the entire database (cannot be undone) | +| `index_repository` | Parse and ingest the repository into the knowledge graph | +| `query_code_graph` | Query the codebase knowledge graph using natural language | +| `get_code_snippet` | Retrieve source code for a function, class, or method by qualified name | +| `surgical_replace_code` | Surgically replace an exact code block using diff-match-patch | +| `read_file` | Read file contents with pagination support | +| `write_file` | Write content to a 
file | +| `list_directory` | List directory contents | + +## Example Usage + +``` +> Index this repository +> What functions call UserService.create_user? +> Update the login function to add rate limiting +``` + +## LLM Provider Options + +=== "OpenAI" + + ```bash + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=sk-... + ``` + +=== "Google Gemini" + + ```bash + --env CYPHER_PROVIDER=google \ + --env CYPHER_MODEL=gemini-2.5-flash \ + --env CYPHER_API_KEY=... + ``` + +=== "Ollama (free, local)" + + ```bash + --env CYPHER_PROVIDER=ollama \ + --env CYPHER_MODEL=llama3.2 + ``` + +## Multi-Repository Setup + +Add separate named instances for different projects: + +```bash +claude mcp add --transport stdio code-graph-rag-backend \ + --env TARGET_REPO_PATH=/path/to/backend \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- uv run --directory /path/to/code-graph-rag code-graph-rag mcp-server + +claude mcp add --transport stdio code-graph-rag-frontend \ + --env TARGET_REPO_PATH=/path/to/frontend \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- uv run --directory /path/to/code-graph-rag code-graph-rag mcp-server +``` + +!!! warning + Only one repository can be indexed at a time per MCP instance. When you index a new repository, the previous repository's data is automatically cleared. 
+ +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Can't find uv/code-graph-rag | Use absolute paths from `which uv` | +| Wrong repository analyzed | Set `TARGET_REPO_PATH` to an absolute path | +| Memgraph connection failed | Ensure `docker ps` shows Memgraph running | +| Tools not showing | Run `claude mcp list` to verify installation | + +## Remove + +```bash +claude mcp remove code-graph-rag +``` diff --git a/docs/guide/realtime-updates.md b/docs/guide/realtime-updates.md new file mode 100644 index 000000000..9516eea31 --- /dev/null +++ b/docs/guide/realtime-updates.md @@ -0,0 +1,62 @@ +--- +description: "Keep your Code-Graph-RAG knowledge graph synchronized with code changes using the real-time file watcher." +--- + +# Real-Time Graph Updates + +For active development, keep your knowledge graph automatically synchronized with code changes using the real-time updater. + +## What It Does + +- Watches your repository for file changes (create, modify, delete) +- Automatically updates the knowledge graph in real-time +- Maintains consistency by recalculating all function call relationships +- Filters out irrelevant files (`.git`, `node_modules`, etc.) 
+ +## Usage + +Run the real-time updater in a separate terminal: + +```bash +python realtime_updater.py /path/to/your/repo +``` + +Or using the Makefile: + +```bash +make watch REPO_PATH=/path/to/your/repo +``` + +### With Custom Memgraph Settings + +```bash +python realtime_updater.py /path/to/your/repo \ + --host localhost --port 7687 --batch-size 1000 +``` + +```bash +make watch REPO_PATH=/path/to/your/repo HOST=localhost PORT=7687 BATCH_SIZE=1000 +``` + +## Multi-Terminal Workflow + +```bash +# Terminal 1: Start the real-time updater +python realtime_updater.py ~/my-project + +# Terminal 2: Run the AI assistant +cgr start --repo-path ~/my-project +``` + +## CLI Arguments + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `repo_path` | Yes | | Path to repository to watch | +| `--host` | No | `localhost` | Memgraph host | +| `--port` | No | `7687` | Memgraph port | +| `--batch-size` | No | | Number of buffered nodes/relationships before flushing to Memgraph | + +## Performance Note + +The updater currently recalculates all CALLS relationships on every file change to ensure consistency. This prevents "island" problems where changes in one file aren't reflected in relationships from other files, but may impact performance on very large codebases with frequent changes. Optimization of this behavior is a work in progress. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..c62861c38 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,48 @@ +--- +description: "Graph-based RAG system that parses multi-language codebases with Tree-sitter, builds knowledge graphs, and enables natural language querying, editing, and optimization." +--- + +# Code-Graph-RAG + +**The ultimate RAG for your monorepo.** Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs. + +

+ Code-Graph-RAG Demo +

+ +## What is Code-Graph-RAG? + +Code-Graph-RAG is an accurate Retrieval-Augmented Generation (RAG) system that analyzes multi-language codebases using Tree-sitter, builds comprehensive knowledge graphs in Memgraph, and lets you query codebase structure and relationships, as well as edit code, in natural language. + +## Key Features + +- **Multi-Language Support** for Python, TypeScript, JavaScript, Rust, Java, C++, Go, Lua, and more +- **Tree-sitter Parsing** for robust, language-agnostic AST analysis +- **Knowledge Graph Storage** using Memgraph for interconnected codebase structure +- **Natural Language Querying** to ask questions about your code in plain English +- **AI-Powered Cypher Generation** with Google Gemini, OpenAI, and Ollama support +- **Code Snippet Retrieval** with actual source code for found functions and methods +- **Advanced File Editing** with AST-based function targeting and visual diff previews +- **Shell Command Execution** for running tests and CLI tools +- **Interactive Code Optimization** with language-specific best practices +- **Reference-Guided Optimization** using your own coding standards +- **Dependency Analysis** from `pyproject.toml` +- **Semantic Code Search** using UniXcoder embeddings to find functions by intent +- **MCP Server Integration** for seamless use with Claude Code +- **Real-Time Graph Updates** via file watcher for active development + +## Quick Start + +```bash +pip install code-graph-rag +docker compose up -d +cgr start --repo-path ./my-project --update-graph --clean +``` + +See the [Installation](getting-started/installation.md) guide for full setup instructions. + +## Enterprise Services + +Code-Graph-RAG is open source and free to use. For organizations that need more, we offer **fully managed cloud-hosted solutions** and **on-premise deployments**.
+ +[View plans & pricing at code-graph-rag.com](https://code-graph-rag.com/enterprise){ .md-button } diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 000000000..528edb714 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,30 @@ +{% extends "base.html" %} + +{% block extrahead %} + +{% endblock %} diff --git a/docs/reports/BENCHMARK_REPORT.md b/docs/reports/BENCHMARK_REPORT.md new file mode 100644 index 000000000..d96875e01 --- /dev/null +++ b/docs/reports/BENCHMARK_REPORT.md @@ -0,0 +1,199 @@ +# Benchmark Report: Measured vs Projected Performance + +## Methodology + +All benchmarks ran on macOS (Darwin 25.3.0), Python 3.12, using `uv run`. Each benchmark used: +- 3 warmup runs (discarded) +- 20 to 100 measured iterations (depending on benchmark) +- Statistical measures: median, mean, stddev, min, max, p95 +- Realistic data sizes matching the profiled workload (352 files, ~4,500 registry entries) + +Benchmark scripts are in `benchmarks/`. Run all with `uv run python benchmarks/run_all.py`. + +--- + +## FINDING 1: `find_ending_with` Linear Scan (48.3% of CPU) + +**The single biggest performance win available, requiring zero dependencies.** + +The `FunctionRegistryTrie.find_ending_with()` method falls back to a linear scan of all entries when the `_simple_name_lookup` index misses (80.7% miss rate per profiling data). + +### Measured Results + +| Scenario | Registry Size | Queries | Linear Scan (ms) | Full Suffix Index (ms) | Speedup | +|---|---|---|---|---|---| +| Batch lookup | 1,000 | 38 | 1.77 | 0.007 | **261x** | +| Batch lookup | 4,500 | 38 | 8.04 | 0.023 | **356x** | +| Batch lookup | 10,000 | 38 | 17.78 | 0.046 | **382x** | +| Single lookup | 4,500 | 1 | 0.22 | 0.001 | **178x** | + +### Projected vs Measured + +The integration feasibility report projected ~1.9x total speedup (saving 13.5s of 31.2s). 
Our benchmarks show that building a complete suffix index provides **178x to 382x speedup** on the specific operation, validating the projection and suggesting the total improvement could be even larger than estimated. + +### Fix + +Build a complete suffix index in `FunctionRegistryTrie` by populating `_simple_name_lookup` for every insert, and ensure all insertion code paths (including `__setitem__`) update the index. This eliminates the linear scan fallback entirely. + +--- + +## FINDING 2: pathlib vs String Operations (13.7% of CPU) + +**The `should_skip_path` function uses `pathlib.Path.relative_to()` which creates intermediate objects on every call.** + +### Measured Results + +| Operation | pathlib (ms) | String ops (ms) | Speedup | +|---|---|---|---| +| `relative_to` vs `removeprefix` (5,000 paths) | 61.3 | 0.097 | **634x** | +| `relative_to` vs `removeprefix` (20,000 paths) | 253.0 | 0.394 | **643x** | +| Full `should_skip_path` (5,000 paths) | 69.3 | 1.55 | **45x** | +| Full `should_skip_path` (20,000 paths) | 285.9 | 6.21 | **46x** | +| `Path.suffix` vs `str.rfind` (5,000 paths) | 6.97 | 0.278 | **25x** | +| `Path.name` vs `str.rfind+slice` (5,000 paths) | 6.37 | 0.360 | **18x** | + +### Projected vs Measured + +The integration report projected 4.0s savings (13.7% of 31.2s total). Our benchmarks show `pathlib.relative_to` is 634x slower than `str.removeprefix`, and the full `should_skip_path` function is 45x slower with pathlib. These numbers validate the projection: for 59,012 calls at ~57us/call (pathlib), the total is ~3.4s, matching the profiled 3.39s. + +### Fix + +Convert paths to strings at the boundary of `should_skip_path` and use `str.removeprefix()`, `str.split("/")`, and `set` membership testing instead of `Path.relative_to()` and `Path.parts`. 
+ +--- + +## FINDING 3: orjson vs stdlib json (JSON Serialization) + +**orjson provides massive speedups on serialization with zero integration overhead.** + +### Measured Results + +| Operation | Data Size | json (ms) | orjson (ms) | Speedup | +|---|---|---|---|---| +| dumps compact | 372 KB | 1.16 | 0.21 | **5.5x** | +| dumps compact | 1.9 MB | 5.73 | 1.01 | **5.7x** | +| dumps compact | 8.5 MB | 26.6 | 4.91 | **5.4x** | +| dumps indented | 372 KB | 9.70 | 0.39 | **24.7x** | +| dumps indented | 1.9 MB | 48.5 | 2.02 | **24.0x** | +| dumps indented | 8.5 MB | 216.9 | 8.58 | **25.3x** | +| loads | 372 KB | 1.26 | 0.62 | **2.0x** | +| loads | 1.9 MB | 6.23 | 3.24 | **1.9x** | +| loads | 8.5 MB | 30.1 | 16.6 | **1.8x** | + +### Projected vs Measured + +The language recommendations projected 5x to 15x. Our measured results show: +- **Compact serialization: 5.4x to 5.7x** (within projected range) +- **Indented serialization: 24x to 25x** (exceeds projected range significantly) +- **Deserialization: 1.8x to 2.0x** (below projected range) + +The indented serialization speedup is particularly relevant because `_write_graph_json` uses `json.dump(data, f, indent=2)` (the slowest path). For a 20K node graph, this drops from 217ms to 8.6ms. + +--- + +## FINDING 4: BLAKE3 vs SHA256 Hashing (NEGATIVE RESULT) + +**BLAKE3 is slower than hashlib.sha256 for this workload. The recommendation is invalidated.** + +### Measured Results + +| Operation | SHA256 (ms) | BLAKE3 (ms) | Speedup | +|---|---|---|---| +| 500 snippet hashes | 0.155 | 0.325 | **0.5x (slower)** | +| 2,000 snippet hashes | 0.594 | 1.177 | **0.5x (slower)** | +| 10,000 snippet hashes | 2.988 | 6.131 | **0.5x (slower)** | +| 50 file hashes (5KB avg) | 0.968 | 1.031 | **0.9x (slower)** | +| 200 file hashes (10KB avg) | 4.419 | 4.964 | **0.9x (slower)** | +| 500 file hashes (20KB avg) | 14.164 | 15.883 | **0.9x (slower)** | + +### Analysis + +The language recommendations projected 4x to 10x speedup. 
Our benchmarks show BLAKE3 is actually **0.5x to 0.9x** (slower) for this workload. This is because: + +1. **hashlib.sha256 is already C-backed** (OpenSSL). The baseline is not pure Python. +2. **BLAKE3's SIMD advantages require large contiguous buffers.** Code snippets average 200 bytes; file chunks are 5-20KB. BLAKE3's parallelism does not engage at these sizes. +3. **FFI overhead dominates.** The `blake3` Python package adds per-call FFI overhead that exceeds the algorithmic savings for small inputs. + +**Verdict: Do not adopt BLAKE3.** The recommendation was based on algorithmic benchmarks, not Python binding benchmarks. + +--- + +## FINDING 5: FunctionRegistryTrie Baseline Performance + +### Measured Results (Existing Python Implementation) + +| Operation | 1K entries | 5K entries | 10K entries | 50K entries | +|---|---|---|---|---| +| insert (ms) | 0.33 | 1.76 | 3.74 | 18.1 | +| lookup (ms) | 0.04 | 0.19 | 0.41 | 2.06 | +| find_ending_with (ms) | 0.004 | 0.018 | 0.046 | 0.47 | +| find_with_prefix (ms) | 0.39 | 2.18 | 4.18 | 39.9 | +| delete 25% (ms) | 0.42 | 2.10 | 4.20 | 22.2 | + +### Analysis + +The trie operations are already fast when the index is hit (O(1) via `_simple_name_lookup`). The Rust trie rewrite (projected 3x to 8x) would save microseconds per operation. The integration feasibility report correctly identified that a standalone Rust trie provides only 1.5x to 3x net gain after FFI overhead. The **pure Python fix (Finding 1) provides 178x to 382x speedup** on the actual bottleneck, making the Rust rewrite unnecessary. 
+ +--- + +## FINDING 6: GraphLoader JSON Parse + Index Build + +### Measured Results + +| Graph Size | JSON Parse Only (ms) | GraphLoader.load (ms) | Index Build Overhead | +|---|---|---|---| +| 1K nodes, 2K rels | 1.03 | 2.10 | 2.0x | +| 5K nodes, 10K rels | 5.15 | 10.6 | 2.1x | +| 20K nodes, 50K rels | 24.2 | 64.2 | 2.7x | + +### Analysis + +GraphLoader.load() is 2x to 2.7x slower than raw JSON parsing due to index construction (node-by-id, node-by-label, outgoing/incoming relationship indexes). With orjson, the JSON parse portion would drop from 24.2ms to ~13.4ms (1.8x), but index construction would remain unchanged. Net improvement for 20K nodes: 64.2ms to ~53ms (1.2x). The index construction is pure Python dict/list operations. + +--- + +## FINDING 7: File Hashing Comparison + +### Measured Results + +| Algorithm | 50 files (5KB) | 200 files (10KB) | 500 files (20KB) | +|---|---|---|---| +| SHA256 (8KB buffer) | 0.98ms | 4.43ms | 14.3ms | +| SHA256 (64KB buffer) | 1.05ms | 4.61ms | 14.9ms | +| SHA256 (mmap) | 1.30ms | 5.76ms | 17.4ms | +| MD5 | 1.22ms | 6.44ms | 24.7ms | +| BLAKE2b | 1.04ms | 5.17ms | 17.5ms | + +### Analysis + +SHA256 with 8KB buffer is already the fastest option. Larger buffers and mmap add overhead for these file sizes. MD5 is slower (no hardware acceleration on this platform). File hashing consumes <0.5% of total runtime. No optimization needed. 
+ +--- + +## Summary: Validated vs Invalidated Recommendations + +| Recommendation | Language Report Projection | Measured Result | Verdict | +|---|---|---|---| +| Fix `find_ending_with` index | ~1.9x total speedup | **261x to 382x** on the operation | **VALIDATED (exceeds projection)** | +| Replace pathlib with strings | ~1.15x total speedup | **45x to 643x** on path ops | **VALIDATED (exceeds projection)** | +| orjson for JSON | 5x to 15x on JSON ops | **1.8x to 25x** depending on operation | **VALIDATED** | +| BLAKE3 for hashing | 4x to 10x speedup | **0.5x (slower)** | **INVALIDATED** | +| neo4j-rust-ext | 3x to 10x on DB ops | N/A (wrong driver) | **INVALIDATED** (uses Memgraph/pymgclient) | +| Rust AST extension | 10x to 16x on parsing | Not benchmarked (3.1% of CPU) | **DEPRIORITIZED** (targets 3.1% of runtime) | +| Rust trie | 3x to 8x on lookups | 1.5x to 3x net (per feasibility) | **SUPERSEDED** by Python index fix | + +## Revised Priority Order (Measured) + +| Priority | Fix | Type | Measured Speedup | Effort | +|---|---|---|---|---| +| **1** | Fix `find_ending_with` suffix index | Python bugfix | 261x to 382x on operation (~1.9x total) | Low | +| **2** | Replace pathlib with string ops | Python refactor | 45x to 643x on path ops (~1.15x total) | Low | +| **3** | Cache type inference results | Python memoization | Not benchmarked (projected ~1.07x total) | Low | +| **4** | Suppress debug logging | Config change | Not benchmarked (projected ~1.06x total) | Trivial | +| **5** | Deduplicate FS traversal | Python refactor | Not benchmarked (projected ~1.05x total) | Low | +| **6** | orjson for JSON | Dependency swap | 5.4x to 25x on JSON ops | Trivial | +| **7** | Rust AST extension | Rust crate | Targets 3.1% of CPU; ~1.03x total after Python fixes | High | + +**Combined estimated speedup from priorities 1 through 6: ~3.7x, with zero language rewrites.** + +The Rust AST extension (previously the headline recommendation at "10x to 16x") targets only 3.1% of 
actual CPU time and provides ~1.03x total improvement after the pure Python fixes are applied. It should only be considered for repositories significantly larger than the current benchmark workload. diff --git a/docs/reports/INTEGRATION_FEASIBILITY.md b/docs/reports/INTEGRATION_FEASIBILITY.md new file mode 100644 index 000000000..b65a9da31 --- /dev/null +++ b/docs/reports/INTEGRATION_FEASIBILITY.md @@ -0,0 +1,392 @@ +# Integration Feasibility Report + +## Build System and Deployment Context + +**Package manager:** `uv` (Astral), defined in `pyproject.toml` with `uv.lock` +**Build backend:** setuptools (via `[tool.setuptools]`), three packages: `codebase_rag`, `codec`, `cgr` +**Distribution:** PyPI wheel, Docker image (`python:3.12-slim`), PyInstaller binary +**CI/CD:** Pre-commit hooks (ruff, ty, bandit), Makefile targets +**Python version:** 3.12+ required +**Key native dependency:** `pymgclient` (compiled from source with `--no-binary-package`) + +--- + +## Candidate 1: orjson (Drop-in JSON Replacement) + +### Integration Strategy +Drop-in dependency swap. Replace `import json` with `import orjson` in graph_loader.py, graph_updater.py, services/graph_service.py, embedder.py, stdlib_extractor.py. + +### Integration Overhead +- **Serialization boundary:** Zero. orjson is a direct Python C extension. No FFI marshalling. +- **API difference:** `orjson.dumps()` returns `bytes` not `str`. Every `json.dumps()` call site that feeds the result to something expecting `str` needs `.decode()`. In this codebase, the `_write_graph_json` function in `main.py` uses `json.dump(graph_data, f, indent=2, ensure_ascii=False)` which would need adjustment since orjson's `OPT_INDENT_2` flag replaces the `indent` parameter. +- **Protobuf service:** `services/protobuf_service.py` does not use JSON. No impact. +- **Hash cache I/O:** `_save_hash_cache` and `_load_hash_cache` use `json.dump/load` with file objects. 
orjson does not support file-object streaming; need to call `orjson.dumps()` then `f.write()`. +- **Embedding cache:** Same pattern. `EmbeddingCache.save()` uses `json.dump(self._cache, f)`. Requires manual write of bytes. +- **Build system change:** Add `orjson>=3.10.0` to `[project.dependencies]`. orjson publishes pre-built wheels for all platforms. No toolchain change. +- **Docker impact:** Zero. orjson wheels are self-contained. +- **PyInstaller impact:** Add `--hidden-import orjson`. orjson is a single .so/.pyd file, minimal size increase. + +### Net Projected Gain +- **Raw gain:** 5x to 15x on JSON operations +- **Integration overhead:** Near zero. ~10 call sites need minor API adjustments (bytes vs str, file.write vs json.dump). +- **Net gain:** 5x to 15x on JSON operations. No overhead erosion. +- **Risk:** Very low. Widely adopted library (polars, FastAPI, etc.) + +--- + +## Candidate 2: neo4j-rust-ext (NOT APPLICABLE) + +### Integration Strategy +NOT APPLICABLE. This codebase uses **Memgraph** via `pymgclient` (mgclient C library), NOT the Neo4j Python driver. The `neo4j-rust-ext` package patches the `neo4j` Python driver's PackStream implementation. It has zero effect on `pymgclient`. + +### Assessment +- `services/graph_service.py` imports `mgclient`, connects to Memgraph, and uses the mgclient C API directly. +- There is no `neo4j` dependency in `pyproject.toml`. +- The language researcher's recommendation was based on an incorrect assumption about the database driver. + +### Alternative for Memgraph Driver +- pymgclient is already a C extension wrapping Memgraph's C client library. It is already compiled code. +- The actual overhead is in Python-side batch construction (building `list[RelBatchRow]` and `list[NodeBatchRow]` dicts), Cypher query string formatting, and result deserialization in `_cursor_to_results`. +- The `_cursor_to_results` method iterates cursor results and builds `list[ResultRow]` via `dict(zip(column_names, row))`. 
This is pure Python overhead. +- Potential optimization: Use cursor iteration in C rather than Python, but this requires pymgclient changes, not neo4j-rust-ext. + +### Net Projected Gain +- **Net gain:** 0x. This recommendation is inapplicable. + +--- + +## Candidate 3: BLAKE3 (Embedding Cache Hashing) + +### Integration Strategy +Drop-in hash function replacement in `EmbeddingCache._content_hash()` and `_hash_file()` in `graph_updater.py`. + +### Integration Overhead +- **Serialization boundary:** Zero. blake3 Python package is a C extension. +- **API change:** `hashlib.sha256(content.encode()).hexdigest()` becomes `blake3.blake3(content.encode()).hexdigest()`. One-line change per call site. +- **Cache invalidation:** Existing embedding caches (`.qdrant_code_embeddings/embedding_cache.json`) and file hash caches (`.file_hashes.json`) will be invalidated because hash values change. This forces a full re-index on first run after the change. +- **Build system change:** Add `blake3>=1.0.0` to dependencies. blake3 publishes pre-built wheels. +- **Docker/PyInstaller:** Minimal impact. blake3 is a small native extension. + +### Net Projected Gain +- **Raw gain:** 4x to 10x on hashing operations +- **Practical impact:** Hashing is NOT the bottleneck. `_hash_file` reads 8KB chunks and hashes them. For a typical codebase (1000 files, avg 5KB), total hashing takes ~5ms (already fast because hashlib SHA256 is C-backed). The real I/O cost is the filesystem reads, not the hash computation. +- **Embedding cache hashing:** Similarly marginal. `_content_hash` hashes short code snippets. Each call takes microseconds. +- **Cache invalidation cost:** Forces a full re-indexing pass (potentially minutes for large repos), creating a one-time negative impact that dwarfs the per-operation savings. +- **Net gain:** Negligible in practice. The 4x to 10x improvement applies to an operation that takes microseconds per call. 
+- **Recommendation:** Skip unless profiling proves hashing is >5% of total wall clock time. + +--- + +## Candidate 4: Rust AST Processing Extension (via PyO3/maturin) + +### Integration Strategy +Build a Rust extension crate (e.g., `codebase-rag-core`) that accepts file bytes + language enum and returns structured extraction results. Use PyO3 for Python bindings and maturin for building. + +### Integration Overhead Assessment + +**Data crossing the FFI boundary:** +- **Input:** File bytes (`bytes`) and language enum (`str`). Minimal copy cost. PyO3 provides zero-copy access to Python bytes via `&[u8]`. +- **Output:** The Rust extension must return complex structured data to Python: + - Function definitions: list of (qualified_name, name, start_line, end_line, decorators, docstring) + - Class definitions: list of (qualified_name, name, parent_classes, methods) + - Call relationships: list of (caller_qn, callee_qn, caller_type, callee_type) + - Import mappings: dict of (module_qn -> dict of (local_name -> imported_qn)) + + Each of these requires constructing Python objects from Rust data. For a file with 50 functions and 200 call sites, this means ~250 Python dict/tuple creations on the return path. + +**Boundary crossing cost estimate:** +- PyO3 object creation: ~100ns per Python object (dict, str, list element) +- For a typical large file (50 functions, 100 calls, 20 imports): ~170 result objects * 5 fields each = ~850 Python object creations = ~85 microseconds +- Per-file processing time in Python currently: ~5-50ms (depends on file size) +- **FFI boundary cost as fraction of saved time: <1%**. This is excellent. + +**Coupling analysis:** + +The Rust extension needs to replicate or subsume: +1. `definition_processor.py` (7.5KB): Function/class/method extraction from AST +2. `call_processor.py` (13.7KB): Call relationship extraction +3. `call_resolver.py` (24.4KB): Call resolution with trie lookups, inheritance chains, import maps +4. 
`import_processor.py` (40KB): Language-specific import parsing (Python, JS/TS, Java, Rust, Go, C++, Lua) +5. `function_ingest.py` (16.4KB): Function registration and qualified name resolution +6. `type_inference.py` (5.8KB) + language-specific engines: Type inference for call resolution +7. `FunctionRegistryTrie` in `graph_updater.py`: Trie data structure + +Total: ~110KB of Python code with complex multi-language logic spanning 8+ languages. + +**Build system changes:** +- Add `maturin` as build dependency +- Add a `Cargo.toml` at project root or in a subdirectory (e.g., `rust/`) +- Add `tree-sitter` and language grammar crates as Rust dependencies +- Modify `pyproject.toml` to include maturin build configuration or create a separate wheel +- CI needs Rust toolchain (rustup) installed +- Docker builder stage needs Rust toolchain (~300MB image layer increase) +- PyInstaller needs to collect the compiled .so/.pyd from the Rust extension + +**Compatibility concerns:** +- Tree-sitter versions must match between Rust and Python. The codebase uses `tree-sitter==0.25.2`. The Rust `tree-sitter` crate version must be compatible. +- The Rust extension must handle all 9 supported languages with language-specific AST patterns. +- The `IngestorProtocol` interface (ensure_node_batch, ensure_relationship_batch) is called from within the processing loop. Either the Rust extension calls back into Python (expensive, defeats the purpose) OR the Rust extension accumulates all results and returns them in bulk (preferred). + +**Critical: tree-sitter Node FFI constraint (from adversarial review):** +- Tree-sitter `Node` objects are C-level pointers that cannot be marshalled across FFI boundaries. The call resolution pipeline operates on `Node` objects thousands of times per file. +- This rules out an incremental approach (e.g., rewriting just CallResolver in Rust while keeping Python tree-sitter nodes). 
The Rust extension must parse files from scratch using the `tree-sitter` Rust crate directly, producing Rust-native `Node` references. +- Consequence: the Rust extension is an all-or-nothing replacement of the entire parse-extract-resolve pipeline. Component-by-component migration (e.g., moving only call resolution to Rust) is not feasible. This increases both effort and risk. + +**Deployment complexity:** +- Requires publishing platform-specific wheels (linux-x86_64, linux-aarch64, macos-x86_64, macos-arm64, windows-x64) +- maturin handles this via GitHub Actions + `maturin[zig]` for cross-compilation +- Users without pre-built wheels need a Rust toolchain to install from source +- The Docker image build becomes significantly more complex (multi-stage with Rust) + +### Net Projected Gain +- **Raw gain:** 10x to 16x on AST processing (the primary CPU hotspot) +- **FFI boundary overhead:** <1% (excellent input/output ratio: bytes in, structured results out) +- **Build system overhead:** Significant one-time cost. Ongoing CI cost of ~2-3 min for Rust compilation per release. +- **Development effort:** High. ~110KB of Python code to rewrite in Rust, with complex multi-language pattern matching. +- **Net gain:** 9x to 15x on AST processing operations, assuming bulk return pattern. +- **Risk:** Medium-high. Large surface area, 8+ language parsers, tight coupling with existing Python data structures. +- **Recommendation:** High value, but roll it out one language at a time. Within a language the whole parse-extract-resolve pipeline must move to Rust together, so start with a single language (Python parser) as proof of concept, measure actual gains, then expand to the other languages. + +--- + +## Candidate 5: Rust FunctionRegistryTrie (via PyO3) + +### Integration Strategy +Expose a Rust-backed trie as a Python class via PyO3, bundled in the same crate as Candidate 4. + +### Integration Overhead Assessment + +**Data crossing the FFI boundary:** +- **Insert:** Python str -> Rust &str (zero-copy via PyO3), Rust stores owned copy. Cost: one string allocation per insert.
+- **Lookup (`__contains__`, `get`):** Python str -> Rust &str (zero-copy), returns bool or Python str. Cost: near zero per lookup. +- **Batch operations (`find_ending_with`, `find_with_prefix`):** Returns list of Python strings. For a query returning 50 matches, this means 50 Python string allocations. + +**Boundary crossing cost estimate:** +- Single lookup: ~50ns (vs ~200ns in Python dict) +- `find_ending_with` returning 10 results: ~1us (vs ~50us scanning Python dict) +- The trie has hot-path usage in `call_resolver.py` where every call expression triggers 2-5 trie lookups. + +**Coupling with Candidate 4:** +- If AST processing moves to Rust (Candidate 4), the trie must also be in Rust to avoid crossing back to Python for every lookup during call resolution. +- If Candidate 4 is NOT done, the Rust trie is still useful standalone, but the benefit is reduced because the Python call resolution code still creates Python strings for every lookup key. + +**Build system changes:** +- Bundled with Candidate 4. No additional build complexity. + +### Net Projected Gain +- **Raw gain:** 3x to 8x on trie operations +- **Standalone net gain (without Candidate 4):** 1.5x to 3x. Python call resolution code still creates string objects for lookup keys. FFI crossing happens per-lookup. +- **Combined net gain (with Candidate 4):** 3x to 8x. All trie operations happen in Rust with no FFI boundary during resolution. +- **Recommendation:** Only implement together with Candidate 4. Standalone, the integration overhead cuts the gains roughly in half. + +--- + +## Candidate 6: File Processing Parallelism (Python) + +### Integration Strategy +Use `concurrent.futures.ProcessPoolExecutor` to parallelize per-file processing in `GraphUpdater._process_files()`. 
+ +### Integration Overhead Assessment + +**Serialization at boundary:** +- Each worker process needs: file path (Path, serializable), language queries (NOT serializable: contains tree-sitter Parser, Query, Language objects which are C pointers). +- **Critical problem:** `LanguageQueries` contains `Parser`, `Query`, and `Language` objects from tree-sitter, which are C-level objects that cannot be serialized across process boundaries. +- Each worker would need to call `load_parsers()` independently, loading all language grammars (~50ms startup cost per worker). +- Results (function definitions, call relationships) are Python dicts/tuples that serialize easily. + +**State synchronization:** +- `FunctionRegistryTrie` is shared mutable state. Workers write to it during function registration, and readers need it during call resolution. +- With multiprocessing, each worker would have its own trie. Merging tries after parallel processing adds complexity. +- `import_mapping` in `ImportProcessor` is similarly shared mutable state. +- The three-pass architecture (structure -> definitions -> calls) has inherent sequential dependencies: pass 3 needs results from pass 2. + +**GIL considerations:** +- `threading.Thread` would not help because call resolution is CPU-bound Python code serialized by the GIL. +- `ProcessPoolExecutor` bypasses GIL but introduces serialization overhead. +- Estimated per-file serialization overhead for results: ~0.1ms per file. +- For 1000 files on 4 cores: ~25ms total serialization overhead vs ~5000ms saved. + +### Net Projected Gain +- **Raw gain:** 2x to 4x (limited by sequential passes and Amdahl's law) +- **Serialization overhead:** ~25ms for 1000 files on 4 cores (minimal) +- **Worker initialization overhead:** ~50ms per worker (grammar loading), amortized across files +- **Architecture complexity:** High. Requires restructuring the three-pass processing pipeline, managing shared state (trie, import maps), and handling errors across processes.
+- **Net gain:** 1.5x to 3x after accounting for sequential bottlenecks (pass dependencies) +- **Recommendation:** Medium priority. Worth doing after Candidate 4 (Rust extension) is evaluated. If Candidate 4 makes per-file processing fast enough, parallelism becomes less critical. + +--- + +## Candidate 7: String Processing in Call Resolution (Rust) + +### Integration Strategy +Bundled with Candidate 4. Call resolution logic moves into the Rust AST processing extension. + +### Integration Overhead +- **Standalone:** NOT recommended. Call resolution is deeply interleaved with trie lookups, import map lookups, and AST node access. Extracting just the string processing would require marshalling all context (import maps, trie state, class inheritance) across FFI on every call. +- **Bundled with Candidate 4:** Zero additional FFI overhead. The Rust extension performs call resolution as part of the same processing pass. + +### Net Projected Gain +- **Standalone net gain:** Negative. The overhead of passing import maps and trie state across FFI for each call resolution would exceed the savings from faster string processing. +- **Bundled net gain:** 5x to 10x (absorbed into Candidate 4's gains) +- **Recommendation:** Only implement as part of Candidate 4. + +--- + +## Summary: Feasibility Verdicts + +| Candidate | Strategy | FFI Overhead | Build Impact | Net Gain | Verdict | +|---|---|---|---|---|---| +| 1. orjson | Dependency swap | None | Trivial | 5x-15x on JSON | **PROCEED** | +| 2. neo4j-rust-ext | N/A | N/A | N/A | 0x (wrong driver) | **REJECT** | +| 3. BLAKE3 hashing | Dependency swap | None | Trivial | Negligible | **SKIP** (not a bottleneck) | +| 4. Rust AST extension | PyO3/maturin crate | <1% | Significant | 9x-15x on AST | **PROCEED** (incremental) | +| 5. Rust trie | PyO3 (bundled #4) | ~50% standalone | Bundled with #4 | 1.5x-3x standalone, 3x-8x bundled | **BUNDLE with #4** | +| 6. 
File parallelism | ProcessPoolExecutor | ~5ms/1000 files | Moderate refactor | 1.5x-3x | **DEFER** (after #4) | +| 7. String processing | Rust (bundled #4) | Negative standalone | Bundled with #4 | Negative standalone, 5x-10x bundled | **BUNDLE with #4** | + +## Key Finding: Integration Overhead Negation Analysis + +The critical insight is that **Candidates 5 and 7 lose most or all of their benefit if implemented standalone**: Candidate 7's net gain is negative because the FFI boundary crossing cost exceeds the per-operation savings, and Candidate 5's gain is roughly halved (1.5x to 3x standalone versus 3x to 8x bundled). They are only worthwhile when bundled with Candidate 4, which keeps all related operations on the Rust side of the boundary. + +This validates the principle: **a function that is 10x faster but incurs 8x overhead at the boundary yields only a 1.25x improvement.** For Candidates 5 and 7, the standalone case is penalized further because the boundary must be crossed per-lookup (thousands of times per file) rather than per-file. + +**Candidate 2 is completely inapplicable** due to incorrect driver assumption. + +**Candidate 3 optimizes a non-bottleneck** (microsecond-level operations). + +The only candidates with clear positive ROI accounting for integration overhead are: +1. **orjson** (zero overhead, significant JSON gains) +2. **Rust AST extension** (minimal overhead due to bytes-in/results-out architecture, massive CPU gains) + +--- + +## ADDENDUM: Revised Analysis Based on CPU Profiling Data + +The CPU profiling report (cProfile, 31.2s total, 179M function calls on 352 Python files) **dramatically changes the priority landscape.** The actual hotspots are fundamentally different from those assumed in the language recommendations. + +### Profiling Reality vs. 
Language Researcher Assumptions + +| Rank | Actual Hotspot | % CPU | Language Researcher Assumption | +|------|---------------|-------|-------------------------------| +| 1 | `find_ending_with` linear scan | 48.3% | Assumed trie was working; recommended Rust trie for data layout improvement | +| 2 | `should_skip_path` pathlib overhead | 13.7% | Not identified as a hotspot | +| 3 | `build_local_variable_type_map` (uncached AST retraversal) | 8.3% | Assumed this was part of general AST processing | +| 4 | Loguru debug logging overhead | 5.9% | Not identified | +| 5 | `identify_structure` (duplicate FS traversal) | 5.0% | Not identified | +| 6 | tree-sitter `QueryCursor.captures` | 2.5% | Assumed this was the primary bottleneck (10x-16x claim) | +| 7 | tree-sitter `Parser.parse` | 0.6% | Assumed this was the primary bottleneck | + +**Tree-sitter operations total 3.1% of CPU time.** The language researcher's Hotspot 1 ("AST Parsing and Traversal, 10x-16x via Rust") targeted an operation that consumes only 3.1% of runtime. A 16x speedup on 3.1% of runtime yields 1.03x total speedup (Amdahl's law). The projected 10x-16x headline number is misleading. + +### Revised Candidate Assessments + +#### NEW CANDIDATE A: Fix `find_ending_with` Linear Scan (Pure Python Fix) + +**Integration strategy:** Pure Python algorithmic fix. No FFI, no new dependencies. + +**Root cause:** `_simple_name_lookup` index has an 80.7% miss rate (22,096 of 27,376 calls). On miss, the code falls back to `[qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]`, scanning all ~4,500 entries per call. This generates 123.7M `str.endswith()` invocations. + +**Fix options:** +1. **Populate `_simple_name_lookup` more aggressively:** The index only contains entries added via `FunctionRegistryTrie.insert()` which populates `self._simple_name_lookup` via the passed-in reference. 
The 80.7% miss rate suggests many qualified names are inserted through code paths that bypass the simple name index population. Audit all insertion paths. +2. **Build a suffix index:** Create a `dict[str, set[QualifiedName]]` mapping the last dot-separated segment of every qualified name to its full name. This converts O(n) scans to O(1) lookups. +3. **Cache negative results:** If a suffix has been scanned and yielded no results, cache that fact to avoid re-scanning. + +**Integration overhead:** Zero. This is a bugfix/optimization within existing Python code. +**Projected gain:** Eliminating 15.07s (48.3% of total) would reduce total runtime from 31.2s to ~16.1s. Even a 90% reduction (fixing most misses) saves ~13.5s. +**Net gain:** ~1.9x total speedup from a pure Python fix. +**Risk:** Very low. + +#### NEW CANDIDATE B: Replace pathlib with String Operations in `should_skip_path` + +**Integration strategy:** Pure Python refactor. Replace `Path.relative_to()` (3.39s across 59,012 calls) with `str.removeprefix()` or `os.path.relpath()`. + +**Root cause:** `pathlib.PurePosixPath.relative_to()` creates intermediate path objects on every call. For 59,012 calls, this creates ~118,000 intermediate objects. + +**Fix:** Convert paths to strings at the boundary and use `str.startswith()` / `str.removeprefix()` for prefix checks. The `should_skip_path` function only needs string comparison operations. + +**Integration overhead:** Zero. Internal refactor. +**Projected gain:** 4.29s (13.7%) reduced to ~0.2s (estimated 20x faster for string ops vs pathlib). Saves ~4s. +**Net gain:** ~1.15x total speedup. +**Risk:** Very low. + +#### NEW CANDIDATE C: Cache `build_local_variable_type_map` Results + +**Integration strategy:** Memoize results keyed by (file_path, function_start_line, function_end_line). + +**Root cause:** Called 5,228 times, re-traversing AST nodes that have already been parsed. Multiple functions in the same file trigger independent traversals. 
+ +**Integration overhead:** Memory cost of caching ~5,000 dict results. Estimated ~2MB. +**Projected gain:** 2.59s (8.3%) reduced to ~0.5s (first traversal per function cached, subsequent hits free). Saves ~2s. +**Net gain:** ~1.07x total speedup. +**Risk:** Low. Need to ensure cache is invalidated when files change (already handled by the incremental update system). + +#### NEW CANDIDATE D: Suppress Debug Logging in Production + +**Integration strategy:** Set loguru level to INFO or WARNING during graph building, or use lazy evaluation for debug messages. + +**Root cause:** 85,099 `debug()` calls processed (1.75s) even when debug output is not displayed. + +**Fix options:** +1. Guard expensive debug calls behind an explicit module-level flag (for example `if DEBUG_ENABLED:`); Loguru configures levels per sink, so there is no `logger.level` attribute to compare against `DEBUG`. +2. Use `logger.opt(lazy=True).debug("... {}", lambda: ...)` so that expensive format arguments are only computed when a sink accepts the message. +3. Set log level to INFO at the start of `GraphUpdater.run()`. + +**Integration overhead:** Zero. +**Projected gain:** 1.84s (5.9%) reduced to ~0.1s. Saves ~1.7s. +**Net gain:** ~1.06x total speedup. +**Risk:** Very low. Debug output is not needed during normal operation. + +#### NEW CANDIDATE E: Deduplicate Filesystem Traversal + +**Integration strategy:** `identify_structure()` and `_collect_eligible_files()` both call `rglob("*")` + `should_skip_path()`. Merge into a single traversal pass. + +**Integration overhead:** Moderate refactor of the two-pass architecture. +**Projected gain:** 1.57s (5.0%) eliminated for the duplicate pass. If combined with Candidate B (string paths), the single remaining pass also runs ~20x faster. +**Net gain:** ~1.05x total speedup. +**Risk:** Low. 
+ +### Combined Impact of Pure Python Fixes (Candidates A through E) + +| Fix | Time Saved | % of Total | +|-----|-----------|------------| +| A: Fix find_ending_with | ~13.5s | 43.3% | +| B: String paths | ~4.0s | 12.8% | +| C: Cache type inference | ~2.0s | 6.4% | +| D: Suppress debug logging | ~1.7s | 5.5% | +| E: Deduplicate FS traversal | ~1.5s | 4.8% | +| **Total saved** | **~22.7s** | **72.8%** | +| **Remaining runtime** | **~8.5s** | **27.2%** | + +**Combined speedup: ~3.7x from pure Python fixes alone, with zero integration overhead, zero build system changes, and zero deployment complexity.** + +After these fixes, the remaining 8.5s would be: +- tree-sitter operations: ~1.0s (now 11.8% of reduced total) +- Remaining call resolution: ~2.5s +- File I/O + hashing: ~0.5s +- Graph construction: ~2.5s +- Miscellaneous: ~2.0s + +### Revised Candidate 4 (Rust AST Extension) Assessment + +After pure Python fixes, tree-sitter operations are 1.0s out of 8.5s (11.8%). A 16x Rust speedup on tree-sitter would save 0.94s, reducing total runtime from 8.5s to 7.6s (1.12x improvement). **This is far below the break-even threshold** given the high development cost (~110KB of Python code to port) and build system complexity. + +The Rust AST extension only becomes worthwhile AFTER all pure Python fixes are applied AND the workload scales to much larger codebases (10,000+ files) where tree-sitter operations become a larger fraction of the reduced total. 
+ +### Revised Priority Order + +| Priority | Candidate | Type | Net Gain (on 31.2s total) | Effort | Integration Overhead | +|----------|-----------|------|---------------------------|--------|---------------------| +| **1** | **A: Fix find_ending_with** | **Python bugfix** | **~1.9x (13.5s saved)** | **Low** | **Zero** | +| **2** | **B: String path ops** | **Python refactor** | **~1.15x (4.0s saved)** | **Low** | **Zero** | +| **3** | **C: Cache type inference** | **Python memoization** | **~1.07x (2.0s saved)** | **Low** | **Zero** | +| **4** | **D: Suppress debug logging** | **Config change** | **~1.06x (1.7s saved)** | **Trivial** | **Zero** | +| **5** | **E: Deduplicate FS traversal** | **Python refactor** | **~1.05x (1.5s saved)** | **Low** | **Zero** | +| 6 | 1: orjson | Dependency swap | Marginal on indexing | Trivial | Zero | +| 7 | 4+5+7: Rust AST extension | Rust crate | 1.12x after Python fixes | High | Significant | +| 8 | 6: File parallelism | Architecture change | 1.5x-3x after Python fixes | Moderate | Moderate | + +### Conclusion + +**The top 5 optimizations require zero language rewrites and zero integration overhead.** They fix algorithmic inefficiencies (linear scan), unnecessary object creation (pathlib), redundant computation (uncached type inference, duplicate traversal), and avoidable overhead (debug logging). Together they provide ~3.7x speedup. + +The Rust AST extension (previously the headline recommendation) addresses only 3.1% of actual CPU time and is demoted to priority 7. It should only be reconsidered after Python-level fixes are applied and the workload scales to repositories an order of magnitude larger than the current test case. 
diff --git a/docs/reports/LANGUAGE_RECOMMENDATIONS.md b/docs/reports/LANGUAGE_RECOMMENDATIONS.md new file mode 100644 index 000000000..fb2cd7d24 --- /dev/null +++ b/docs/reports/LANGUAGE_RECOMMENDATIONS.md @@ -0,0 +1,423 @@ +# Language Recommendations for Performance Hotspots + +## Executive Summary + +**CPU profiling reveals that 48.3% of total runtime is spent in a single Python function** (`FunctionRegistryTrie.find_ending_with()`) performing a linear scan fallback with 123.7M `str.endswith()` calls. This is a pure algorithmic bottleneck, not a language limitation, and fixing the simple name lookup index (80.7% miss rate) would nearly halve total runtime with zero language rewrite. + +After addressing algorithmic issues (Phase 0: ~3.7x total improvement from pure Python fixes), **Rust via PyO3** is the recommended target language for the remaining CPU-bound hotspots (AST wrapper overhead, trie operations, call resolution). For serialization, **orjson** (Rust-backed) is a drop-in replacement for stdlib json. ~~neo4j-rust-ext~~ was retracted (codebase uses Memgraph/pymgclient, not Neo4j). + +**Critical distinction:** This report contains both theoretical per-instruction overhead multipliers (20x-50x from structural analysis) and empirical runtime impact (from CPU profiling). The structural multipliers explain WHY Python is slow at specific operations, but the IMPACT must be measured against the actual profiled runtime distribution via Amdahl's law. After Phase 0 Python fixes reduce the baseline from 31.2s to ~8-10s, the Rust extension (Phase 2) addresses ~20% of the reduced baseline, yielding diminishing but still meaningful returns. + +**Profiling baseline:** 31.2 seconds (cProfile), 14.0s (wall-clock), 179M function calls for indexing 352 Python files. 
+ +--- + +## Hotspot Categories and Recommendations + +### HOTSPOT 1: Tree-sitter AST Parsing and Traversal + +**Files:** `parsers/call_processor.py`, `parsers/call_resolver.py`, `parsers/definition_processor.py`, `parsers/function_ingest.py`, `parsers/structure_processor.py`, all `parsers/handlers/*.py` + +**Workload:** Per-file tree-sitter parsing, QueryCursor iteration, recursive Node traversal, text extraction/decoding from AST nodes. Every file in a repository triggers full AST parsing and multi-pass traversal for functions, classes, calls, and imports. + +**Recommended Language:** Rust (via PyO3/maturin) + +**Projected Speedup:** 20x to 50x (revised upward based on structural analysis) + +**CPU PROFILING DATA:** +- `TypeInferenceEngine.build_local_variable_type_map()`: **2.59s cumulative (8.3%)** across 5,228 calls. Traverses ASTs that have already been parsed, with no caching of results across calls within the same file. +- `QueryCursor.captures()`: **0.78s self time (2.5%)** across 11,028 calls. Already a C extension, largely irreducible. +- `Parser.parse()`: **0.19s self time (0.6%)** across 352 calls. Already C, already fast. +- **Key insight from profiling:** Tree-sitter C operations (parse + captures) total only ~1.0s (3.1% of runtime). The overwhelming majority of AST-related CPU time is in the Python wrapper code doing traversal, type inference, and call resolution around these fast C operations. This validates the Rust rewrite approach: keep tree-sitter's C parsing (fast), move the Python traversal/processing into Rust. +- Loguru debug logging: **1.84s cumulative (5.9%)** across 91,119 calls, including 85,099 debug-level calls processed even when not displayed. This is a Python-level fix (reduce log level or guard debug calls). + +**Evidence:** +- Gauge.sh case study: Moving AST-dependent operations into a Rust extension yielded a 16x speedup (8.7s to 530ms) on a 500k-line codebase. 
The original Python implementation made ~60M malloc calls and spent 35% of cycles on GC; the Rust version made ~7M malloc calls with no significant GC activity. [Source: gauge.sh/blog/python-extensions-should-be-lazy] +- Tree-sitter is already written in C/Rust. The Python bindings add per-node FFI overhead on every `.child_by_field_name()`, `.text`, and `.children` access. Moving traversal logic into Rust eliminates this boundary-crossing cost entirely. +- ast-grep (Rust-based tree-sitter tool) demonstrates that keeping AST processing in Rust-land and only returning final results to Python is the optimal architecture. [Source: github.com/ast-grep/ast-grep] +- **Structural analysis (CRITICAL severity):** Static analysis confirmed 20x to 50x overhead multiplier per node visit. Every `.parent`, `.children`, `.type` access on tree-sitter nodes goes through Python's descriptor protocol (~50 instructions vs ~1 instruction for a direct struct field read in Rust/C). Specific hot patterns identified: + - `_build_nested_qualified_name()` in `function_ingest.py:344-389`: walks parent chain upward + - `_resolve_inherited_method()` in `call_resolver.py:624-649`: BFS through class_inheritance dict + - `is_method_node()` in `parsers/utils.py:159-173`: walks parent chain for every function node + - `_collect_ancestor_path_parts()` in `function_ingest.py:369-389`: ancestor walk with repeated type checks + - `_is_nested_inside_function()` in `class_ingest/mixin.py:34-45`: another parent chain walk +- **Additional structural overhead:** `bytes.decode("utf-8")` on every `node.text` access (MEDIUM severity, 3x to 5x overhead). The LRU cache at `parsers/utils.py:48-50` mitigates this partially, but `call_processor.py:49` bypasses the cache entirely. In Rust, zero-copy `&[u8]` slices eliminate this entirely. 
+ +**Architecture:** Build a Rust extension that accepts file bytes and a language enum, performs tree-sitter parsing and all traversal passes (function extraction, class extraction, call extraction, import extraction) in Rust, and returns structured results (lists of function definitions, call relationships, class hierarchies) as Python objects. + +**GIL consideration (from concurrency analysis):** Tree-sitter's C extension already releases the GIL during parsing, which enables ThreadPoolExecutor parallelism for the current Python implementation. Any Rust rewrite MUST preserve this property by using `Python::allow_threads` in PyO3 during parsing and traversal, enabling concurrent file processing across threads without process-level parallelism overhead. + +**Why not Cython:** Cython cannot eliminate the Python-to-C FFI overhead of tree-sitter node access, since the bottleneck is the per-node boundary crossing, not Python loop overhead. Rust allows direct tree-sitter C API access without Python object creation. + +**Why not Go:** Go's FFI to C (cgo) has higher overhead than Rust's native C interop. Go's garbage collector would reintroduce the GC pauses that are a key problem in the Python implementation. PyO3 is a more mature Python interop story than Go's limited options (gopy, cgo+ctypes). + +--- + +### HOTSPOT 2: FunctionRegistryTrie Operations + +**Files:** `graph_updater.py` (FunctionRegistryTrie class), `parsers/call_resolver.py` + +**Workload:** Trie insertion and lookup for qualified function names. Every function/method/class definition triggers a trie insert (string splitting on `.`, nested dict traversal). Every call resolution triggers trie lookups, often with multiple fallback strategies (direct lookup, inheritance chain walking, simple name fallback). 
+ +**Recommended Language:** Rust (via PyO3/maturin) + +**Projected Speedup:** 10x to 50x on the post-fix baseline (NOT on the current 15s runtime) + +**IMPORTANT CONTEXT (from integration-architect):** The 10x-50x speedup applies to trie operations AFTER the algorithmic index fix (Priority 0a). After fixing the `_simple_name_lookup` 80.7% miss rate, trie operations drop from 15s to under 1s in pure Python. The Rust trie's 10x-50x improvement then applies to an operation taking <1s, yielding <1s additional savings. The algorithmic fix alone yields ~2x on total runtime. The Rust rewrite is justified by (a) GIL release enabling thread parallelism and (b) cumulative savings across all trie/string operations, but the root cause is an algorithmic bug, not a language limitation. + +**CPU PROFILING DATA (the #1 finding):** +- `find_ending_with()` at `graph_updater.py:156`: **7.91s self time (25.3%), 15.07s cumulative (48.3%)** across 27,376 calls +- Root cause: The `_simple_name_lookup` index has an **80.7% miss rate** (22,096 of 27,376 calls miss). On each miss, the code falls back to a linear scan: `[qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]`, triggering **123.7M `str.endswith()` calls** (7.21s self time) +- Called 26,950 times from `CallResolver._try_resolve_via_trie()`, the last-resort call resolution strategy +- **This single function accounts for nearly half of all CPU time. The trie data structure exists but is bypassed in favor of the linear fallback in most cases.** +- **CRITICAL: Fix the simple name lookup index first (Python algorithmic fix).** A proper reverse index mapping simple names to qualified names would eliminate the linear scan entirely, reducing this from 15.07s to sub-second. This is the highest-ROI optimization in the entire codebase. 
Note: even after the algorithmic fix, Python's per-call `str.endswith()` overhead is 5x to 10x what Rust byte-slice comparisons would cost (structural analysis cross-reference), so the Rust trie rewrite remains valuable for the remaining lookup operations. + +**Evidence for language rewrite (after algorithmic fix):** +- **Concurrency analysis confirms this is GIL-bound:** Pure Python trie/dict operations in `FunctionRegistryTrie` and `CallResolver` hold the GIL throughout, preventing any thread-level parallelism. The concurrency analyst estimates 10x to 50x speedup from moving this to native code. This is the strongest case for a Rust rewrite since it eliminates both per-operation overhead AND the GIL bottleneck. +- The current implementation uses nested Python dicts as trie nodes, which means every level of trie traversal creates Python string objects and performs dict hash lookups with full Python object overhead. +- **Structural analysis (HIGH severity):** Python dicts carry 50 to 80 bytes overhead per entry plus hash computation. Each `in` or `[]` lookup involves: hash the key string (O(n) for string length), probe the hash table, compare keys. In Rust, a `HashMap` has similar algorithmic complexity but with inline storage, no reference counting, and cache-friendly memory layout. Specialized data structures (arena-allocated tries, interned string IDs) are practical in systems languages but impractical in Python due to the object model. +- **String overhead (HIGH severity, 5x to 15x):** Qualified names are constructed, split, compared, and looked up thousands of times per file. Each `.split(".")` allocates a new list of new string objects. Each f-string creates a new heap allocation. `_calculate_import_distance()` at `call_resolver.py:651-671` splits both strings and compares elementwise. In Rust, these would be zero-copy string views or stack-allocated slices. 
+- Rust trie implementations (radix_trie crate) store data contiguously in memory with no per-node heap allocation, eliminating GC pressure. For high-miss-rate lookups (common in call resolution with fallback chains), optimized Rust tries outperform Python dicts. [Source: dev.to/timclicks/two-trie-implementations-in-rust] +- The Gauge.sh case study showed that moving data structures out of Python and into compact Rust structs reduced malloc calls by 8.5x, directly relevant to this trie-heavy workload. +- PyO3 achieves 92% of pure Rust performance for data structure operations while maintaining full Python interoperability. [Source: pyo3.rs/main/performance] + +**Architecture:** First, fix the `_simple_name_lookup` index to cover the 80.7% miss cases (Python fix). Then, implement `FunctionRegistryTrie` as a Rust struct exposed via PyO3. The `insert()`, `get()`, and `find_ending_with()` methods accept Python strings, perform all trie operations in Rust, and return results. The `__contains__` check (used heavily in call resolution) stays in Rust. Use Rust's `lasso` or `string-interner` crate for interned string IDs to eliminate the qualified name duplication across trie, `_entries`, `simple_name_lookup`, and `import_mapping` (memory profiling shows 3.5 MiB for 10k entries in Python vs ~400 KiB estimated in Rust with interning, a 9x reduction). + +**Convergence point (CPU + memory):** This is the strongest single rewrite target in the codebase. FunctionRegistryTrie is simultaneously the #1 CPU hotspot (48.3%) AND carries 9x memory overhead. A Rust replacement addresses both dimensions in one component. + +**Why not Cython:** Cython would help with loop overhead but cannot change the fundamental data layout. The bottleneck is Python dict overhead per trie node, which requires a different data structure (Rust's contiguous memory layout). 
+ +--- + +### HOTSPOT 3: JSON Serialization/Deserialization for Graph Data + +**Files:** `graph_loader.py`, `graph_updater.py`, `services/graph_service.py` + +**Workload:** Loading and saving large graph JSON files (nodes, relationships, properties). The `GraphLoader.load()` method reads potentially multi-megabyte JSON files. The `GraphUpdater` serializes graph data for Memgraph ingestion. + +**Recommended Language:** Drop-in replacement with orjson (Rust-backed) + +**Projected Speedup:** 5x to 15x + +**Evidence:** +- orjson (written in Rust) is 2x to 15.8x faster than Python's stdlib json, depending on payload size. For large payloads (>1MB), gains are 10x or more. [Source: medium.com/codeelevation/want-500-faster-json-in-python-try-orjson] +- orjson uses SIMD (AVX2) for parallel UTF-8 validation and string escaping, scanning 32 bytes at once vs byte-by-byte. [Source: github.com/ijl/orjson] +- Memory usage is 75% lower peak RSS, which matters for large graph files. +- For a 10K-record benchmark, orjson achieved 820 MB/s serialization vs json's 52 MB/s (15.8x). + +**Architecture:** Replace `import json` with `import orjson` throughout the codebase. This is the lowest-effort optimization among the language-level recommendations. orjson is a drop-in replacement for most use cases. The main API differences are that `orjson.dumps()` returns bytes instead of str, that formatting is controlled through `option=` flags (for example `orjson.OPT_INDENT_2`) rather than keyword arguments such as `indent=`, and that there are no file-object `dump()`/`load()` helpers. + +**Why this over a full rewrite:** The JSON parsing itself is the bottleneck, not the surrounding Python code. orjson already provides native Rust performance for this specific operation. Writing a custom Rust extension for JSON handling would duplicate orjson's work. + +--- + +### ~~HOTSPOT 4: Neo4j Driver Communication~~ RETRACTED + +**CORRECTION (from integration-architect):** This codebase uses **Memgraph via `pymgclient`** (a C extension), NOT the Neo4j Python driver. There is no `neo4j` dependency in `pyproject.toml`. 
The `neo4j-rust-ext` package patches the Neo4j driver's PackStream implementation and has **zero effect** on `pymgclient`. This recommendation is retracted. + +`pymgclient` is already a C extension with low overhead. CPU profiling confirms database serialization (protobuf) is negligible at 0.17s total. No language rewrite is needed for the database communication layer. + +--- + +### HOTSPOT 5: Embedding Cache Hashing + +**Files:** `embedder.py` (EmbeddingCache class) + +**Workload:** SHA256 hashing of code snippets for cache key generation. Each snippet is hashed via `hashlib.sha256(content.encode()).hexdigest()`. For large codebases, thousands of snippets are hashed. + +**Recommended Language:** Conditional: BLAKE3 (Rust-backed) if profiling confirms hashing as bottleneck + +**Projected Speedup:** 4x to 10x (for hashing only) + +**Evidence:** +- Python's hashlib SHA256 is already implemented in C (OpenSSL), so it's reasonably fast. Rust SHA256 achieves roughly 1.5x over Python's hashlib. [Source: users.rust-lang.org/t/hash-digest-performance-rust-vs-python/89686] +- If hashing is confirmed as a bottleneck, switching to BLAKE3 (via the `blake3` Python package, which is Rust-backed) provides 4x to 10x speedup over SHA256 because BLAKE3 is inherently faster and uses SIMD parallelism. [Source: devtoolspro.org/articles/sha256-alternatives-faster-hash-functions-2025/] +- The `blake3` Python package is a drop-in hash function replacement. API change is minimal: `blake3.blake3(content.encode()).hexdigest()`. + +**Architecture:** Replace `hashlib.sha256` with `blake3.blake3` in the `EmbeddingCache._content_hash()` method. This is a one-line change. Note: existing caches would need to be regenerated since hash values will differ. + +**CPU PROFILING RESULT: Hashing is NOT a bottleneck.** `_hash_file()` costs only 0.04s total (0.1%) across 453 calls. SHA-256 hashing is fast and not worth optimizing. BLAKE3 swap is deprioritized. 
+ +**Additional structural insight (MEDIUM severity):** The embedding pipeline at `embedder.py:109-126` and `unixcoder.py:97-107` crosses the Python/C boundary 3+ times per embedding: Python `list[list[int]]` to `torch.tensor` (copy), through PyTorch C++ backend (efficient), `.cpu().numpy()` (copy), `.tolist()` (N allocations for N-dim vector). Each crossing involves full memory copies and new container allocations. In Rust with `tch-rs`, tensor references can be held throughout without conversion overhead, providing 2x to 3x improvement on the embedding data path itself (separate from model inference time). + +--- + +### HOTSPOT 6: File Traversal and Processing Pipeline + +**Files:** `parsers/structure_processor.py`, `graph_updater.py` (file walking, `should_skip_path`) + +**Workload:** Walking repository directories, reading files, determining language, applying gitignore/skip rules, and feeding files into the parser pipeline. + +**Recommended Language:** Python (with concurrency improvements) + +**Projected Speedup:** 3x to 5x (via pathlib fix + deduplication, not language rewrite) + +**CPU PROFILING DATA:** +- `should_skip_path()`: **4.29s cumulative (13.7%)** across 59,270 calls. Dominated by `pathlib.relative_to()` at 3.18s across 54,519 calls, which creates intermediate `PurePosixPath` objects internally. +- `_collect_eligible_files()`: **4.71s cumulative (15.1%)** from a single call. The `rglob` itself costs only ~0.4s, but `should_skip_path` per file dominates. +- `identify_structure()`: **1.57s cumulative (5.0%)** from a single call. Performs a **duplicate** `rglob("*")` pass with separate `should_skip_path()` calls. +- **Key insight from profiling:** File traversal is NOT I/O-bound as originally assumed. The bottleneck is Python pathlib object overhead (creating intermediate Path objects for every `relative_to()` call), not filesystem I/O (`posix.scandir` costs only 0.42s). 
Using string-based path operations instead of pathlib would eliminate most of this overhead. Additionally, merging the duplicate traversal passes would cut FS stat calls in half. + +**I/O PROFILING DATA (confirms NOT I/O-bound):** +- Actual disk I/O for the entire workload totals only **0.85s (6.1% of 14.0s)**. File reads: 0.02s, hashing: 0.02s, protobuf serialization: 0.01s, JSON cache: 0.001s. +- `pathlib.relative_to()` performs **zero disk I/O**. It constructs intermediate `PurePosixPath` objects via `__init__`, `is_relative_to`, `with_segments`, `_from_parsed_parts`. Measured at **10.6 us/call**. +- **String slice equivalent: 0.065 us/call (163x faster).** This is the measured speedup from the I/O profiler for replacing `pathlib.relative_to()` with string slicing. +- Duplicate `rglob("*")` traversals cost ~0.80s combined (two passes of ~0.40s each scanning 59,283 entries). + +**Evidence:** +- The `rglob` filesystem traversal itself is fast (0.42s). The 4.29s in `should_skip_path` is pure Python object creation overhead from pathlib. +- The real opportunity is (a) replacing `pathlib.relative_to()` with string slicing (163x faster per call), and (b) merging the two separate `rglob` passes into one. + +**Architecture:** Keep file traversal in Python. Fix pathlib overhead first (Priority 0b). Thread-based parallelism for file processing is less impactful than originally estimated: CPU profiling shows tree-sitter parsing is only 0.6% of total CPU, so parallelizing parsing yields minimal gains. The dominant bottleneck (48.3%) is in the post-parsing call resolution phase, which is sequential and GIL-bound. + +**Why not Rust for traversal:** The per-file processing calls into tree-sitter (C library) and constructs Python objects. The overhead is in path manipulation (pathlib), not traversal I/O. A string-based path fix in Python is sufficient. 
+ +**Revised concurrency estimate (from concurrency analysis):** Original 3x-6x estimate for parallel file parsing revised downward since tree-sitter parsing is only 0.6% of CPU. Parallelism gains are secondary to algorithmic and native extension improvements. + +**Note (from concurrency analysis):** The Memgraph/Neo4j flush layer already uses ThreadPoolExecutor with separate connections, so the I/O layer is well structured and does not need a language rewrite. + +--- + +### HOTSPOT 7: String Processing in Call Resolution + +**Files:** `parsers/call_resolver.py`, `parsers/import_processor.py` + +**Workload:** Regex matching (`_SEPARATOR_PATTERN`, `_CHAINED_METHOD_PATTERN`), string splitting, qualified name construction (f-string concatenation), dict lookups in import maps. + +**Recommended Language:** Rust (bundled with Hotspot 1 and 2 rewrites) + +**Projected Speedup:** 5x to 20x (as part of the combined AST processing extension) + +**Evidence:** +- Rust string processing is 10x to 80x faster than Python for CPU-intensive operations. [Source: blog.jetbrains.com/rust/2025/11/10/rust-vs-python-finding-the-right-balance] +- The call resolution logic is tightly coupled to AST traversal (it runs during the call processing pass). Moving it into the same Rust extension as Hotspot 1 eliminates all Python object creation overhead for intermediate strings. +- The regex patterns used are simple (separator splitting, method chaining detection) and would be even faster using Rust's `regex` crate, which uses finite automata rather than Python's backtracking regex engine. +- **Structural analysis: Interpreter loop overhead (HIGH severity, 5x to 20x).** The innermost loops at `call_processor.py:285-328`, `import_processor.py:164-172`, and `graph_updater.py:405-434` execute ~20 to 30 Python bytecode instructions per iteration just for control flow (dynamic dispatch, isinstance checks with MRO traversal, reference count updates), before the actual work in called methods. 
A compiled language would inline these calls and eliminate dispatch overhead entirely. + +**Architecture:** Include call resolution logic in the Hotspot 1 Rust extension. The Rust code performs AST traversal, call name extraction, and call resolution in a single pass, returning only the final resolved call relationships to Python. + +--- + +## CPU Profiling Summary (from cProfile) + +**Workload:** `GraphUpdater.run(force=True)` indexing 352 Python files, 31.2s total, 179M function calls. + +| Rank | Function | Self Time | Cum. Time | % Total | Calls | Root Cause | +|---|---|---|---|---|---|---| +| 1 | `find_ending_with` | 7.91s | 15.07s | 48.3% | 27,376 | Linear scan fallback, 123.7M `endswith` calls | +| 2 | `should_skip_path` | 0.07s | 4.29s | 13.7% | 59,270 | Pathlib `relative_to` overhead (3.18s) | +| 3 | `build_local_variable_type_map` | 0.004s | 2.59s | 8.3% | 5,228 | Repeated AST traversal, no caching | +| 4 | Loguru logging | 0.41s | 1.84s | 5.9% | 91,119 | Debug-level overhead at high call volume | +| 5 | `identify_structure` | 0.02s | 1.57s | 5.0% | 1 | Duplicate FS traversal + should_skip_path | +| 6 | `QueryCursor.captures` | 0.78s | 0.78s | 2.5% | 11,028 | C extension, largely irreducible | +| 7 | `Parser.parse` | 0.19s | 0.19s | 0.6% | 352 | C extension, already fast | +| 8 | `_hash_file` | 0.001s | 0.04s | 0.1% | 453 | Negligible | + +**Key observations:** +1. 48.3% of CPU is in a single function with an algorithmic fix available (index miss rate) +2. Tree-sitter C operations (parse + captures) total only 1.0s (3.1%), confirming the bottleneck is Python wrapper code +3. Protobuf serialization is negligible (0.17s total) +4. File hashing is negligible (0.04s total) + +--- + +## Structural Performance Ceilings (from Static Analysis) + +The static-pattern-analyst identified 9 categories of Python runtime overhead that create inherent performance ceilings. 
These are organized by severity: + +| Severity | Pattern | Overhead Multiplier | Rewrite Benefit | +|---|---|---|---| +| CRITICAL | AST tree traversal (pointer chasing + dynamic dispatch) | 20x-50x per node visit | Highest | +| CRITICAL | GIL preventing parallel parsing/resolution | Linear with core count | Highest | +| HIGH | String operations on qualified names | 5x-15x | High | +| HIGH | Dictionary lookups in hot loops | 3x-10x | High | +| HIGH | Interpreter loop overhead in tight iteration | 5x-20x | High | +| MEDIUM | `bytes.decode("utf-8")` on every node text access | 3x-5x | Moderate | +| MEDIUM | Object headers + reference counting on all intermediates | 2x-5x memory reduction | Moderate | +| MEDIUM | Embedding data format conversions (Python/Tensor/NumPy) | 2x-3x per embedding | Low (model dominates) | +| MEDIUM-HIGH | File I/O with Path objects (revised upward: CPU profiling shows 13.7% of CPU) | 3x-5x | Significant (pathlib overhead, not I/O) | + +**Key insight:** The CRITICAL and HIGH severity patterns are all concentrated in the same code: the parser/ingestion pipeline (Hotspots 1, 2, 7). A single Rust extension covering AST traversal, trie operations, and call resolution would address 5 of the 9 overhead categories simultaneously. + +**Diffuse overhead note:** Object header overhead (16 bytes per object minimum) and reference counting affect all Python code. Every intermediate `tuple`, `list[str]` from `.split()`, and NamedTuple is heap-allocated with refcounting. A `tuple[str, str]` is ~100 bytes in Python vs ~16 bytes in Rust (stack-allocated). This is not directly addressable per hotspot but is eliminated automatically when hot paths move to Rust. 
+ +## Memory Profiling Data (from tracemalloc) + +Memory profiling confirms that Python's object model creates significant memory overhead in the same hotspot areas identified by CPU profiling and structural analysis: + +| Structure | Python (measured) | Estimated Rust | Memory Ratio | +|---|---|---|---| +| Tree-sitter AST node wrappers | 87.3 MiB (343 files, 1.67M wrapper objects) | ~5-10 MiB (direct C struct access) | 9-17x | +| EmbeddingCache `list[float]` | 48.6 MiB (2k embeddings) | ~6 MiB (packed f32 arrays) | 8x | +| import_mapping | 5.6 MiB (2k modules) | ~1.5 MiB | 3.7x | +| rel_groups | 3.6 MiB | ~800 KiB | 4.5x | +| FunctionRegistryTrie | 3.5 MiB (10k entries, 13.2k intermediate dicts) | ~400 KiB (arena-allocated trie) | 9x | + +**Key memory findings:** +1. **AST node wrappers (87.3 MiB)** are the largest memory consumer. Each `node.children` access creates new Python Node wrapper objects around C pointers. A Rust extension performing extraction natively would avoid all wrapper allocation, reinforcing the Hotspot 1 recommendation. +2. **EmbeddingCache (48.6 MiB)** uses Python `float` objects (28 bytes each). A 768-dim embedding as `list[float]` uses ~21.5 KiB vs ~6 KiB as packed f32. Switching to numpy arrays (Python-level fix) would provide 4x reduction; Rust packed f32 arrays would be optimal. +3. **FunctionRegistryTrie (3.5 MiB)** has 13.2k intermediate Python dict objects (64+ bytes each) for 10k entries. A Rust compact trie with byte slices or arena allocation would use ~400 KiB. +4. **String duplication:** Qualified names are stored in multiple structures (trie, `_entries`, `simple_name_lookup`, `import_mapping`). Python's string interning does not cover long qualified names. Rust string interning via a global interner would deduplicate these. + +--- + +## Non-Language Optimizations (Algorithmic / Python-Level) + +CPU profiling and concurrency analysis identified multiple high-impact optimizations that do NOT require a language rewrite. 
**These should be implemented first** as they collectively address over 70% of CPU time. + +### ALGORITHMIC 0: Fix `find_ending_with()` Simple Name Index (THE #1 PRIORITY) + +**Issue:** `FunctionRegistryTrie.find_ending_with()` at `graph_updater.py:156` accounts for **48.3% of total CPU time** (15.07s of 31.2s). The `_simple_name_lookup` index has an 80.7% miss rate, causing a linear scan fallback with 123.7M `str.endswith()` calls. + +**Projected Speedup:** ~2x on total runtime (eliminating 15s from a 31s run) + +**Action:** Build a proper reverse index mapping simple (unqualified) names to their list of qualified names. Populate it during trie insertion. This converts the O(N) linear scan into an O(1) dict lookup per call. This is a pure Python data structure fix requiring minimal code changes. + +### ALGORITHMIC 0b: Replace pathlib `relative_to()` with String Operations + +**Issue:** `should_skip_path()` consumes **4.29s (13.7%)** due to pathlib's `relative_to()` creating intermediate `PurePosixPath` objects 54,519 times. The actual filesystem I/O is only 0.42s. + +**Projected Speedup:** ~3x on the file collection phase (reducing 4.29s to ~0.5s) + +**Action:** Replace `path.relative_to(base)` with `str(path)[len(str(base))+1:]` or equivalent string slicing. Merge the duplicate `rglob("*")` passes from `_collect_eligible_files()` and `identify_structure()` into a single traversal. Additionally, pre-filter at directory level: walk the tree manually and skip ignored directories (.git, __pycache__, node_modules, site) immediately rather than enumerating all 59K descendants and filtering after. This would reduce traversal from 59K to ~600 paths. + +### ALGORITHMIC 0c: Cache Type Inference Results Per File + +**Issue:** `build_local_variable_type_map()` consumes **2.59s (8.3%)** across 5,228 calls, re-traversing ASTs that have already been parsed with no caching across calls within the same file. 
+ +**Projected Speedup:** ~2x to 5x on the type inference phase + +**Action:** Memoize type inference results per function AST node. Since the AST is immutable after parsing, results are safe to cache. + +### ALGORITHMIC 0d: Reduce Debug Logging Overhead + +**Issue:** Loguru logging consumes **1.84s (5.9%)** across 91,119 calls, including 85,099 debug-level calls processed even when not displayed. + +**Projected Speedup:** Eliminates ~1.8s (5.9% of total runtime) + +**Action:** Guard debug log calls with `if logger.isEnabledFor(DEBUG):` or use lazy formatting, or set the minimum log level to INFO in production. + +### ALGORITHMIC 0e: Use Compact JSON for Graph Export + +**Issue:** `_write_graph_json()` in `main.py:744` uses `json.dump(graph_data, f, indent=2)` which is **8x slower** than compact JSON (86ms vs 11ms for 10K nodes) and produces 1.5x larger output. + +**Projected Speedup:** 8x on graph JSON export + +**Action:** Use compact JSON (no indent) for machine consumption. Add a separate `--pretty` flag for human-readable output. + +### ALGORITHMIC 0f: Binary Format for Embedding Cache + +**Issue:** 500 embeddings (768-dim float vectors) stored as JSON = 6.3MB, save = 149ms, load = 38ms. Each embedding is serialized as a JSON array of 768 float values with full decimal precision. + +**Projected Speedup:** 10x+ on embedding cache I/O (both size and speed) + +**Action:** Use numpy `.npy` or `.npz` format for embedding vectors. A 768-dim float32 vector is 3 KiB in binary vs ~15 KiB in JSON text. + +### ALGORITHMIC 1: Batch Embedding API Usage + +**Issue:** The `embed_code_batch` function exists but is unused in the main pipeline. The embedding phase calls `embed_code` per-item instead. + +**Projected Speedup:** Potentially 5x to 12x on the embedding phase (based on batching reducing HTTP round-trip overhead and enabling server-side batching). The Baseten case study showed 12x throughput improvement from proper batching with GIL release. 
[Source: baseten.co/blog/your-client-code-matters-10x-higher-embedding-throughput-with-python-and-rust/] + +**Action:** Fix the Python pipeline to use `embed_code_batch`. This is a Python-level fix with zero language rewrite cost. + +### ALGORITHMIC 2: Incremental Call Re-Resolution + +**Issue:** The realtime updater (`realtime_updater.py`) performs full call re-resolution on every file change, reprocessing the entire function registry and call graph. + +**Projected Speedup:** 10x to 100x for incremental updates (per the concurrency analysis), since only the changed file's calls and its direct dependents need re-resolution. + +**Action:** Implement incremental call resolution that tracks which qualified names changed and only re-resolves calls that reference those names. This is an algorithmic improvement, not a language choice. + +**These two Python-level fixes should be implemented BEFORE the Rust extension work**, as they may reduce the urgency of the more expensive rewrites. + +--- + +## Language Comparison Matrix + +| Criterion | Rust (PyO3) | Cython | Go | Mojo | Zig | +|---|---|---|---|---|---| +| **Raw performance** | Excellent (C-level) | Good (C-level for numeric) | Good (2x slower than Rust) | Excellent (claims C-level) | Excellent (C-level) | +| **Python FFI quality** | Excellent (PyO3 is mature, zero-copy numpy, vectorcall) | Native (compiles to C extension) | Poor (cgo+ctypes, limited) | Poor (early stage, no stable FFI) | Poor (C ABI only, no Python tooling) | +| **Ecosystem for this workload** | Excellent (tree-sitter crate, regex, serde_json, radix_trie) | Limited (no tree-sitter, string ops need C) | Moderate (tree-sitter-go exists) | None (no tree-sitter, no graph libs) | Limited (tree-sitter C API via @cImport) | +| **Memory safety** | Excellent (borrow checker) | Poor (manual, C-level) | Good (GC, but adds pauses) | Unknown (early stage) | Moderate (manual, but safer than C) | +| **Build complexity** | Moderate (maturin makes it easy) | Low 
(cythonize) | High (separate binary, IPC needed) | High (Modular toolchain only) | High (no Python tooling) | +| **Developer availability** | Growing (22% increase in Python+Rust developers in 2025) | Declining | Low for Python extensions | Very low | Very low | +| **Real-world precedent** | ruff, uv, polars, pydantic-core, orjson | numpy, scipy (legacy) | None for similar tools | None for similar tools | None for similar tools | + +### Why Rust is the clear winner for this codebase: + +1. **PyO3 maturity:** PyO3 is the most mature Python FFI framework, with zero-copy mechanisms, vectorcall support, and 92% of pure Rust performance. [Source: pyo3.rs/main/performance] + +2. **Tree-sitter native support:** Tree-sitter's runtime is written in C/Rust. Rust can call the tree-sitter C API directly without any Python intermediary, eliminating the per-node FFI overhead that is the primary bottleneck. + +3. **Industry precedent:** The most successful Python performance tools of 2024-2025 are all Rust-backed: ruff (linter, 10-100x faster), uv (package manager), polars (DataFrame, 5-10x faster), pydantic-core (validation, 17x faster), orjson (JSON, 15x faster). [Source: thenewstack.io/rust-pythons-new-performance-engine/] + +4. **maturin build system:** maturin (also by the PyO3 team) simplifies building and distributing Rust Python extensions as standard wheels. No complex build system integration needed. 
+ +--- + +## Prioritized Implementation Order + +### Phase 0: Python Algorithmic Fixes (addresses ~72% of CPU time) + +| Priority | Fix | Effort | CPU Time Saved | % of Total | +|---|---|---|---|---| +| 0a | Fix `find_ending_with` simple name index | Very low | ~15s | 48.3% | +| 0b | Replace pathlib `relative_to` with string ops + merge duplicate rglob | Low | ~4s | 13.7% | +| 0c | Cache type inference results per file | Low | ~2s | 8.3% | +| 0d | Reduce debug logging overhead | Very low | ~1.8s | 5.9% | +| 0e | Batch embedding API usage | Very low | TBD (embedding phase) | TBD | +| 0f | Incremental call re-resolution | Medium | 10x-100x on realtime | N/A (realtime only) | + +**Phase 0 collectively addresses ~72% of measured CPU time (22.8s of 31.2s) with pure Python changes.** After Phase 0, the expected baseline would be ~8-10s for the same 352-file workload. + +### Phase 1: Drop-in Rust-backed Libraries (zero code changes) + +| Priority | Library | Effort | Expected Speedup | +|---|---|---|---| +| 1a | JSON serialization (orjson) | Very low (dependency swap) | 5x-15x on JSON ops | +| ~~1b~~ | ~~Neo4j driver (neo4j-rust-ext)~~ | ~~RETRACTED~~ | ~~Inapplicable: codebase uses Memgraph/pymgclient, not Neo4j~~ | +| 1b | Embedding hash (BLAKE3) | Very low (one-line change) | 4x-10x on hashing (confirmed negligible: 0.04s) | + +**Note from profiling:** File hashing (`_hash_file`) is only 0.04s total (0.1%), and protobuf serialization is 0.17s total. These are negligible. BLAKE3 (Priority 1b) can be deprioritized. orjson remains worthwhile for larger codebases. The neo4j-rust-ext recommendation was retracted because this codebase uses Memgraph via `pymgclient` (C extension), not the Neo4j Python driver. 
+ +### Phase 2: Rust Extension (addresses remaining CPU-bound overhead) + +| Priority | Component | Effort | Expected Speedup | +|---|---|---|---| +| 2a | AST traversal + type inference (Rust) | High (new extension) | 20x-50x on AST processing | +| 2b | Trie + call resolution (Rust) | Medium (extend 2a) | 10x-50x on lookups (GIL-bound) | + +**Phase 2 should be implemented as a single `codebase-rag-core` Rust crate**, since AST traversal, trie operations, and call resolution are tightly coupled. The Rust extension MUST release the GIL via `Python::allow_threads` during parsing and traversal to preserve thread-level parallelism. + +**Amdahl's law caveat (from integration-architect):** Tree-sitter C operations (parse + captures) are only 3.1% of CPU time. A 16x speedup on 3.1% yields only 1.03x total improvement. The value of the Rust AST extension is NOT in speeding up tree-sitter itself (already fast C code), but in eliminating the Python wrapper overhead around it: type inference re-traversal (8.3%), call resolution string operations, and interpreter loop overhead in the tight iteration loops. These Python-side AST costs total ~20% of CPU, making the combined Phase 2 extension worthwhile after Phase 0 algorithmic fixes are applied. + +### Phase 3: Architecture Improvements + +| Priority | Change | Effort | Expected Speedup | +|---|---|---|---| +| 3a | File processing parallelism (ThreadPoolExecutor) | Medium | Downgraded: marginal gains | + +**Phase 3 is downgraded based on revised analysis.** CPU profiling shows tree-sitter parsing is only 0.6% of CPU, and the file processing bottleneck (`pathlib.relative_to` at 13.7%) is GIL-bound pure Python that ThreadPoolExecutor cannot parallelize. The pathlib fix (Phase 0b, string slicing, 163x faster) is the correct solution, not parallelism. ProcessPoolExecutor for call resolution is also impractical: memory profiling shows 170 MiB peak memory, making serialization cost too high. 
The Rust PyO3 native extension (Phase 2) is the only viable path for parallelizing call resolution, as it can release the GIL via `Python::allow_threads`. + +--- + +## Sources + +- [Gauge.sh: Python extensions should be lazy](https://www.gauge.sh/blog/python-extensions-should-be-lazy) - 16x speedup moving AST processing to Rust +- [Neo4j Python Driver 10x Faster With Rust](https://neo4j.com/blog/developer/python-driver-10x-faster-with-rust/) - neo4j-rust-ext benchmarks +- [Baseten: 12x higher embedding throughput with Python and Rust](https://www.baseten.co/blog/your-client-code-matters-10x-higher-embedding-throughput-with-python-and-rust/) - PyO3 GIL release pattern +- [orjson: 500% Faster JSON in Python](https://medium.com/codeelevation/want-500-faster-json-in-python-try-orjson-powered-by-rust-22995c25c312) - JSON serialization benchmarks +- [PyO3 Performance Guide](https://pyo3.rs/main/performance) - FFI overhead characteristics +- [Rust: Python's New Performance Engine](https://thenewstack.io/rust-pythons-new-performance-engine/) - Industry adoption trends +- [Comparing Cython to Rust for Python Extensions](https://willayd.com/comparing-cython-to-rust-evaluating-python-extensions.html) - Graph algorithm benchmarks +- [SHA-256 Alternatives: BLAKE3 vs SHA-3 Speed Comparison](https://devtoolspro.org/articles/sha256-alternatives-faster-hash-functions-2025/) - Hash function benchmarks +- [Neo4j Performance Recommendations](https://neo4j.com/docs/python-manual/current/performance/) - Batch loading best practices +- [JetBrains Rust vs Python 2025](https://blog.jetbrains.com/rust/2025/11/10/rust-vs-python-finding-the-right-balance-between-speed-and-simplicity/) - String processing benchmarks +- [Databooth: Benchmarking Python with Cython, C, C++, and Rust](https://www.databooth.com.au/posts/py-num-bench/) - Extension comparison +- [Cython, Rust, and more: choosing a language for Python extensions](https://pythonspeed.com/articles/rust-cython-python-extensions/) - When 
to use each approach +- [ast-grep](https://github.com/ast-grep/ast-grep) - Rust tree-sitter code analysis tool +- [Rust trie implementations](https://dev.to/timclicks/two-trie-implementations-in-rust-ones-super-fast) - Trie performance +- [Corrode: Migrating from Python to Rust](https://corrode.dev/learn/migration-guides/python-to-rust/) - Migration guide +- [Datadog: Migrating static analyzer from Java to Rust](https://www.datadoghq.com/blog/engineering/how-we-migrated-our-static-analyzer-from-java-to-rust/) - Code analysis tool migration diff --git a/docs/reports/PRIORITIZED_SCORECARD.md b/docs/reports/PRIORITIZED_SCORECARD.md new file mode 100644 index 000000000..871d96534 --- /dev/null +++ b/docs/reports/PRIORITIZED_SCORECARD.md @@ -0,0 +1,284 @@ +# Prioritized Scorecard: Rewrite Candidates + +**Baseline:** 31.2s total, 179M function calls, indexing 352 Python files (cProfile) + +## Scoring Methodology + +Each candidate is scored 1 to 5 on six dimensions. The final rank is determined by **Net Score**, which weights measured/projected performance gain and scope of impact highest, while penalizing integration overhead, risk, and maintenance burden. + +**Weights:** Performance Gain (25%) | Memory Improvement (10%) | Integration Feasibility (20%) | Risk & Complexity (20%) | Scope of Impact (15%) | Maintenance Burden (10%) + +**Score key:** 5 = excellent, 4 = good, 3 = moderate, 2 = poor, 1 = unacceptable + +--- + +## Tier 1: ACCEPTED (High confidence, clear positive ROI) + +### Rank 1: Fix `find_ending_with` Linear Scan (Python Bugfix) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 5 | 48.3% of CPU (15.07s). Eliminates 123.7M `str.endswith()` calls. Projected ~1.9x total speedup. | +| Memory Improvement | 3 | Reduces temporary string allocations from linear scans. | +| Integration Feasibility | 5 | Pure Python fix. Zero new dependencies, zero build changes. | +| Risk & Complexity | 5 | Low risk. 
Fix the 80.7% miss rate in `_simple_name_lookup` index, or build suffix index. | +| Scope of Impact | 5 | Affects every file processed. Dominant bottleneck in the entire pipeline. | +| Maintenance Burden | 5 | No new language, no new build tooling. Standard Python data structure. | +| **Net Score** | **4.80** | | + +**Verdict: PROCEED IMMEDIATELY.** This is a bugfix, not a rewrite. The `_simple_name_lookup` index has an 80.7% miss rate, causing fallback to O(n) linear scan on every call resolution. Fixing the index population or adding a suffix index is a straightforward Python change with the highest ROI of any candidate. + +--- + +### Rank 2: Replace pathlib with String Operations in `should_skip_path` (Python Refactor) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 4 | 13.7% of CPU (4.29s across 59,012 calls). ~20x faster with string ops. | +| Memory Improvement | 4 | Eliminates ~118,000 intermediate Path objects per run. | +| Integration Feasibility | 5 | Internal refactor. No dependencies. | +| Risk & Complexity | 5 | Replace `Path.relative_to()` with `str.removeprefix()`. Straightforward. | +| Scope of Impact | 4 | Affects file traversal (called for every file and directory). | +| Maintenance Burden | 5 | Simpler code than current pathlib usage. | +| **Net Score** | **4.50** | | + +**Verdict: PROCEED.** Convert paths to strings at the boundary and use string comparison. The pathlib object creation overhead is avoidable. + +--- + +### Rank 3: Cache `build_local_variable_type_map` Results (Python Memoization) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 8.3% of CPU (2.59s across 5,228 calls). Saves ~2s. | +| Memory Improvement | 2 | Adds ~2MB cache. Slight memory increase. | +| Integration Feasibility | 5 | Add `@lru_cache` or dict-based memoization. No dependencies. | +| Risk & Complexity | 5 | Keyed by (file_path, function_start_line, function_end_line). 
Cache invalidation handled by existing incremental update system. | +| Scope of Impact | 3 | Affects call resolution for files with multiple functions. | +| Maintenance Burden | 5 | Standard memoization pattern. | +| **Net Score** | **3.90** | | + +**Verdict: PROCEED.** Standard memoization with minimal memory cost. + +--- + +### Rank 4: Suppress Debug Logging in Production (Config Change) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 5.9% of CPU (1.84s from 85,099 debug calls). Saves ~1.7s. | +| Memory Improvement | 2 | Reduces temporary string allocations from format strings. | +| Integration Feasibility | 5 | Set log level to INFO at start of `GraphUpdater.run()`. One line. | +| Risk & Complexity | 5 | Trivial. Debug output not needed during normal graph building. | +| Scope of Impact | 3 | Affects all debug logging throughout pipeline. | +| Maintenance Burden | 5 | No maintenance cost. | +| **Net Score** | **3.75** | | + +**Verdict: PROCEED.** Trivial change, meaningful gain. + +--- + +### Rank 5: Deduplicate Filesystem Traversal (Python Refactor) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 5.0% of CPU (1.57s). Eliminates duplicate `rglob("*")` + `should_skip_path()` pass. | +| Memory Improvement | 3 | Avoids building duplicate file lists. | +| Integration Feasibility | 4 | Moderate refactor: merge `identify_structure()` and `_collect_eligible_files()` into single traversal. | +| Risk & Complexity | 4 | Requires restructuring two-pass architecture. Not trivial but well-scoped. | +| Scope of Impact | 3 | Affects initial file discovery phase only. | +| Maintenance Burden | 4 | Single-pass is arguably simpler than two-pass. | +| **Net Score** | **3.55** | | + +**Verdict: PROCEED.** Combine with Rank 2 (string paths) for maximum benefit on the file traversal phase. 
+ +--- + +### Rank 6: orjson (Drop-in JSON Replacement) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 5x to 15x on JSON ops. JSON is NOT a dominant hotspot in the profiling data (indexing phase), but significant for graph export and cache I/O. | +| Memory Improvement | 4 | 75% lower peak RSS for JSON operations. | +| Integration Feasibility | 5 | Add dependency, ~10 call sites need minor adjustment (bytes vs str). | +| Risk & Complexity | 5 | Widely adopted (polars, FastAPI). Pre-built wheels for all platforms. | +| Scope of Impact | 2 | JSON ops are a small fraction of total indexing time. Bigger impact on graph export/import. | +| Maintenance Burden | 5 | Drop-in replacement. No ongoing maintenance cost. | +| **Net Score** | **3.50** | | + +**Verdict: PROCEED.** Low effort, low risk, moderate gain on I/O-heavy workflows (export, cache load/save). Not a game-changer for indexing performance. + +--- + +## Tier 2: CONDITIONAL (Worthwhile only after Tier 1 is complete) + +### Rank 7: Rust AST Processing Extension (PyO3/maturin) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 2 | Tree-sitter ops are only 3.1% of CPU BEFORE Python fixes. After Tier 1 fixes (~3.7x speedup), tree-sitter becomes ~11.8% of reduced runtime. A 16x Rust speedup saves 0.94s from 8.5s. Only 1.12x total improvement post-fixes. | +| Memory Improvement | 4 | Eliminates Python object overhead (50-80 bytes per dict entry), reduces malloc calls by ~8x. | +| Integration Feasibility | 2 | ~110KB of Python code to port. 8+ language parsers. Complex multi-language pattern matching. Requires maturin build system, Rust toolchain in CI/Docker, platform-specific wheels. | +| Risk & Complexity | 2 | Large surface area. Tight coupling with existing data structures. Tree-sitter version compatibility. IngestorProtocol callback complexity. | +| Scope of Impact | 3 | Affects all file processing. But only becomes meaningful at 10,000+ file scale. 
| +| Maintenance Burden | 2 | Introduces Rust into a pure Python project. Requires Rust expertise for ongoing maintenance. Multi-language build complexity. | +| **Net Score** | **2.35** | | + +**Verdict: DEFER.** The integration architect's analysis is decisive: tree-sitter operations consume only 3.1% of actual CPU time. The language researcher's headline claim of 10x to 16x was based on incorrect assumptions about where time was spent. After Tier 1 Python fixes, the remaining 8.5s runtime has tree-sitter at 11.8%, making a 16x Rust speedup yield only 1.12x total. The high development cost (~110KB port, multi-language parsers) and maintenance burden (Rust toolchain, platform-specific wheels) make this poor ROI until the codebase scales an order of magnitude. + +**Reconsider when:** Repository size exceeds 5,000+ files, making tree-sitter operations a larger fraction of total runtime. + +--- + +### Rank 8: File Processing Parallelism (ProcessPoolExecutor) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 1.5x to 3x after Tier 1 fixes. Limited by sequential pass dependencies (Amdahl's law). | +| Memory Improvement | 1 | Increases memory (per-worker grammar loading, duplicate tries). | +| Integration Feasibility | 3 | Requires restructuring three-pass pipeline. Shared mutable state (trie, import maps) needs synchronization. | +| Risk & Complexity | 3 | Tree-sitter objects not serializable across process boundaries. Worker initialization overhead (~50ms per worker). | +| Scope of Impact | 3 | Affects per-file processing throughput. | +| Maintenance Burden | 3 | Adds concurrency complexity. Harder to debug. | +| **Net Score** | **2.70** | | + +**Verdict: DEFER.** Worth pursuing after Tier 1 fixes reduce the baseline. The concurrency analyst confirmed tree-sitter releases the GIL during parsing, so ThreadPoolExecutor (not ProcessPoolExecutor) is the preferred approach, with lower overhead. 
But this requires the three-pass architecture to be restructured. + +--- + +## Tier 3: REJECTED (Net gain does not justify complexity) + +### Rank 9: Rust FunctionRegistryTrie (PyO3, standalone) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 2 | Standalone: 1.5x to 3x on trie ops. Python call resolution code still creates strings for every lookup key. FFI crossing per-lookup cuts gains in half. | +| Memory Improvement | 4 | Contiguous memory layout eliminates per-node dict overhead. | +| Integration Feasibility | 2 | Only viable bundled with Rank 7 (Rust AST extension). Standalone, FFI overhead negates gains. | +| Risk & Complexity | 3 | Moderate if bundled. High coupling with Rank 7. | +| Scope of Impact | 2 | **Rank 1 (fix `find_ending_with`) eliminates the primary trie bottleneck.** After that fix, trie operations are no longer the dominant cost. | +| Maintenance Burden | 2 | Requires Rust maintenance alongside Python trie. | +| **Net Score** | **2.30** | | + +**Verdict: REJECT standalone. BUNDLE with Rank 7 if/when Rank 7 proceeds.** The critical insight from the integration architect: standalone Rust trie has negative net gains because FFI boundary crossing happens per-lookup (thousands of times per file). Only viable when bundled with the full Rust AST extension. Furthermore, Rank 1 (Python bugfix) eliminates the primary trie bottleneck (the linear scan), making Rust trie less urgent. + +--- + +### Rank 10: neo4j-rust-ext + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 1 | **0x. This codebase uses Memgraph via pymgclient, NOT the Neo4j Python driver.** neo4j-rust-ext patches the `neo4j` driver which is not used. | +| Memory Improvement | 1 | N/A. | +| Integration Feasibility | 1 | Inapplicable. No `neo4j` dependency in `pyproject.toml`. | +| Risk & Complexity | 1 | Wrong driver assumption. | +| Scope of Impact | 1 | Zero impact. | +| Maintenance Burden | 1 | N/A. 
| +| **Net Score** | **1.00** | | + +**Verdict: REJECT.** The language researcher incorrectly assumed the codebase uses the Neo4j Python driver. It uses Memgraph via pymgclient (a C extension). neo4j-rust-ext has zero applicability. + +--- + +### Rank 11: BLAKE3 Hashing + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 1 | Negligible. Hashing is NOT a bottleneck. `_hash_file` processes ~5ms total for 1000 files. `_content_hash` takes microseconds per call. hashlib SHA256 is already C-backed. | +| Memory Improvement | 1 | No meaningful change. | +| Integration Feasibility | 5 | One-line change per call site. Drop-in. | +| Risk & Complexity | 3 | Cache invalidation forces full re-index on first run after change. One-time negative impact dwarfs per-operation savings. | +| Scope of Impact | 1 | Hashing is <0.1% of total runtime. | +| Maintenance Burden | 4 | Minimal. | +| **Net Score** | **1.85** | | + +**Verdict: REJECT.** Optimizing an operation that takes microseconds per call provides no meaningful improvement. The cache invalidation cost (forced full re-index) creates a one-time penalty that exceeds months of per-operation savings. The integration architect's analysis is correct: "Skip unless profiling proves hashing is >5% of total wall clock time." It is far below 5%. + +--- + +### Rank 12: String Processing in Call Resolution (Rust, standalone) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 1 | **Negative standalone.** FFI overhead of passing import maps and trie state for each call resolution exceeds the savings from faster string processing. | +| Memory Improvement | 3 | Would reduce temporary string allocations. | +| Integration Feasibility | 1 | Deeply interleaved with trie lookups, import maps, AST node access. Cannot be isolated without massive FFI overhead. | +| Risk & Complexity | 1 | Requires marshalling all context across FFI per call. 
| +| Scope of Impact | 2 | Affects call resolution, but FFI boundary negates gains. | +| Maintenance Burden | 2 | Additional Rust code for marginal or negative benefit. | +| **Net Score** | **1.40** | | + +**Verdict: REJECT standalone. BUNDLE with Rank 7 only.** The integration architect proved that the boundary crossing cost exceeds per-operation savings when implemented standalone. Only viable as part of a comprehensive Rust AST extension (Rank 7). + +--- + +## Combined Impact Projection + +### Phase 1: Tier 1 Python Fixes (Ranks 1 through 6) + +| Fix | Time Saved | % of Total | Cumulative | +|-----|-----------|------------|------------| +| Rank 1: Fix find_ending_with | ~13.5s | 43.3% | 43.3% | +| Rank 2: String path ops | ~4.0s | 12.8% | 56.1% | +| Rank 3: Cache type inference | ~2.0s | 6.4% | 62.5% | +| Rank 4: Suppress debug logging | ~1.7s | 5.5% | 68.0% | +| Rank 5: Deduplicate FS traversal | ~1.5s | 4.8% | 72.8% | +| Rank 6: orjson (I/O workflows) | Variable | Marginal on indexing | 72.8%+ | +| **Total** | **~22.7s** | **72.8%** | | + +**Projected runtime after Phase 1:** ~8.5s (3.7x speedup from pure Python fixes) +**Integration overhead:** Zero +**Build system changes:** One dependency added (orjson) +**Maintenance burden:** None beyond standard Python + +### Phase 2: Tier 2 (Only if needed after Phase 1) + +After Phase 1, the remaining 8.5s breaks down as: +- Tree-sitter operations: ~1.0s (11.8%) +- Call resolution: ~2.5s (29.4%) +- Graph construction: ~2.5s (29.4%) +- File I/O + hashing: ~0.5s (5.9%) +- Miscellaneous: ~2.0s (23.5%) + +The Rust AST extension (Rank 7) would save ~0.94s from tree-sitter, reducing to ~7.6s (1.12x). File parallelism (Rank 8) could provide 1.5x to 3x on top. Combined: ~3.0 to 5.0s total. + +**Phase 2 is only justified when repository sizes exceed 5,000+ files**, where tree-sitter and call resolution become a proportionally larger fraction of total runtime. + +--- + +## Key Findings + +1. 
**72.8% of the total runtime is addressable with pure Python fixes** (zero integration overhead, zero build changes, zero maintenance burden). + +2. **The headline Rust AST rewrite (10x to 16x) targets only 3.1% of actual CPU time.** Profiling data invalidated the language researcher's core assumption about where time is spent. + +3. **neo4j-rust-ext is completely inapplicable** (wrong database driver). This was a factual error in the language recommendations. + +4. **BLAKE3 hashing optimizes a non-bottleneck** (microsecond-level operations that total <0.1% of runtime). + +5. **Standalone Rust trie and string processing have negative net gains** due to per-lookup FFI boundary crossing costs that exceed the per-operation savings. + +6. **The single largest optimization (Rank 1) is a Python bugfix**, not a language rewrite. Fixing the `_simple_name_lookup` index miss rate from 80.7% to near 0% eliminates 48.3% of total CPU time. + +--- + +## Scorecard Summary + +| Rank | Candidate | Type | Net Score | Time Saved | Verdict | +|------|-----------|------|-----------|------------|---------| +| 1 | Fix `find_ending_with` | Python bugfix | 4.80 | ~13.5s (43.3%) | **PROCEED** | +| 2 | String path ops | Python refactor | 4.50 | ~4.0s (12.8%) | **PROCEED** | +| 3 | Cache type inference | Python memoization | 3.90 | ~2.0s (6.4%) | **PROCEED** | +| 4 | Suppress debug logging | Config change | 3.75 | ~1.7s (5.5%) | **PROCEED** | +| 5 | Deduplicate FS traversal | Python refactor | 3.55 | ~1.5s (4.8%) | **PROCEED** | +| 6 | orjson | Dependency swap | 3.50 | Variable | **PROCEED** | +| 7 | Rust AST extension | Rust crate | 2.35 | ~0.94s post-fixes | **DEFER** | +| 8 | File parallelism | Architecture change | 2.70 | 1.5x to 3x post-fixes | **DEFER** | +| 9 | Rust trie (standalone) | Rust (PyO3) | 2.30 | Marginal standalone | **REJECT** | +| 10 | neo4j-rust-ext | N/A | 1.00 | 0 (wrong driver) | **REJECT** | +| 11 | BLAKE3 hashing | Dependency swap | 1.85 | Negligible | **REJECT** | 
+| 12 | Rust string processing | Rust (standalone) | 1.40 | Negative standalone | **REJECT** | + +--- + +**Note:** Task #9 (proof-of-concept benchmarks) was still in progress when this scorecard was produced. If benchmark data reveals performance characteristics that contradict the profiling data used here, this scorecard should be revised. However, the profiling data (cProfile, 31.2s, 179M calls) is empirical and provides a strong basis for these rankings. diff --git a/docs/reports/REWRITE_RECOMMENDATIONS.md b/docs/reports/REWRITE_RECOMMENDATIONS.md new file mode 100644 index 000000000..ebd649eda --- /dev/null +++ b/docs/reports/REWRITE_RECOMMENDATIONS.md @@ -0,0 +1,340 @@ +# Rewrite Recommendations: code-graph-rag Performance Optimization + +## Executive Summary + +A comprehensive performance analysis of the code-graph-rag codebase (31.2s total, 179M function calls indexing 352 Python files) reveals that **no language rewrite is currently justified**. The top performance bottlenecks are algorithmic inefficiencies and unnecessary object creation in pure Python code, addressable with zero new dependencies and zero build system changes. + +### Top 3 Recommendations + +1. **Fix `find_ending_with` suffix index** (Python bugfix): Eliminates 48.3% of total CPU time. The `_simple_name_lookup` index has an 80.7% miss rate, causing 123.7M `str.endswith()` calls via linear scan fallback. Benchmarked fix: **261x to 382x speedup** on the operation. Projected total speedup: ~1.9x. + +2. **Replace pathlib with string operations in `should_skip_path`** (Python refactor): Eliminates 13.7% of total CPU time. `pathlib.relative_to()` creates intermediate objects on every call (59,012 calls, 3.39s total). Benchmarked fix: **45x to 634x speedup** on path operations. Projected total speedup: ~1.15x. + +3. **Cache `build_local_variable_type_map` results** (Python memoization): Eliminates 8.3% of total CPU time. 5,228 uncached AST traversals. Projected total speedup: ~1.07x. 
+ +**Combined Tier 1 impact:** ~3.7x total speedup (31.2s to ~8.5s) from pure Python fixes with zero integration overhead. + +### Key Finding: Rust Rewrite Not Justified + +The language researcher's headline recommendation (Rust AST extension for "10x to 16x speedup") targets tree-sitter operations that consume only **3.1% of actual CPU time**. After Tier 1 Python fixes, a 16x Rust speedup on tree-sitter would yield only **1.03x total improvement** (Amdahl's law). The high development cost (~110KB of Python to port, multi-language parser support, Rust toolchain in CI/Docker) and maintenance burden make this poor ROI until repository sizes exceed 5,000+ files. + +### Adversarial Review Outcome + +The adversarial reviewer confirmed that **no language rewrite candidate survives challenge**. All top hotspots are fixable in Python. The Rust AST extension was the only candidate with theoretical merit, but the measured 3.1% CPU share makes it unjustifiable at current scale. + +### Security Audit Outcome + +The security auditor approved all recommended candidates with zero disputes. The only new dependency (orjson) is a widely adopted, well-maintained package with pre-built wheels. 
+ +--- + +## Profiling Baseline + +| Metric | Value | +|--------|-------| +| Profiling tool | cProfile | +| Total runtime | 31.2 seconds | +| Total function calls | 179M | +| Workload | `GraphUpdater.run(force=True)` indexing 352 Python files | +| Platform | macOS Darwin 25.3.0, ARM64 | +| Python version | 3.12.2 (CPython) | +| Key dependencies | tree-sitter 0.25.2, pymgclient, loguru, torch 2.10 | + +--- + +## Detailed Analysis: Accepted Candidates + +### Candidate 1: Fix `find_ending_with` Linear Scan + +**Priority:** 1 (Highest) +**Type:** Python bugfix +**Effort:** Low +**Files:** `codebase_rag/graph_updater.py:156-161` + +**Profiling Data:** +- Self time: 7.91s (25.3%) +- Cumulative time: 15.07s (48.3%) +- Call count: 27,376 calls +- Root cause: `_simple_name_lookup` index miss rate of 80.7% (22,096 of 27,376 calls) +- Fallback: `[qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]` generating 123.7M `str.endswith()` invocations + +**Benchmark Results:** + +| Registry Size | Queries | Linear Scan (ms) | Suffix Index (ms) | Speedup | +|---|---|---|---|---| +| 1,000 | 38 | 1.77 | 0.007 | 261x | +| 4,500 | 38 | 8.04 | 0.023 | 356x | +| 10,000 | 38 | 17.78 | 0.046 | 382x | + +**Fix:** Populate `_simple_name_lookup` for every insert path, including `__setitem__`. Build a complete suffix index mapping the last dot-separated segment to the full qualified name set. This converts O(n) scans to O(1) lookups. 
+ +**Projected Net Gain:** ~1.9x total speedup (13.5s saved) +**Integration Overhead:** Zero +**Risk:** Very low + +--- + +### Candidate 2: Replace pathlib with String Operations + +**Priority:** 2 +**Type:** Python refactor +**Effort:** Low +**Files:** `codebase_rag/utils/path_utils.py`, `codebase_rag/graph_updater.py:364-388` + +**Profiling Data:** +- Cumulative time: 4.29s (13.7%) +- Call count: 59,270 calls +- Root cause: `pathlib.relative_to()` creates intermediate `PurePosixPath` objects (3.39s across 54,519 calls) + +**Benchmark Results:** + +| Operation | pathlib (ms) | String ops (ms) | Speedup | +|---|---|---|---| +| `relative_to` vs `removeprefix` (5K paths) | 61.3 | 0.097 | 634x | +| Full `should_skip_path` (5K paths) | 69.3 | 1.55 | 45x | +| Full `should_skip_path` (20K paths) | 285.9 | 6.21 | 46x | + +**Fix:** Convert paths to strings at the function boundary. Use `str.removeprefix()` and `str.split("/")` instead of `Path.relative_to()` and `Path.parts`. + +**Projected Net Gain:** ~1.15x total speedup (4.0s saved) +**Integration Overhead:** Zero +**Risk:** Very low + +--- + +### Candidate 3: Cache Type Inference Results + +**Priority:** 3 +**Type:** Python memoization +**Effort:** Low +**Files:** `codebase_rag/parsers/type_inference.py:119` + +**Profiling Data:** +- Cumulative time: 2.59s (8.3%) +- Call count: 5,228 calls +- Root cause: Re-traverses AST nodes per function for type inference without caching + +**Fix:** Memoize results keyed by `(file_path, function_start_line, function_end_line)`. Cache invalidation handled by existing incremental update system. 
+ +**Projected Net Gain:** ~1.07x total speedup (2.0s saved) +**Integration Overhead:** ~2MB memory for cache +**Risk:** Low + +--- + +### Candidate 4: Suppress Debug Logging in Production + +**Priority:** 4 +**Type:** Configuration change +**Effort:** Trivial +**Files:** `codebase_rag/graph_updater.py` (run method) + +**Profiling Data:** +- Cumulative time: 1.84s (5.9%) +- Call count: 91,119 calls (85,099 debug-level) +- Root cause: Debug log calls processed even when output is suppressed + +**Fix:** Set loguru level to INFO at the start of `GraphUpdater.run()`, or use `logger.opt(lazy=True).debug()` for expensive format strings. + +**Projected Net Gain:** ~1.06x total speedup (1.7s saved) +**Integration Overhead:** Zero +**Risk:** Very low + +--- + +### Candidate 5: Deduplicate Filesystem Traversal + +**Priority:** 5 +**Type:** Python refactor +**Effort:** Low +**Files:** `codebase_rag/graph_updater.py:364`, `codebase_rag/parsers/structure_processor.py:49` + +**Profiling Data:** +- `identify_structure()`: 1.57s (5.0%) +- `_collect_eligible_files()`: 4.71s (15.1%, overlapping with Candidate 2) +- Root cause: Both call `rglob("*")` + `should_skip_path()` independently + +**Fix:** Merge into a single traversal pass that collects both structural elements and eligible files. + +**Projected Net Gain:** ~1.05x total speedup (1.5s saved) +**Integration Overhead:** Moderate refactor of two-pass architecture +**Risk:** Low + +--- + +### Candidate 6: orjson for JSON Serialization + +**Priority:** 6 +**Type:** Dependency swap +**Effort:** Trivial +**Files:** All files using `import json` (graph_loader.py, graph_updater.py, embedder.py, services/graph_service.py) + +**Benchmark Results:** + +| Operation | json (ms) | orjson (ms) | Speedup | +|---|---|---|---| +| Compact dumps (1.9 MB) | 5.73 | 1.01 | 5.7x | +| Indented dumps (1.9 MB) | 48.5 | 2.02 | 24.0x | +| Loads (1.9 MB) | 6.23 | 3.24 | 1.9x | + +**Fix:** Add `orjson>=3.10.0` to dependencies. 
Replace `json.dumps()` with `orjson.dumps()` (~10 call sites, minor API adjustment for bytes vs str return type). + +**Projected Net Gain:** 5.4x to 25x on JSON operations. Marginal impact on indexing (JSON is not a dominant hotspot), significant impact on graph export/import. +**Integration Overhead:** Near zero +**Security:** Widely adopted (polars, FastAPI). Pre-built wheels. Approved by security audit. +**Risk:** Very low + +--- + +## Combined Impact Projection + +| Phase | Fixes | Time Saved | Cumulative Speedup | Overhead | +|-------|-------|-----------|-------------------|----------| +| Tier 1 | Candidates 1 through 6 | ~22.7s | ~3.7x (31.2s to ~8.5s) | Zero (except orjson dep) | + +**Post Tier 1 runtime breakdown (projected ~8.5s):** + +| Component | Time | % of Reduced Total | +|-----------|------|--------------------| +| Call resolution | ~2.5s | 29.4% | +| Graph construction | ~2.5s | 29.4% | +| Miscellaneous | ~2.0s | 23.5% | +| Tree-sitter operations | ~1.0s | 11.8% | +| File I/O + hashing | ~0.5s | 5.9% | + +--- + +## Deferred Candidates + +### Rust AST Processing Extension (PyO3/maturin) + +**Status:** DEFERRED (reconsider at 5,000+ file scale) + +**Rationale:** Tree-sitter operations consume 3.1% of CPU (0.97s). After Tier 1 fixes, this becomes 11.8% of the reduced 8.5s runtime. A 16x Rust speedup saves 0.94s, yielding 1.12x total improvement. 
+ +**Why deferred, not rejected:** +- At 5,000+ file scale, tree-sitter time scales linearly while Python fix savings are largely constant +- The structural overhead per node visit (20x to 50x) is real but only matters when visit count is high enough +- Rust extension would also unlock GIL-free thread parallelism for file processing + +**Cost if pursued:** ~110KB of Python code to port, 8+ language parsers, maturin build system, Rust toolchain in CI/Docker, platform-specific wheels, ongoing Rust maintenance + +### File Processing Parallelism + +**Status:** DEFERRED (pursue after Tier 1 fixes) + +**Rationale:** Tree-sitter releases the GIL during parsing, enabling ThreadPoolExecutor parallelism. However, shared mutable state (`FunctionRegistryTrie`, `import_mapping`) requires architectural restructuring. The three-pass architecture (structure, definitions, calls) has inherent sequential dependencies. + +**Projected gain:** 1.5x to 3x after Tier 1 fixes +**Prerequisite:** Tier 1 fixes must be applied first to establish the new performance baseline + +--- + +## Rejected Candidates + +### neo4j-rust-ext + +**Verdict:** REJECTED (inapplicable) +**Reason:** This codebase uses Memgraph via `pymgclient` (C extension), not the Neo4j Python driver. `neo4j-rust-ext` patches the `neo4j` driver which is not a dependency. The language researcher's recommendation was based on an incorrect assumption about the database driver. + +### BLAKE3 Hashing + +**Verdict:** REJECTED (invalidated by benchmarks) + +**Benchmark Results:** + +| Operation | SHA256 (ms) | BLAKE3 (ms) | Speedup | +|---|---|---|---| +| 500 snippet hashes | 0.155 | 0.325 | 0.5x (slower) | +| 2,000 snippet hashes | 0.594 | 1.177 | 0.5x (slower) | +| 50 file hashes (5KB avg) | 0.968 | 1.031 | 0.9x (slower) | + +**Reason:** The language recommendations projected 4x to 10x speedup based on algorithmic benchmarks, not Python binding benchmarks. hashlib SHA256 is already C-backed (OpenSSL). 
BLAKE3's SIMD advantages require large contiguous buffers; code snippets average 200 bytes. FFI overhead per call exceeds algorithmic savings for small inputs. Additionally, hashing is <0.1% of total runtime. + +### Rust FunctionRegistryTrie (Standalone) + +**Verdict:** REJECTED +**Reason:** Standalone Rust trie provides only 1.5x to 3x net gain after FFI overhead. The FFI boundary is crossed per-lookup (thousands of times per file), cutting gains roughly in half. More critically, the Python suffix index fix (Candidate 1) provides 261x to 382x speedup on the actual bottleneck, making the Rust trie unnecessary. Only viable if bundled with a full Rust AST extension. + +### Rust String Processing in Call Resolution (Standalone) + +**Verdict:** REJECTED +**Reason:** Negative net gains when implemented standalone. Call resolution is deeply interleaved with trie lookups, import map lookups, and AST node access. Extracting just the string processing would require marshalling all context (import maps, trie state, class inheritance) across FFI on every call, which exceeds the per-operation savings. + +--- + +## Optimize-First Recommendations (Non-Rewrite) + +These Python-level improvements should be implemented before any language rewrite consideration: + +1. **Use `embed_code_batch`** in `graph_updater.py:_generate_semantic_embeddings`: The batch function exists but the pipeline calls `embed_code` per item. Projected 5x to 20x speedup on the embedding phase. + +2. **Incremental call re-resolution** in `realtime_updater.py`: Currently performs full call re-resolution on every file change. Implementing incremental resolution (re-resolve only affected qualified names) would provide 10x to 100x speedup for realtime updates. + +3. **Fix BoundedASTCache memory limit**: `sys.getsizeof()` misses C-level tree-sitter memory, so the cache size limit is effectively broken. Use `tracemalloc` or a conservative estimate based on entry count instead. + +4. 
**EmbeddingCache data format**: Replace `list[float]` with numpy arrays for 4x memory reduction on embedding storage. + +5. **FunctionRegistryTrie dual storage**: Consolidate `_entries` dict and trie nodes to eliminate 2.5 MiB waste per 10K entries (addressable as part of Candidate 1). + +--- + +## Benchmark Methodology + +**Infrastructure:** Established by test-sentinel (task #1). All benchmarks in `benchmarks/` directory. + +| Parameter | Value | +|-----------|-------| +| Warmup runs | 3 (discarded) | +| Measured iterations | 20 to 100 per benchmark | +| Statistics | Median, mean, stddev, min, max, p95 | +| GC | Disabled during timing | +| Isolation | Fresh function scope per run | + +**Benchmark suite:** + +| File | Target | +|------|--------| +| `bench_find_ending_with_fix.py` | Suffix index vs linear scan | +| `bench_pathlib_vs_string.py` | pathlib vs string path operations | +| `bench_json_serialization.py` | stdlib json vs orjson | +| `bench_file_hashing.py` | SHA256 vs BLAKE3 vs BLAKE2b | +| `bench_trie.py` | FunctionRegistryTrie operations | +| `bench_string_ops.py` | String operation microbenchmarks | +| `bench_embedding_cache.py` | EmbeddingCache operations | +| `bench_ast_cache.py` | BoundedASTCache operations | +| `bench_graph_loader.py` | GraphLoader JSON parse + index build | +| `bench_dropin_replacements.py` | Drop-in library comparisons | + +Run all benchmarks: `uv run python benchmarks/run_all.py` + +--- + +## Profiling Data Sources + +| Phase | Task | Owner | Output | +|-------|------|-------|--------| +| Baseline | #1 | test-sentinel | Green test suite, benchmark methodology | +| CPU profiling | #2 | cpu-profiler | Hotspot report (cProfile, 31.2s, 179M calls) | +| Memory profiling | #3 | memory-profiler | Allocation report (tracemalloc, 25-frame traces) | +| I/O profiling | #4 | cpu-profiler | I/O report | +| Concurrency analysis | #5 | concurrency-analyst | GIL analysis, parallelism opportunities, scaling factors | +| Structural analysis | #6 | 
static-pattern-analyst | 9 language-inherent ceilings with severity rankings | +| Language research | #7 | language-researcher | Target language recommendations (Rust via PyO3) | +| Integration feasibility | #8 | integration-architect | FFI overhead analysis, build system impact, net gain calculations | +| Benchmarks | #9 | benchmark-designer | Measured performance for all candidates | +| Scorecard | #10 | evaluator | Prioritized ranking with scores | +| Adversarial review | #11 | adversarial-reviewer | No rewrite justified at current scale | +| Security audit | #12 | security-auditor | All candidates approved, zero disputes | + +--- + +## Conclusion + +The performance analysis produced a clear, data-driven result: **optimize Python first, rewrite later (if ever).** + +The top 5 bottlenecks consuming 72.8% of runtime are all pure Python algorithmic issues (linear scan fallback, pathlib object overhead, uncached traversals, debug logging, duplicate traversals). Fixing them provides ~3.7x total speedup with zero integration overhead, zero build system changes, and zero maintenance burden. + +The Rust AST extension, while technically sound as a future optimization for large-scale workloads, targets only 3.1% of current CPU time and provides ~1.03x total improvement after Python fixes. It should be reconsidered only when the codebase routinely processes 5,000+ file repositories and the Python fixes have been applied. + +No language rewrite recommendation survived the adversarial review at current scale. diff --git a/docs/sdk/cypher-generator.md b/docs/sdk/cypher-generator.md new file mode 100644 index 000000000..b9ef63613 --- /dev/null +++ b/docs/sdk/cypher-generator.md @@ -0,0 +1,47 @@ +--- +description: "Generate Cypher queries from natural language using Code-Graph-RAG's CypherGenerator." +--- + +# Cypher Generator + +The `CypherGenerator` translates natural language questions into Cypher queries for the knowledge graph. 
+ +## Usage + +```python +import asyncio +from cgr import CypherGenerator + +async def main(): + gen = CypherGenerator() + cypher = await gen.generate("Find all classes that inherit from BaseModel") + print(cypher) + +asyncio.run(main()) +``` + +## Configuration + +The Cypher generator uses the configured Cypher provider. Set it via environment variables: + +```bash +CYPHER_PROVIDER=google +CYPHER_MODEL=gemini-2.5-flash +CYPHER_API_KEY=your-api-key +``` + +Or programmatically: + +```python +from cgr import settings + +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` + +## Supported Providers + +| Provider | Example Models | +|----------|---------------| +| Google | `gemini-2.5-pro`, `gemini-2.5-flash` | +| OpenAI | `gpt-4o`, `gpt-4o-mini` | +| Ollama | `codellama`, `llama3.2` | diff --git a/docs/sdk/graph-loader.md b/docs/sdk/graph-loader.md new file mode 100644 index 000000000..f14df3a90 --- /dev/null +++ b/docs/sdk/graph-loader.md @@ -0,0 +1,73 @@ +--- +description: "Load and query exported Code-Graph-RAG knowledge graphs with the Python SDK." +--- + +# Graph Loader + +The `load_graph` function loads exported JSON graph data for programmatic analysis. 
+ +## Export a Graph + +First, export the knowledge graph to JSON: + +```bash +cgr export -o my_graph.json +``` + +Or export during graph update: + +```bash +cgr start --repo-path /path/to/repo --update-graph --clean -o my_graph.json +``` + +## Load and Query + +```python +from cgr import load_graph + +graph = load_graph("my_graph.json") +``` + +### Summary Statistics + +```python +summary = graph.summary() +print(f"Total nodes: {summary['total_nodes']}") +print(f"Total relationships: {summary['total_relationships']}") +``` + +### Find Nodes by Label + +```python +functions = graph.find_nodes_by_label("Function") +classes = graph.find_nodes_by_label("Class") +modules = graph.find_nodes_by_label("Module") +``` + +### Analyze Relationships + +```python +for func in functions[:5]: + relationships = graph.get_relationships_for_node(func.node_id) + print(f"Function {func.properties['name']} has {len(relationships)} relationships") +``` + +## Query Memgraph Directly + +For live queries against a running Memgraph instance: + +```python +from cgr import MemgraphIngestor + +with MemgraphIngestor(host="localhost", port=7687) as db: + rows = db.fetch_all("MATCH (f:Function) RETURN f.name LIMIT 10") + for row in rows: + print(row) +``` + +## Use Cases + +- Integration with other tools +- Custom analysis scripts +- Building documentation generators +- Creating code metrics dashboards diff --git a/docs/sdk/overview.md b/docs/sdk/overview.md new file mode 100644 index 000000000..8a4a88918 --- /dev/null +++ b/docs/sdk/overview.md @@ -0,0 +1,58 @@ +--- +description: "Python SDK overview for Code-Graph-RAG programmatic access." +--- + +# Python SDK Overview + +The `cgr` package provides short imports for programmatic use of Code-Graph-RAG. 
+ +## Installation + +```bash +pip install code-graph-rag +``` + +With semantic code search: + +```bash +pip install 'code-graph-rag[semantic]' +``` + +## Quick Example + +```python +from cgr import load_graph + +graph = load_graph("graph.json") +print(graph.summary()) + +functions = graph.find_nodes_by_label("Function") +for fn in functions[:5]: + rels = graph.get_relationships_for_node(fn.node_id) + print(f"{fn.properties['name']}: {len(rels)} relationships") +``` + +## Available Modules + +| Import | Purpose | +|--------|---------| +| `from cgr import load_graph` | Load and query exported graph data | +| `from cgr import MemgraphIngestor` | Query Memgraph with Cypher directly | +| `from cgr import CypherGenerator` | Generate Cypher from natural language | +| `from cgr import embed_code` | Semantic code search with UniXcoder | +| `from cgr import settings` | Configure providers programmatically | + +## Configuration + +```python +from cgr import settings + +settings.set_orchestrator("openai", "gpt-4o", api_key="sk-...") +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` + +See individual pages for detailed API usage: + +- [Graph Loader](graph-loader.md) +- [Cypher Generator](cypher-generator.md) +- [Semantic Search](semantic-search.md) diff --git a/docs/sdk/semantic-search.md b/docs/sdk/semantic-search.md new file mode 100644 index 000000000..ac4393b32 --- /dev/null +++ b/docs/sdk/semantic-search.md @@ -0,0 +1,40 @@ +--- +description: "Semantic code search with UniXcoder embeddings in Code-Graph-RAG." +--- + +# Semantic Search + +Code-Graph-RAG supports intent-based code search using UniXcoder embeddings. Find functions by describing what they do rather than by exact names. 
+ +## Installation + +Semantic search requires the `semantic` extra: + +```bash +pip install 'code-graph-rag[semantic]' +``` + +## Usage + +### Generate Code Embeddings + +```python +from cgr import embed_code + +embedding = embed_code("def authenticate(user, password): ...") +print(f"Embedding dimension: {len(embedding)}") +``` + +### Search by Description + +In the interactive CLI, you can search semantically: + +- "error handling functions" +- "authentication code" +- "database connection setup" + +The system returns potential matches with similarity scores. + +## How It Works + +UniXcoder is a unified cross-modal pre-trained model that supports both code understanding and generation. Code-Graph-RAG uses it to create embeddings that capture the semantic meaning of code, enabling searches based on what code does rather than what it's named. diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 000000000..e9e4cc5f4 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,337 @@ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap'); + +:root { + --cgr-bg: #030712; + --cgr-surface: #111827; + --cgr-surface-lighter: #1f2937; + --cgr-brand: #6366f1; + --cgr-brand-light: #818cf8; + --cgr-brand-dark: #4f46e5; + --cgr-gray-50: #f9fafb; + --cgr-gray-400: #99a1af; + --cgr-gray-500: #6a7282; + --cgr-gray-800: #1e2939; + --cgr-indigo-700: #432dd7; +} + +/* Dark mode */ +[data-md-color-scheme="slate"] { + --md-default-bg-color: var(--cgr-bg); + --md-default-fg-color: var(--cgr-gray-50); + --md-default-fg-color--light: var(--cgr-gray-400); + --md-default-fg-color--lighter: var(--cgr-gray-500); + --md-default-fg-color--lightest: var(--cgr-gray-800); + --md-primary-fg-color: var(--cgr-brand); + --md-primary-fg-color--light: var(--cgr-brand-light); + --md-primary-fg-color--dark: var(--cgr-brand-dark); + --md-primary-bg-color: var(--cgr-gray-50); + 
--md-primary-bg-color--light: var(--cgr-gray-400); + --md-accent-fg-color: var(--cgr-brand-light); + --md-accent-fg-color--transparent: rgba(129, 140, 248, 0.1); + --md-accent-bg-color: var(--cgr-brand); + --md-code-bg-color: var(--cgr-surface); + --md-code-fg-color: #e2e8f0; + --md-code-hl-color: var(--cgr-surface-lighter); + --md-code-hl-number-color: #fbbf24; + --md-code-hl-string-color: #34d399; + --md-code-hl-keyword-color: #c084fc; + --md-code-hl-function-color: #60a5fa; + --md-code-hl-comment-color: var(--cgr-gray-500); + --md-code-hl-constant-color: #f472b6; + --md-code-hl-operator-color: #fbbf24; + --md-code-hl-punctuation-color: var(--cgr-gray-400); + --md-code-hl-special-color: #fb923c; + --md-code-hl-name-color: var(--cgr-gray-50); + --md-code-hl-generic-color: var(--cgr-gray-50); + --md-code-hl-variable-color: #f9fafb; + --md-footer-bg-color: var(--cgr-bg); + --md-footer-bg-color--dark: var(--cgr-bg); + --md-footer-fg-color: var(--cgr-gray-400); + --md-footer-fg-color--light: var(--cgr-gray-500); + --md-footer-fg-color--lighter: var(--cgr-gray-500); + --md-typeset-a-color: var(--cgr-brand-light); + --md-typeset-color: var(--cgr-gray-50); + --md-typeset-table-color: rgba(99, 102, 241, 0.05); + --md-typeset-table-color--light: rgba(99, 102, 241, 0.02); + --md-admonition-bg-color: var(--cgr-surface); + --md-shadow-z1: 0 0 0 transparent; + --md-shadow-z2: 0 0 0 transparent; + --md-shadow-z3: 0 0 0 transparent; +} + +[data-md-color-scheme="slate"] .md-header, +[data-md-color-scheme="slate"] .md-tabs { + background-color: var(--cgr-surface); + border-bottom: 1px solid var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-tabs__link { + color: var(--cgr-gray-400); + opacity: 1; + transition: color 0.2s ease; +} + +[data-md-color-scheme="slate"] .md-tabs__link:hover { + color: var(--cgr-gray-50); +} + +[data-md-color-scheme="slate"] .md-tabs__link--active { + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="slate"] .md-nav--primary 
.md-nav__item--active > .md-nav__link { + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="slate"] .md-sidebar { + background-color: var(--cgr-bg); +} + +[data-md-color-scheme="slate"] .md-nav__link { + color: var(--cgr-gray-400); + transition: color 0.2s ease; +} + +[data-md-color-scheme="slate"] .md-nav__link:hover { + color: var(--cgr-gray-50); +} + +[data-md-color-scheme="slate"] .md-nav__link--active { + color: var(--cgr-brand-light); + font-weight: 500; +} + +[data-md-color-scheme="slate"] .md-search__form { + background-color: var(--cgr-surface); + border: 1px solid var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-search__input::placeholder { + color: var(--cgr-gray-500); +} + +[data-md-color-scheme="slate"] .md-typeset code { + background-color: var(--cgr-surface); + border: 1px solid var(--cgr-gray-800); + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="slate"] .md-typeset .admonition, +[data-md-color-scheme="slate"] .md-typeset details { + background-color: var(--cgr-surface); + border-color: var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset .md-typeset__table table { + border: 1px solid var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset .md-typeset__table th { + background-color: var(--cgr-surface); + border-color: var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset .md-typeset__table td { + border-color: var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset hr { + border-color: var(--cgr-gray-800); +} + +/* Light mode */ +[data-md-color-scheme="default"] { + --md-primary-fg-color: var(--cgr-brand-dark); + --md-primary-fg-color--light: var(--cgr-brand); + --md-primary-fg-color--dark: var(--cgr-indigo-700); + --md-primary-bg-color: #ffffff; + --md-accent-fg-color: var(--cgr-brand); + --md-accent-fg-color--transparent: rgba(99, 102, 241, 0.1); + --md-typeset-a-color: var(--cgr-brand-dark); + --md-code-bg-color: #f8f9fc; + --md-code-fg-color: 
#1e293b; + --md-code-hl-color: rgba(99, 102, 241, 0.08); + --md-code-hl-number-color: #b45309; + --md-code-hl-string-color: #059669; + --md-code-hl-keyword-color: #7c3aed; + --md-code-hl-function-color: #2563eb; + --md-code-hl-comment-color: #9ca3af; + --md-shadow-z1: 0 0 0 transparent; + --md-shadow-z2: 0 1px 3px rgba(0, 0, 0, 0.08); +} + +[data-md-color-scheme="default"] .md-header { + background-color: #ffffff; + border-bottom: 1px solid #e5e7eb; + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-header .md-header__title { + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-header .md-header__topic { + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-header .md-header__button { + color: #475569; +} + +[data-md-color-scheme="default"] .md-tabs { + background-color: #ffffff; + border-bottom: 1px solid #e5e7eb; +} + +[data-md-color-scheme="default"] .md-tabs__link { + color: #64748b; + opacity: 1; +} + +[data-md-color-scheme="default"] .md-tabs__link:hover { + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-tabs__link--active { + color: var(--cgr-brand-dark); +} + +[data-md-color-scheme="default"] .md-typeset code { + background-color: #f1f5f9; + border: 1px solid #e2e8f0; + color: var(--cgr-brand-dark); +} + +[data-md-color-scheme="default"] .md-search__form { + background-color: #f1f5f9; + border: 1px solid #e2e8f0; +} + +/* Shared styles */ +.md-typeset { + font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + font-size: 0.82rem; + line-height: 1.7; +} + +.md-typeset code, +.md-typeset pre, +.md-typeset kbd { + font-family: "JetBrains Mono", "SF Mono", "Cascadia Code", "Fira Code", monospace; + font-size: 0.82em; +} + +.md-typeset h1 { + font-weight: 700; + letter-spacing: -0.02em; +} + +.md-typeset h2 { + font-weight: 600; + letter-spacing: -0.01em; +} + +.md-typeset h3, +.md-typeset h4 { + font-weight: 600; +} + +.md-typeset a { + transition: color 0.2s ease; +} + 
+[data-md-color-scheme="slate"] .md-typeset a:hover { + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="default"] .md-typeset a:hover { + color: var(--cgr-indigo-700); +} + +.md-header__title, +.md-tabs__link, +.md-nav__link, +.md-button, +.md-typeset .admonition-title, +.md-typeset summary, +.md-footer, +.md-typeset table:not([class]) th { + font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; +} + +.md-header__title { + font-weight: 600; +} + +.md-tabs__link { + font-weight: 500; + font-size: 0.78rem; + letter-spacing: 0.01em; +} + +.md-nav__link { + font-size: 0.76rem; +} + +.md-button { + font-weight: 500; + border-radius: 8px; + padding: 0.6em 1.4em; + transition: background-color 0.2s ease, border-color 0.2s ease, transform 0.2s ease; +} + +.md-button--primary { + background-color: var(--cgr-brand); + border-color: var(--cgr-brand); + color: #ffffff; +} + +.md-button--primary:hover { + background-color: var(--cgr-brand-dark); + border-color: var(--cgr-brand-dark); + color: #ffffff; +} + +.md-typeset .md-button:hover { + transform: translateY(-1px); +} + +.md-content { + max-width: 52rem; +} + +.md-typeset pre > code { + border-radius: 8px; +} + +.md-typeset .admonition, +.md-typeset details { + border-radius: 8px; + border-width: 1px; + border-left-width: 4px; +} + +.md-typeset .admonition-title, +.md-typeset summary { + font-weight: 600; +} + +.md-search__form { + border-radius: 8px; +} + +.md-typeset table:not([class]) { + font-size: 0.8rem; + border-radius: 8px; + overflow: hidden; +} + +.md-typeset table:not([class]) th { + font-weight: 600; +} + +@media screen and (min-width: 76.25em) { + .md-sidebar--primary { + width: 13rem; + } +} diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 000000000..93b5000fa --- /dev/null +++ b/evals/README.md @@ -0,0 +1,222 @@ +# cgr evaluation harness + +Scores the knowledge graph that `code-graph-rag` (cgr) builds against ground truth, with no Memgraph 
required (an in-memory capturing ingestor drives `GraphUpdater(...).run(force=True)`). + +## L1 — structure (containment) + +Scores cgr's definition nodes and `DEFINES`/`DEFINES_METHOD` edges against a scope-aware Python `ast` oracle. + +```bash +uv run python -m evals.cli --target codebase_rag +``` + +Writes `evals/results/scores.csv` and `evals/results/diff.json`. Node identity join is `(kind, file, start_line)`. + +## L2 — module-call attribution (ast oracle) + +Scores whether cgr attributes the right calls to the *module* (caller side). A +call runs at module-load time -- and so belongs to the module -- iff it is a +top-level statement, a decorator, or a default-argument expression, i.e. it is +NOT inside a function body. The L3 execution trace cannot measure this: it +records the innermost *function* frame as the caller and drops `<module>` +frames, so module-level attribution is its structural blind spot. An `ast` +oracle fills it. + +```bash +uv run python -m evals.module_calls --target codebase_rag +``` + +How it works: + +- **Oracle** (`module_calls.oracle_module_calls`): walks each file's AST modelling + import-time execution. A call counts when it runs at module load: top-level + statements, list/set/dict comprehensions (eager), decorators, argument + defaults, and -- only when the file does not `from __future__ import + annotations` -- argument/return annotations. It does NOT count function/method + bodies, lambda bodies, or generator expressions (deferred until called or + consumed). Class bodies stay at module scope. It collects the simple name of + every such call whose callee is first-party (a name defined in the target), + excluding dunders. +- **cgr side** (`module_calls.cgr_module_calls`): every `CALLS` edge whose caller + is a `Module` node, keyed by `(module_file, callee_simple_name)`; a constructor + call resolved to a `Class.__init__` *method* is credited to `Class` (a bare + first-party function named `__init__` is left as a filtered dunder). 
+- **Score**: precision/recall over `(module_file, callee_simple_name)` edges. + +The exact-attribution guarantee is covered by `test_eval_module_calls.py` +(precision == recall == 1.0 on a controlled fixture: a top-level call, a +default-argument call, a `__main__` call, and a nested call that must NOT be +module-attributed). + +On the whole `codebase_rag` target the metric is a lower bound that surfaces two +real, separate cgr gaps (not attribution errors): + +- **Recall** is bounded by constructor calls to first-party classes with no + explicit `__init__` (NamedTuple/dataclass/pydantic) -- cgr has no method node + to point the call at, so no edge is emitted. Closing this needs constructor + calls to target the class node (tracked with the dead-code Class work). +- **Precision** is bounded by the trie suffix-match fallback occasionally + resolving a module-level call to an unrelated first-party name. + +## L3 — CALLS recall (execution-traced) + +Measures whether cgr's static `CALLS` graph contains the call edges that actually fire at runtime. + +```bash +uv run python -m evals.l3 +``` + +How it works: + +- **Static side** (`cgr_graph.extract_cgr_calls`): builds cgr's graph over the target package (default `codebase_rag`) and collects every `CALLS` edge. +- **Traced side** (`calls_trace.trace_calls`): runs cgr indexing a small fixture (`evals/results/l3_workspace/fixture/`, written by `_write_fixture`) under `sys.settrace`, recording every `(caller, callee)` where both are first-party functions in the target. This is a dynamic trace of *cgr's own code* executing — the fixture's only job is to drive cgr through diverse code paths. +- **Recall** = `|traced ∩ static| / |traced|`. `missed = traced − static` is written to `evals/results/calls_diff.json`. Two scopes are reported: *all calls* and *explicit* (excluding dunder callees). 
+ +Because the ground truth is an execution trace, recall is a sound lower bound: it can only credit cgr for call sites the fixture actually exercises. Enriching the fixture (more Python constructs, more languages) widens coverage and is the intended way to harden the metric. + +### Decorator-wrapper normalization + +When a function is wrapped by a `functools.wraps` decorator (e.g. cgr's `@recursion_guard`), calling it dispatches at runtime through the decorator's generic inner `wrapper`, so a naive trace records two edges: + +``` +caller -> recursion_guard.decorator.wrapper # the generic wrapper frame +recursion_guard.decorator.wrapper -> the_real_method # wrapper calling func(...) +``` + +cgr's static graph instead "sees through" the decorator and records the single logical edge `caller -> the_real_method`, which is what a reader of the graph wants — the recycled `wrapper` is plumbing, not a meaningful call-graph node. + +To keep the trace and the static graph in agreement, `calls_trace._frame_qn` attributes a `wrapper` frame to the function it wraps (recovered from the wrapper's closed-over callable, following any `__wrapped__` chain). This turns `caller -> wrapper` into `caller -> the_real_method` and collapses `wrapper -> the_real_method` into a self-edge (which the tracer already drops). The decision is **normalize in the eval**, not model wrappers in cgr, so cgr's graph stays free of generic wrapper nodes. + +Covered by `codebase_rag/tests/test_l3_decorator_normalization.py`. + +## L1 (Go) — structure against a native `go/ast` oracle + +The Python L1 above grades cgr against a Python `ast` oracle. To grade other languages with *independent* ground truth, each language is checked against its own standard-library parser rather than against cgr's own tree-sitter output. The first such oracle is Go. 
+ +```bash +uv run python -m evals.go_l1 --target /path/to/go/repo --project-name myrepo +``` + +How it works: + +- **Oracle** (`evals/oracles/go_ast.go`): a small Go program that walks the target with the standard library's `go/parser` + `go/ast` and emits one JSON record per declaration (function-local type declarations included, via `ast.Inspect`, since cgr captures those too). The `kind` field already uses cgr's `NodeLabel` vocabulary (`Function`, `Method`, `Class`, `Interface`, `Type`), so records join cgr's nodes directly on `(kind, file, start_line)`. Mapping: `func` → `Function`, `func` with a receiver → `Method`, `type … struct` → `Class`, `type … interface` → `Interface`, any other `type …` (defined types and aliases) → `Type`. Requires the `go` toolchain on `PATH`; `evals.go_l1` exits cleanly if it is missing. +- **cgr side** (`cgr_graph.extract_cgr_go_nodes`): builds cgr's graph over the target and keeps the Go (`.go`) definition nodes. +- **Fair file set**: `run_go_oracle` drops oracle records under any directory in cgr's `IGNORE_PATTERNS` (e.g. `bin`, `vendor`, `build`), so the oracle grades exactly the files cgr indexes — single source of truth, no drift. +- **Score**: per-kind precision/recall/F1 via `score.score_node_kinds`, written to `evals/results/go_scores.csv` and `evals/results/go_diff.json`. 
+ +Validated on `apache/thrift` (1604 cgr Go nodes vs 1604 oracle nodes — exact): + +| label | tp | fp | fn | precision | recall | +|---|---|---|---|---|---| +| Function | 535 | 0 | 0 | 1.0000 | 1.0000 | +| Method | 907 | 0 | 0 | 1.0000 | 1.0000 | +| Class | 106 | 0 | 0 | 1.0000 | 1.0000 | +| Interface | 30 | 0 | 0 | 1.0000 | 1.0000 | +| Type | 26 | 0 | 0 | 1.0000 | 1.0000 | + +Both gaps the oracle originally exposed are fixed: Go `type` declarations (struct/interface/defined-type) are captured (see `codebase_rag/tests/test_go_type_declarations.py`), and Go receiver methods are now `Method` nodes qualified by their receiver type with a `DEFINES_METHOD` edge from it (see `codebase_rag/tests/test_go_receiver_methods.py`), rather than being mislabelled `Function`. + +## L1 (Rust) — structure against a native `syn` oracle + +The second native oracle is Rust, checked against `syn` (the de-facto standard Rust parser). + +```bash +uv run python -m evals.rust_l1 --target /path/to/rust/repo --project-name myrepo +``` + +- **Oracle** (`evals/oracles/rs_oracle/`): a small Rust program that parses every `.rs` file with `syn` and emits one JSON record per declaration, in cgr's `NodeLabel` vocabulary. A `syn::visit::Visit` walk recurses into function bodies (function-local defs), `impl`/`trait` associated types, and closures (which cgr models as anonymous `Function` nodes), so the comparison is apples-to-apples. Mapping: `struct` → `Class`, `enum` → `Enum`, `union` → `Union`, `trait` → `Interface` (+ its methods → `Method`), `type` (incl. associated types) → `Type`, `fn`/closure → `Function`, `impl` method → `Method`. Requires the `cargo` toolchain (`proc-macro2`'s `span-locations` feature gives real line numbers); `evals.rust_l1` exits cleanly if it is missing. +- **cgr side** (`cgr_graph.extract_cgr_rust_nodes`), **score** (`score.score_node_kinds`), output to `rs_scores.csv` / `rs_diff.json`. 
+ +Validated on `apache/thrift`'s `lib/rs` (758 cgr Rust nodes vs 758 oracle nodes — exact, all kinds 1.0). The oracle surfaced one cgr gap, now fixed: methods in an `impl Trait for <primitive>` block (e.g. `impl From<T> for u8`) were dropped because the `primitive_type` impl target was unhandled (see `codebase_rag/tests/test_rust_impl_primitive_target.py`). + +## L1 (TypeScript) — structure against the TypeScript compiler API + +The third native oracle is TypeScript, checked against the TypeScript compiler API. + +```bash +uv run python -m evals.ts_l1 --target /path/to/ts/repo --project-name myrepo +``` + +- **Oracle** (`evals/oracles/ts_oracle/`): a Node script that parses every `.ts`/`.tsx` file (`.d.ts` excluded) with the TypeScript compiler API and emits one JSON record per declaration, in cgr's `NodeLabel` vocabulary. Mapping, matching how cgr models TypeScript: `class` → `Class`, `interface` → `Interface`, `enum` → `Enum`, `type` → `Type`, `namespace`/`module` → `Class` (a class-like container), `function` → `Function` (or `Method` inside a namespace/class), arrow functions and function expressions → `Function` (cgr captures every one, like a Rust closure), `method`/`constructor` → `Method`. Requires `node`/`npm` (the `typescript` dependency is installed on first run; `package-lock.json` is committed and `node_modules/` is gitignored). `evals.ts_l1` exits cleanly if node is missing. +- **cgr side** (`cgr_graph.extract_cgr_ts_nodes`), **score** (`score.score_node_kinds`), output to `ts_scores.csv` / `ts_diff.json`. + +Validated on `apache/thrift`'s TypeScript (`lib/nodets`, `lib/ts`): 136 cgr nodes vs 136 oracle nodes — exact, all kinds 1.0. No cgr gap found. + +## L1 (JavaScript) — structure against the TypeScript compiler API + +The same compiler-API oracle parses JavaScript too (the TypeScript compiler accepts JS), so JavaScript reuses `evals/oracles/ts_oracle/` over `.js`/`.jsx`. 
+ +```bash +uv run python -m evals.js_l1 --target /path/to/js/repo --project-name myrepo +``` + +Same mapping as TypeScript, with two JS-specific points matching cgr: object-literal shorthand methods are modelled as standalone `Function`s (not `Method`s), and every arrow function / function expression is a `Function`. Output to `js_scores.csv` / `js_diff.json`. + +Validated on `apache/thrift`'s JavaScript (`lib/js`, `lib/nodejs`): 1087 cgr nodes vs 1087 oracle nodes — exact, all kinds 1.0. No cgr gap found. + +## L1 (Java) — structure against the JDK Compiler Tree API + +The sixth native oracle is Java, checked against the JDK's own parser (`com.sun.source` / `javax.tools`). + +```bash +uv run python -m evals.java_l1 --target /path/to/java/repo --project-name myrepo +``` + +- **Oracle** (`evals/oracles/java_oracle/Oracle.java`): parses every `.java` file with the JDK Compiler Tree API (`task.parse()` only parses, so missing dependencies are fine) and emits one JSON record per declaration. Mapping, matching how cgr models Java: `class` → `Class`, `interface` → `Interface` (+ its method signatures → `Method`), annotation type (`@interface`) → `Class`, `enum` → `Enum`, method/constructor → `Method`. A method declared inside an **anonymous class** (e.g. `new Runnable() { public void run() {...} }`) is modelled as a standalone `Function` — the same way cgr treats it (and JS object-literal methods); the oracle replicates cgr's rule (a member is a `Method` only when its nearest enclosing named class precedes any enclosing method/lambda body). Requires `javac`/`java`; the oracle is compiled on first run (the `.class` is gitignored, the source committed). `evals.java_l1` exits cleanly if the JDK is missing. +- **cgr side** (`cgr_graph.extract_cgr_java_nodes`), **score** (`score.score_node_kinds`), output to `java_scores.csv` / `java_diff.json`. 
+ +Validated on `apache/thrift`'s `lib/java`: 2861 cgr nodes vs 2861 oracle nodes — exact, all kinds 1.0 (including the 103 anonymous-class methods graded as `Function`). No cgr gap found. + +## L1 (Lua) — structure against a `luaparse` oracle + +The seventh native oracle is Lua, checked against `luaparse`. + +```bash +uv run python -m evals.lua_l1 --target /path/to/lua/repo --project-name myrepo +``` + +- **Oracle** (`evals/oracles/lua_oracle/`): a Node script that parses every `.lua` file with `luaparse` (`luaVersion: "5.3"`, so bitwise operators / integer division parse) and emits a `Function` record per function declaration/expression. Lua has no classes, so cgr models every function — global, `local`, table (`t.f`), method (`t:m`), and anonymous function expressions — as a `Function`. Requires `node`/`npm` (the `luaparse` dependency installs on first run; `package-lock.json` committed, `node_modules/` gitignored). +- **cgr side** (`cgr_graph.extract_cgr_lua_nodes`), **score** (`score.score_node_kinds`), output to `lua_scores.csv` / `lua_diff.json`. + +Validated on `apache/thrift`'s Lua (`lib/lua`, `test/lua`): 376 cgr nodes vs 376 oracle nodes — exact, 1.0. No cgr gap found. + +## L1 (PHP) — structure against a `php-parser` oracle + +The eighth native oracle is PHP, checked against `php-parser` (a pure-JS PHP parser, so no `php` binary is needed). + +```bash +uv run python -m evals.php_l1 --target /path/to/php/repo --project-name myrepo +``` + +- **Oracle** (`evals/oracles/php_oracle/`): a Node script that parses every `.php` file with `php-parser` and emits one record per declaration. Mapping, matching cgr: `class` → `Class`, `interface` → `Interface` (+ methods → `Method`), `trait` → `Class` (+ methods → `Method`), `enum` → `Enum`, `function` → `Function`, closure / arrow `fn` → `Function`. 
Methods of an **anonymous class** (`new class {...}`) are `Function`s (like Java/JS object-literal members), and a declaration's line is its first attribute (`#[Attr]`) line when present — both matching cgr's node span. Requires `node`/`npm` (the `php-parser` dependency installs on first run; `package-lock.json` committed, `node_modules/` gitignored). +- **cgr side** (`cgr_graph.extract_cgr_php_nodes`), **score** (`score.score_node_kinds`), output to `php_scores.csv` / `php_diff.json`. + +Validated on `apache/thrift`'s PHP (`lib/php`): 1295 cgr nodes vs 1295 oracle nodes — exact, all kinds 1.0. No cgr gap found. + +## Latest results (target: `codebase_rag`) + +Committed snapshots live in `evals/results/` — `scores.csv` (L1), `diff.json` (L1 per-label missing/extra), `calls_diff.json` (L3 missed edges). Regenerate with the commands above. + +### L1 — structure (`uv run python -m evals.cli`) + +| category | label | tp | fp | fn | precision | recall | f1 | +|---|---|---|---|---|---|---|---| +| node | Module | 417 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| node | Class | 926 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| node | Function | 1955 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| node | Method | 3919 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| edge | DEFINES | 2742 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| edge | DEFINES_METHOD | 3919 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| edge | INHERITS | 153 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | +| edge | IMPORTS | 1274 | 0 | 0 | 1.0000 | 1.0000 | 1.0000 | + +Span (end_line) accuracy on matched defs: 6800/6800 exact. + +### L3 — CALLS recall (`uv run python -m evals.l3`) + +| scope | traced | captured | missed | recall | +|---|---|---|---|---| +| all calls | 634 | 634 | 0 | 1.0000 | +| explicit (no dunders) | 580 | 580 | 0 | 1.0000 | + +The L3 fixture exercises rich Python plus all 11 supported languages; recall is a sound lower bound over the cgr code paths that fixture drives. 
These numbers are for the Python `codebase_rag` target — graded multi-language recall (JS/Rust/Go/Java/C/C++/Lua/PHP/Scala) is future work pending a SCIP-based oracle. diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 000000000..2a41b33c2 --- /dev/null +++ b/evals/__init__.py @@ -0,0 +1,5 @@ +from .ast_oracle import extract_oracle_graph +from .cgr_graph import extract_cgr_graph +from .score import score + +__all__ = ["extract_cgr_graph", "extract_oracle_graph", "score"] diff --git a/evals/ast_oracle.py b/evals/ast_oracle.py new file mode 100644 index 000000000..aac365052 --- /dev/null +++ b/evals/ast_oracle.py @@ -0,0 +1,168 @@ +import ast +from collections.abc import Iterator +from pathlib import Path + +from loguru import logger + +from codebase_rag import constants as cs + +from . import constants as ec +from . import logs as ls +from .types_defs import DefNode, EdgeKey, GraphData, NameEdge, NodeKey + +_MODULE = cs.NodeLabel.MODULE.value +_CLASS = cs.NodeLabel.CLASS.value +_FUNCTION = cs.NodeLabel.FUNCTION.value +_METHOD = cs.NodeLabel.METHOD.value +_DEFINES = cs.RelationshipType.DEFINES.value +_DEFINES_METHOD = cs.RelationshipType.DEFINES_METHOD.value +_INHERITS = cs.RelationshipType.INHERITS.value +_IMPORTS = cs.RelationshipType.IMPORTS.value + + +def extract_oracle_graph(target: Path, project_name: str) -> GraphData: + nodes: dict[NodeKey, DefNode] = {} + edges: set[EdgeKey] = set() + name_edges: set[NameEdge] = set() + + parsed: list[tuple[str, ast.Module]] = [] + module_index: dict[str, str] = {} + for path in _iter_py_files(target): + rel = path.relative_to(target).as_posix() + try: + tree = ast.parse(path.read_text(encoding="utf-8")) + except (SyntaxError, UnicodeDecodeError, ValueError) as error: + logger.warning(ls.ORACLE_PARSE_FAILED.format(path=rel, error=error)) + continue + parsed.append((rel, tree)) + module_index[_module_dotted(rel, project_name)] = rel + + for rel, tree in parsed: + module_key = NodeKey(_MODULE, rel, 
ec.MODULE_START_LINE) + nodes[module_key] = DefNode(module_key, Path(rel).stem, 0) + _walk_scope(tree.body, _MODULE, module_key, rel, nodes, edges, name_edges) + for target_file in _import_targets(tree, rel, module_index, project_name): + name_edges.add(NameEdge(_IMPORTS, module_key, target_file)) + + return GraphData(nodes=nodes, edges=edges, name_edges=name_edges) + + +def _module_dotted(rel: str, project_name: str) -> str: + parts = list(Path(rel).with_suffix("").parts) + if parts and parts[-1] == ec.INIT_STEM: + parts = parts[:-1] + return cs.SEPARATOR_DOT.join([project_name, *parts]) + + +def _from_base_parts(node: ast.ImportFrom, pkg_parts: list[str]) -> list[str] | None: + if node.level == 0: + return node.module.split(cs.SEPARATOR_DOT) if node.module else None + keep = len(pkg_parts) - (node.level - 1) + if keep < 0: + return None + parts = pkg_parts[:keep] + if node.module: + parts = parts + node.module.split(cs.SEPARATOR_DOT) + return parts + + +def _import_targets( + tree: ast.Module, rel: str, module_index: dict[str, str], project_name: str +) -> set[str]: + pkg_parts = [project_name, *Path(rel).parent.parts] + targets: set[str] = set() + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + if alias.name in module_index: + targets.add(module_index[alias.name]) + elif isinstance(node, ast.ImportFrom): + base_parts = _from_base_parts(node, pkg_parts) + if base_parts is None: + continue + base_dotted = cs.SEPARATOR_DOT.join(base_parts) + for alias in node.names: + if alias.name == "*": + if base_dotted in module_index: + targets.add(module_index[base_dotted]) + continue + sub = cs.SEPARATOR_DOT.join([*base_parts, alias.name]) + if sub in module_index: + targets.add(module_index[sub]) + elif base_dotted in module_index: + targets.add(module_index[base_dotted]) + return targets + + +def _base_name(expr: ast.expr) -> str | None: + if isinstance(expr, ast.Name): + return expr.id + if isinstance(expr, ast.Attribute): + 
return expr.attr + if isinstance(expr, ast.Subscript): + return _base_name(expr.value) + return None + + +def _iter_py_files(target: Path) -> Iterator[Path]: + for path in target.rglob(f"*{ec.PY_SUFFIX}"): + parts = path.relative_to(target).parts + if set(parts) & ec.IGNORE_DIRS: + continue + if any(part.endswith(ec.EGG_INFO_SUFFIX) for part in parts): + continue + yield path + + +def _end_line(node: ast.stmt) -> int: + end = node.end_lineno + return end if end is not None else node.lineno + + +def _child_stmts(node: ast.stmt) -> list[ast.stmt]: + out: list[ast.stmt] = [] + for _field, value in ast.iter_fields(node): + for item in value if isinstance(value, list) else [value]: + if isinstance(item, ast.stmt): + out.append(item) + elif isinstance(item, ast.ExceptHandler | ast.match_case): + # (H) except handlers and match cases are not ast.stmt but hold + # (H) statement bodies that may define functions/classes. + out.extend(s for s in item.body if isinstance(s, ast.stmt)) + return out + + +def _walk_scope( + stmts: list[ast.stmt], + scope_kind: str, + scope_key: NodeKey, + rel: str, + nodes: dict[NodeKey, DefNode], + edges: set[EdgeKey], + name_edges: set[NameEdge], +) -> None: + for node in stmts: + if isinstance(node, ast.ClassDef): + key = NodeKey(_CLASS, rel, node.lineno) + nodes[key] = DefNode(key, node.name, _end_line(node)) + if scope_kind == _MODULE: + edges.add(EdgeKey(_DEFINES, scope_key, key)) + for base in node.bases: + if base_name := _base_name(base): + name_edges.add(NameEdge(_INHERITS, key, base_name)) + _walk_scope(node.body, _CLASS, key, rel, nodes, edges, name_edges) + elif isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef): + if scope_kind == _CLASS: + key = NodeKey(_METHOD, rel, node.lineno) + nodes[key] = DefNode(key, node.name, _end_line(node)) + edges.add(EdgeKey(_DEFINES_METHOD, scope_key, key)) + else: + key = NodeKey(_FUNCTION, rel, node.lineno) + nodes[key] = DefNode(key, node.name, _end_line(node)) + if scope_kind == _MODULE: + 
edges.add(EdgeKey(_DEFINES, scope_key, key)) + _walk_scope(node.body, _FUNCTION, key, rel, nodes, edges, name_edges) + else: + _walk_scope( + _child_stmts(node), scope_kind, scope_key, rel, nodes, edges, name_edges + ) diff --git a/evals/calls_trace.py b/evals/calls_trace.py new file mode 100644 index 000000000..2d9483b07 --- /dev/null +++ b/evals/calls_trace.py @@ -0,0 +1,109 @@ +import inspect +import sys +from collections.abc import Callable +from pathlib import Path +from types import CodeType, FrameType + +from . import constants as ec + +_SYNTHETIC_QUALNAME_MARKERS = ( + "<module>", + "<lambda>", + "<listcomp>", + "<dictcomp>", + "<setcomp>", + "<genexpr>", +) +_LOCALS_SEGMENT = ".<locals>" + +# (H) functools.wraps decorator wrappers: the inner function is named "wrapper" and +# (H) closes over the wrapped callable under one of these free-variable names. cgr +# (H) resolves a call to a decorated function as a call to the function itself (it sees +# (H) through the decorator), so the trace must attribute the generic wrapper frame to +# (H) the function it wraps; otherwise calls would be credited to the recycled wrapper +# (H) node. See evals/README.md ("Decorator-wrapper normalization"). 
+_WRAPPER_CODE_NAME = "wrapper" +_WRAPPED_FREE_VARS = ("func", "fn", "wrapped", "method", "f") + + +def _code_qn(code: CodeType, target: Path, project_name: str) -> str | None: + try: + file = Path(code.co_filename).resolve() + except (OSError, ValueError): + return None + try: + rel = file.relative_to(target) + except ValueError: + return None + if not file.name.endswith(ec.PY_SUFFIX): + return None + + qualname = code.co_qualname + if any(marker in qualname for marker in _SYNTHETIC_QUALNAME_MARKERS): + return None + qualname = qualname.replace(_LOCALS_SEGMENT, "") + + parts = list(rel.with_suffix("").parts) + if parts and parts[-1] == ec.INIT_STEM: + parts = parts[:-1] + module_dotted = ec.SEP.join([project_name, *parts]) + return ec.SEP.join([module_dotted, qualname]) + + +def _wrapped_code(frame: FrameType) -> CodeType | None: + # (H) Recover the wrapped function's code from a @wraps wrapper frame via its + # (H) closed-over callable, following any __wrapped__ chain to the real function. 
+ code = frame.f_code + if code.co_name != _WRAPPER_CODE_NAME: + return None + for name in _WRAPPED_FREE_VARS: + if name not in code.co_freevars: + continue + candidate = frame.f_locals.get(name) + if not callable(candidate): + continue + unwrapped = inspect.unwrap(candidate) + wrapped_code = getattr(unwrapped, "__code__", None) or getattr( + getattr(unwrapped, "__func__", None), "__code__", None + ) + if isinstance(wrapped_code, CodeType): + return wrapped_code + return None + + +def _frame_qn(frame: FrameType, target: Path, project_name: str) -> str | None: + if (wrapped := _wrapped_code(frame)) is not None and ( + qn := _code_qn(wrapped, target, project_name) + ) is not None: + return qn + return _code_qn(frame.f_code, target, project_name) + + +def trace_calls( + workload: Callable[[], None], target: Path, project_name: str +) -> set[tuple[str, str]]: + target = target.resolve() + edges: set[tuple[str, str]] = set() + + def tracer(frame: FrameType, event: str, arg: object) -> None: + if event != ec.TRACE_CALL_EVENT: + return None + caller = frame.f_back + if caller is None: + return None + callee_qn = _frame_qn(frame, target, project_name) + if callee_qn is None: + return None + caller_qn = _frame_qn(caller, target, project_name) + if caller_qn is None or caller_qn == callee_qn: + return None + edges.add((caller_qn, callee_qn)) + return None + + previous = sys.gettrace() + sys.settrace(tracer) + try: + workload() + finally: + sys.settrace(previous) + return edges diff --git a/evals/cgr_graph.py b/evals/cgr_graph.py new file mode 100644 index 000000000..5dfc0d0ae --- /dev/null +++ b/evals/cgr_graph.py @@ -0,0 +1,402 @@ +from pathlib import Path + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +from . 
# (H) Captured relationship: (from_label, from_value, rel_type, to_label, to_value).
_RelTuple = tuple[str, PropertyValue, str, str, PropertyValue]
# (H) Captured node identity: (label, unique-constraint value).
_NodeId = tuple[str, PropertyValue]


class _CapturingIngestor:
    # (H) In-memory ingestor: keeps the nodes and relationships it is handed in
    # (H) plain containers. flush/fetch/write do nothing.
    def __init__(self) -> None:
        self.nodes: dict[_NodeId, PropertyDict] = {}
        self.rels: list[_RelTuple] = []

    def ensure_node_batch(self, label: str, properties: PropertyDict) -> None:
        # (H) Key the node by (label, value of the label's unique-constraint
        # (H) property) and store a copy of its properties; a repeated key
        # (H) overwrites the earlier entry.
        unique_key = cs.NODE_UNIQUE_CONSTRAINTS[label]
        node_id: _NodeId = (str(label), properties[unique_key])
        self.nodes[node_id] = dict(properties)

    def ensure_relationship_batch(
        self,
        from_spec: tuple[str, str, PropertyValue],
        rel_type: str,
        to_spec: tuple[str, str, PropertyValue],
        properties: PropertyDict | None = None,
    ) -> None:
        # (H) Only labels and values are recorded; the key names inside the specs
        # (H) and the relationship properties are ignored.
        src_label, _src_key, src_val = from_spec
        dst_label, _dst_key, dst_val = to_spec
        record: _RelTuple = (
            str(src_label),
            src_val,
            str(rel_type),
            str(dst_label),
            dst_val,
        )
        self.rels.append(record)

    def flush_all(self) -> None:
        # (H) Nothing is buffered, so there is nothing to flush.
        return None

    def fetch_all(
        self, query: str, params: PropertyDict | None = None
    ) -> list[ResultRow]:
        # (H) There is no backing store to query; always an empty result.
        return []

    def execute_write(self, query: str, params: PropertyDict | None = None) -> None:
        # (H) Writes are discarded.
        return None


def _capture(target: Path, project_name: str) -> _CapturingIngestor:
    # (H) Run a forced GraphUpdater pass over `target` with a capturing ingestor
    # (H) and hand back that ingestor with everything it recorded.
    parsers, queries = load_parsers()
    sink = _CapturingIngestor()
    updater = GraphUpdater(
        ingestor=sink,
        repo_path=target,
        parsers=parsers,
        queries=queries,
        project_name=project_name,
    )
    updater.run(force=True)
    return sink


def extract_cgr_graph(target: Path, project_name: str) -> GraphData:
    # (H) Capture cgr's output for `target` and convert it to GraphData.
    captured = _capture(target, project_name)
    return _to_graph_data(captured, project_name)


def extract_cgr_calls(target: Path, project_name: str) -> set[tuple[str, str]]:
    # (H) (from_value, to_value) string pairs for every captured CALLS relationship.
    ingestor = _capture(target, project_name)
    calls_value = cs.RelationshipType.CALLS.value
    pairs: set[tuple[str, str]] = set()
    for _from_label, from_val, rel_type, _to_label, to_val in ingestor.rels:
        if rel_type == calls_value:
            pairs.add((str(from_val), str(to_val)))
    return pairs
def _lang_node_key(
    label: str,
    props: PropertyDict,
    suffix: str | tuple[str, ...],
    exclude_suffix: str | None = None,
) -> NodeKey | None:
    # (H) NodeKey (label, file, start_line) for a node whose path ends with
    # (H) `suffix`. Returns None when the node has no path, the path does not
    # (H) match, the path ends with `exclude_suffix` (when given), or the start
    # (H) line is missing / non-numeric.
    # (H) `exclude_suffix` mirrors _lang_endpoint_key so one extractor can drop
    # (H) e.g. `.d.ts` stubs; the default None keeps the previous behaviour.
    path = props.get(cs.KEY_PATH)
    if path is None:
        return None
    file = str(path)
    if not file.endswith(suffix):
        return None
    if exclude_suffix is not None and file.endswith(exclude_suffix):
        return None
    raw_start = props.get(cs.KEY_START_LINE)
    if not isinstance(raw_start, int | float):
        return None
    return NodeKey(label, file, int(raw_start))


def extract_cgr_lang_nodes(
    target: Path,
    project_name: str,
    suffix: str | tuple[str, ...],
    kind_values: frozenset[str],
    exclude_suffix: str | None = None,
) -> dict[NodeKey, DefNode]:
    # (H) Capture cgr's output for `target` and return its definition nodes whose
    # (H) label is in `kind_values` and whose file matches `suffix` (minus any
    # (H) `exclude_suffix`), keyed by (label, file, start_line).
    # (H) A missing or non-numeric end line is recorded as 0.
    ingestor = _capture(target, project_name)
    nodes: dict[NodeKey, DefNode] = {}
    for (label, _uid), props in ingestor.nodes.items():
        if label not in kind_values:
            continue
        key = _lang_node_key(label, props, suffix, exclude_suffix)
        if key is None:
            continue
        raw_end = props.get(cs.KEY_END_LINE)
        end_line = int(raw_end) if isinstance(raw_end, int | float) else 0
        nodes[key] = DefNode(key, str(props.get(cs.KEY_NAME, "")), end_line)
    return nodes
def _lang_endpoint_key(
    label: str,
    props: PropertyDict,
    suffix: str | tuple[str, ...],
    exclude_suffix: str | None = None,
) -> NodeKey | None:
    # (H) Resolve any node (incl. the per-file Module, which carries no
    # (H) start_line) to a NodeKey so containment edges can join on it. cgr keys
    # (H) module-level DEFINES parents at the module node; mirror the ast oracle
    # (H) by placing the module at MODULE_START_LINE.
    raw_path = props.get(cs.KEY_PATH)
    if raw_path is None:
        return None
    file = str(raw_path)
    if not file.endswith(suffix) or (
        exclude_suffix is not None and file.endswith(exclude_suffix)
    ):
        return None

    start = props.get(cs.KEY_START_LINE)
    if isinstance(start, int | float):
        # (H) Any node with a numeric start line keys at that line. This includes
        # (H) an inline module (Rust `mod`), whose declaration line keeps it
        # (H) distinct from the file module so nested containment can join.
        return NodeKey(label, file, int(start))
    if label == cs.NodeLabel.MODULE.value:
        # (H) The per-file module carries no start line; key it at the sentinel.
        return NodeKey(label, file, ec.MODULE_START_LINE)
    return None
def extract_cgr_lang_graph(
    target: Path,
    project_name: str,
    suffix: str | tuple[str, ...],
    kind_values: frozenset[str],
    exclude_suffix: str | None = None,
) -> GraphData:
    # (H) Capture cgr's output for `target` and build the language-scoped graph:
    # (H) definition nodes in `kind_values`, containment edges between resolvable
    # (H) endpoints, and inheritance edges keyed by the base's simple name.
    ingestor = _capture(target, project_name)

    nodes: dict[NodeKey, DefNode] = {}
    by_uid: dict[_NodeId, NodeKey] = {}
    for node_id, props in ingestor.nodes.items():
        label = node_id[0]
        endpoint = _lang_endpoint_key(label, props, suffix, exclude_suffix)
        if endpoint is None:
            continue
        # (H) Every resolvable node is an edge endpoint, but only the scored
        # (H) kinds are reported as nodes.
        by_uid[node_id] = endpoint
        if label in kind_values:
            raw_end = props.get(cs.KEY_END_LINE)
            end_line = int(raw_end) if isinstance(raw_end, int | float) else 0
            name = str(props.get(cs.KEY_NAME, ""))
            nodes[endpoint] = DefNode(endpoint, name, end_line)

    edges: set[EdgeKey] = set()
    name_edges: set[NameEdge] = set()
    for from_label, from_val, rel_type, to_label, to_val in ingestor.rels:
        if rel_type in ec.SCORED_EDGE_TYPE_VALUES:
            parent = by_uid.get((from_label, from_val))
            child = by_uid.get((to_label, to_val))
            if parent is not None and child is not None:
                edges.add(EdgeKey(rel_type, parent, child))
            continue
        if rel_type not in ec.INHERITANCE_NAME_EDGE_TYPE_VALUES:
            continue
        # (H) Inheritance is graded by the base's SIMPLE NAME (cgr's to-value
        # (H) is the resolved base qn, or the bare name when unresolved).
        source = by_uid.get((from_label, from_val))
        if source is None:
            continue
        # (H) Base simple name: cgr's resolved target may be a dotted qn
        # (H) (`module.Base`) or a Rust path (`std::io::Read`), so split on
        # (H) both `.` and `::`.
        dotted = str(to_val).replace(cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT)
        simple_name = dotted.rpartition(cs.SEPARATOR_DOT)[2]
        name_edges.add(NameEdge(rel_type, source, simple_name))
    return GraphData(nodes=nodes, edges=edges, name_edges=name_edges)


def restrict_to_files(graph: GraphData, files: set[str]) -> GraphData:
    # (H) Scope a graph to a file universe. A compile_commands.json oracle only
    # (H) "sees" files its compiled TUs reach, while cgr indexes the whole tree
    # (H) (bundled test deps, uncompiled sources). Grading cgr's out-of-universe
    # (H) nodes against that oracle is meaningless, so restrict cgr to the files
    # (H) the oracle actually parsed before scoring. Drops only false positives:
    # (H) no oracle node lives outside its own universe, so recall is untouched.
    kept_nodes: dict[NodeKey, DefNode] = {}
    for key, node in graph.nodes.items():
        if key.file in files:
            kept_nodes[key] = node

    # (H) A containment edge survives only when both endpoints are in scope.
    kept_edges: set[EdgeKey] = set()
    for edge in graph.edges:
        if edge.parent.file in files and edge.child.file in files:
            kept_edges.add(edge)

    # (H) A name edge has no target node, so only its source file is checked.
    kept_name_edges: set[NameEdge] = set()
    for name_edge in graph.name_edges:
        if name_edge.source.file in files:
            kept_name_edges.add(name_edge)

    return GraphData(nodes=kept_nodes, edges=kept_edges, name_edges=kept_name_edges)


def extract_cgr_cpp_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) C/C++ definition nodes: CPP_SUFFIXES files, CPP scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.CPP_SUFFIXES,
        kind_values=ec.CPP_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_cpp_graph(target: Path, project_name: str) -> GraphData:
    # (H) C/C++ structure graph: CPP_SUFFIXES files, CPP scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.CPP_SUFFIXES,
        kind_values=ec.CPP_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_go_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) Go definition nodes: GO_SUFFIX files, Go scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.GO_SUFFIX,
        kind_values=ec.GO_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_go_graph(target: Path, project_name: str) -> GraphData:
    # (H) Go structure graph: GO_SUFFIX files, Go scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.GO_SUFFIX,
        kind_values=ec.GO_SCORED_NODE_KIND_VALUES,
    )
def extract_cgr_rust_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) Rust definition nodes: RS_SUFFIX files, Rust scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.RS_SUFFIX,
        kind_values=ec.RS_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_rust_graph(target: Path, project_name: str) -> GraphData:
    # (H) Rust structure graph: RS_SUFFIX files, Rust scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.RS_SUFFIX,
        kind_values=ec.RS_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_lua_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) Lua definition nodes: LUA_SUFFIX files, Lua scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.LUA_SUFFIX,
        kind_values=ec.LUA_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_lua_graph(target: Path, project_name: str) -> GraphData:
    # (H) Lua structure graph: LUA_SUFFIX files, Lua scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.LUA_SUFFIX,
        kind_values=ec.LUA_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_php_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) PHP definition nodes: PHP_SUFFIX files, PHP scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.PHP_SUFFIX,
        kind_values=ec.PHP_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_php_graph(target: Path, project_name: str) -> GraphData:
    # (H) PHP structure graph: PHP_SUFFIX files, PHP scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.PHP_SUFFIX,
        kind_values=ec.PHP_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_java_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) Java definition nodes: JAVA_SUFFIX files, Java scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.JAVA_SUFFIX,
        kind_values=ec.JAVA_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_java_graph(target: Path, project_name: str) -> GraphData:
    # (H) Java structure graph: JAVA_SUFFIX files, Java scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.JAVA_SUFFIX,
        kind_values=ec.JAVA_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_js_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) JavaScript definition nodes: JS_SUFFIXES files, JS scored kinds.
    return extract_cgr_lang_nodes(
        target,
        project_name,
        suffix=ec.JS_SUFFIXES,
        kind_values=ec.JS_SCORED_NODE_KIND_VALUES,
    )


def extract_cgr_js_graph(target: Path, project_name: str) -> GraphData:
    # (H) JavaScript structure graph: JS_SUFFIXES files, JS scored kinds.
    return extract_cgr_lang_graph(
        target,
        project_name,
        suffix=ec.JS_SUFFIXES,
        kind_values=ec.JS_SCORED_NODE_KIND_VALUES,
    )
def extract_cgr_ts_graph(target: Path, project_name: str) -> GraphData:
    # (H) TypeScript structure graph: .ts/.tsx sources, excluding .d.ts stubs.
    return extract_cgr_lang_graph(
        target,
        project_name,
        ec.TS_SUFFIXES,
        ec.TS_SCORED_NODE_KIND_VALUES,
        exclude_suffix=ec.TS_DTS_SUFFIX,
    )


def extract_cgr_ts_nodes(target: Path, project_name: str) -> dict[NodeKey, DefNode]:
    # (H) TypeScript definition nodes keyed by (label, file, start_line).
    # (H) Match the oracle: real .ts/.tsx sources, excluding .d.ts type stubs.
    ingestor = _capture(target, project_name)
    nodes: dict[NodeKey, DefNode] = {}
    for (label, _uid), props in ingestor.nodes.items():
        if label not in ec.TS_SCORED_NODE_KIND_VALUES:
            continue
        # (H) Reuse the shared key resolver instead of repeating its path /
        # (H) suffix / start-line checks here. Its Module special case cannot
        # (H) trigger: Module is not a TS scored kind and was filtered out above.
        key = _lang_endpoint_key(label, props, ec.TS_SUFFIXES, ec.TS_DTS_SUFFIX)
        if key is None:
            continue
        raw_end = props.get(cs.KEY_END_LINE)
        # (H) A missing or non-numeric end line is recorded as 0.
        end_line = int(raw_end) if isinstance(raw_end, int | float) else 0
        nodes[key] = DefNode(key, str(props.get(cs.KEY_NAME, "")), end_line)
    return nodes


def _node_key(label: str, props: PropertyDict) -> NodeKey | None:
    # (H) NodeKey for a Python node: requires a path ending in PY_SUFFIX. A
    # (H) Module is always keyed at MODULE_START_LINE; any other label needs a
    # (H) numeric start line, else None.
    path = props.get(cs.KEY_PATH)
    if path is None:
        return None
    file = str(path)
    if not file.endswith(ec.PY_SUFFIX):
        return None
    if label == cs.NodeLabel.MODULE.value:
        return NodeKey(label, file, ec.MODULE_START_LINE)
    raw_start = props.get(cs.KEY_START_LINE)
    if not isinstance(raw_start, int | float):
        return None
    return NodeKey(label, file, int(raw_start))


def _edge_allowed(rel_type: str, parent_kind: str) -> bool:
    # (H) DEFINES edges are kept only when the parent is a Module; every other
    # (H) relationship type passed in is kept only when the parent is a Class.
    if rel_type == cs.RelationshipType.DEFINES.value:
        return parent_kind == cs.NodeLabel.MODULE.value
    return parent_kind == cs.NodeLabel.CLASS.value


def _internal_target_file(qn: str, internal_modules: dict[str, str]) -> str | None:
    # (H) Longest-prefix lookup: try the full dotted name, then drop trailing
    # (H) segments one at a time until a known module matches. Returns that
    # (H) module's file path, or None when no prefix is a known module.
    parts = qn.split(cs.SEPARATOR_DOT)
    while parts:
        candidate = cs.SEPARATOR_DOT.join(parts)
        if candidate in internal_modules:
            return internal_modules[candidate]
        parts = parts[:-1]
    return None
def _to_graph_data(ingestor: _CapturingIngestor, project_name: str) -> GraphData:
    # (H) Convert captured cgr output into the Python-eval GraphData: scored
    # (H) definition nodes, containment edges, and INHERITS / IMPORTS name edges.
    nodes: dict[NodeKey, DefNode] = {}
    by_uid: dict[_NodeId, NodeKey] = {}
    for node_id, props in ingestor.nodes.items():
        label = node_id[0]
        if label not in ec.SCORED_NODE_KIND_VALUES:
            continue
        key = _node_key(label, props)
        if key is None:
            continue
        raw_end = props.get(cs.KEY_END_LINE)
        # (H) A missing or non-numeric end line is recorded as 0.
        end_line = int(raw_end) if isinstance(raw_end, int | float) else 0
        nodes[key] = DefNode(key, str(props.get(cs.KEY_NAME, "")), end_line)
        by_uid[node_id] = key

    # (H) Containment edges: both endpoints must be scored nodes and the parent
    # (H) kind must be the one allowed for the relationship type.
    edges: set[EdgeKey] = set()
    for from_label, from_val, rel_type, to_label, to_val in ingestor.rels:
        if rel_type in ec.SCORED_EDGE_TYPE_VALUES:
            parent = by_uid.get((from_label, from_val))
            child = by_uid.get((to_label, to_val))
            if (
                parent is not None
                and child is not None
                and _edge_allowed(rel_type, parent.kind)
            ):
                edges.add(EdgeKey(rel_type, parent, child))

    # (H) Only real in-repo Python modules count as internal import targets. cgr
    # (H) also emits placeholder MODULE nodes for unresolved imports whose path is
    # (H) the dotted import name (e.g. "thrift.TTornado", "std.set"); requiring a
    # (H) .py path excludes those so IMPORTS is graded against real files only,
    # (H) consistent with the .py node filter and the ast oracle.
    module_label = cs.NodeLabel.MODULE.value
    dotted_prefix = project_name + cs.SEPARATOR_DOT
    internal_modules: dict[str, str] = {}
    for (label, uid), props in ingestor.nodes.items():
        if label != module_label:
            continue
        raw_path = props.get(cs.KEY_PATH)
        if not raw_path:
            continue
        module_path = str(raw_path)
        if not module_path.endswith(ec.PY_SUFFIX):
            continue
        module_qn = str(uid)
        if module_qn == project_name or module_qn.startswith(dotted_prefix):
            internal_modules[module_qn] = module_path

    inherits_rel = cs.RelationshipType.INHERITS.value
    imports_rel = cs.RelationshipType.IMPORTS.value
    name_edges: set[NameEdge] = set()
    for from_label, from_val, rel_type, _to_label, to_val in ingestor.rels:
        if rel_type not in ec.SCORED_NAME_EDGE_TYPE_VALUES:
            continue
        source = by_uid.get((from_label, from_val))
        if source is None:
            continue
        if rel_type == inherits_rel:
            # (H) Graded by the base's simple name: the last dotted segment.
            base_name = str(to_val).rpartition(cs.SEPARATOR_DOT)[2]
            name_edges.add(NameEdge(rel_type, source, base_name))
        elif rel_type == imports_rel:
            # (H) Graded by the imported module's file path; imports that do not
            # (H) resolve to an internal module are dropped.
            target_path = _internal_target_file(str(to_val), internal_modules)
            if target_path is not None:
                name_edges.add(NameEdge(rel_type, source, target_path))

    return GraphData(nodes=nodes, edges=edges, name_edges=name_edges)
console = Console()


def main(
    target: Annotated[
        Path, typer.Option(help="Directory to evaluate (cgr repo source).")
    ] = Path(ec.DEFAULT_TARGET),
    project_name: Annotated[
        str, typer.Option(help="cgr project name; defaults to target dir name.")
    ] = "",
    out_dir: Annotated[
        Path, typer.Option(help="Directory for scores.csv and diff.json.")
    ] = Path(ec.DEFAULT_OUT_DIR),
) -> None:
    # (H) Extract the cgr graph and the oracle graph for `target`, score one
    # (H) against the other, write the result files, and print the tables.
    root = target.resolve()
    # (H) An empty project name falls back to the target directory's name.
    project = project_name if project_name else root.name

    logger.info(ls.EXTRACTING_CGR.format(target=root, project=project))
    cgr_graph = extract_cgr_graph(root, project)
    cgr_nodes, cgr_edges = len(cgr_graph.nodes), len(cgr_graph.edges)
    logger.success(ls.CGR_GRAPH_DONE.format(nodes=cgr_nodes, edges=cgr_edges))

    logger.info(ls.EXTRACTING_ORACLE.format(target=root))
    oracle_graph = extract_oracle_graph(root, project)
    oracle_nodes, oracle_edges = len(oracle_graph.nodes), len(oracle_graph.edges)
    logger.success(
        ls.ORACLE_GRAPH_DONE.format(nodes=oracle_nodes, edges=oracle_edges)
    )

    result = score(cgr_graph, oracle_graph)
    _write_outputs(result, out_dir)
    _render(result)


def _write_outputs(result: ScoreResult, out_dir: Path) -> None:
    # (H) Write the score rows as CSV and the diff as indented JSON under
    # (H) `out_dir`, creating the directory if needed.
    out_dir.mkdir(parents=True, exist_ok=True)

    scores_path = out_dir / ec.SCORES_FILENAME
    with scores_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=list(ec.CSV_FIELDS))
        writer.writeheader()
        writer.writerows(result.rows)
    logger.success(ls.WROTE_SCORES.format(path=scores_path))

    diff_path = out_dir / ec.DIFF_FILENAME
    diff_text = json.dumps(result.diff, indent=2)
    diff_path.write_text(diff_text, encoding="utf-8")
    logger.success(ls.WROTE_DIFF.format(path=diff_path))
def _render(result: ScoreResult) -> None:
    # (H) Print two tables: per-row precision/recall scores, then the end_line
    # (H) accuracy summary taken from result.location.
    table = Table(title="cgr L1 structure eval (Python)")
    for left_column in ("category", "label"):
        table.add_column(left_column)
    for right_column in ("tp", "fp", "fn", "precision", "recall", "f1"):
        table.add_column(right_column, justify="right")
    for row in result.rows:
        # (H) Counts are printed as-is; ratios are fixed to four decimals.
        counts = [str(row[field]) for field in ("tp", "fp", "fn")]
        ratios = [f"{row[field]:.4f}" for field in ("precision", "recall", "f1")]
        table.add_row(row["category"], row["label"], *counts, *ratios)
    console.print(table)

    loc = result.location
    location_table = Table(title="span (end_line) accuracy on matched defs")
    for column in (
        "matched",
        "end_exact",
        "end_within_1",
        "mean_abs_delta",
        "max_abs_delta",
    ):
        location_table.add_column(column, justify="right")
    location_table.add_row(
        str(loc.matched),
        str(loc.end_exact),
        str(loc.end_within_one),
        f"{loc.mean_abs_delta:.4f}",
        str(loc.max_abs_delta),
    )
    console.print(location_table)


if __name__ == "__main__":
    typer.run(main)
= ( + cs.NodeLabel.CLASS, + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, +) +SPANNED_NODE_KINDS: frozenset[str] = frozenset( + k.value for k in SPANNED_NODE_KINDS_TUPLE +) + +SCORED_EDGE_TYPES: tuple[cs.RelationshipType, ...] = ( + cs.RelationshipType.DEFINES, + cs.RelationshipType.DEFINES_METHOD, +) +SCORED_EDGE_TYPE_VALUES: frozenset[str] = frozenset(e.value for e in SCORED_EDGE_TYPES) + +# (H) L2 dependency edges scored by name/path rather than node location: +# (H) INHERITS by base simple name; IMPORTS by in-repo target file path (internal +# (H) module dependency graph only; external targets are DEPENDS_ON_EXTERNAL). +SCORED_NAME_EDGE_TYPES: tuple[cs.RelationshipType, ...] = ( + cs.RelationshipType.INHERITS, + cs.RelationshipType.IMPORTS, +) +INIT_STEM = "__init__" +SEP = cs.SEPARATOR_DOT +TRACE_CALL_EVENT = "call" +L3_DIFF_FILENAME = "calls_diff.json" +L3_WORKSPACE = "l3_workspace" +SCORED_NAME_EDGE_TYPE_VALUES: frozenset[str] = frozenset( + e.value for e in SCORED_NAME_EDGE_TYPES +) +DIFF_NAME_EDGE_PREFIX = "name_edge:" +NAME_EDGE_REPR = "{rel} {sfile}:{sstart} -> {target}" + +IGNORE_DIRS: frozenset[str] = frozenset( + { + ".git", + ".venv", + "venv", + "__pycache__", + "build", + "dist", + "site", + "node_modules", + ".ruff_cache", + ".pytest_cache", + ".mypy_cache", + ".ty_cache", + } +) +EGG_INFO_SUFFIX = ".egg-info" + + +class Category(StrEnum): + NODE = "node" + EDGE = "edge" + SPAN = "span" + + +AGGREGATE_LABEL = "ALL" + +# (H) Span grading: among nodes matched by (kind, file, start), how often cgr's +# (H) end_line agrees with the oracle's. Surfaced as its own category so a wrong +# (H) node span is visible even when node identity is already 1.0. +DIFF_SPAN_PREFIX = "span:" +SPAN_REPR = "{kind} {file}:{start}-{end}" + +CSV_FIELDS: tuple[str, ...] 
= ( + "category", + "label", + "tp", + "fp", + "fn", + "precision", + "recall", + "f1", +) +LEFT_COLUMNS: frozenset[str] = frozenset({"category", "label"}) + +DEFAULT_TARGET = "codebase_rag" +DEFAULT_OUT_DIR = "evals/results" +SCORES_FILENAME = "scores.csv" +DIFF_FILENAME = "diff.json" + +NODE_REPR = "{kind} {file}:{start} {name}" +EDGE_REPR = "{rel} {pfile}:{pstart} -> {cfile}:{cstart}" +DIFF_NODE_PREFIX = "node:" +DIFF_EDGE_PREFIX = "edge:" + +ROUND_DIGITS = 4 + +# (H) Go structure eval: cgr nodes graded against the go/ast oracle +# (H) (evals/oracles/go_ast.go), joined on (kind, file, start_line). +GO_SUFFIX = ".go" +GO_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, + cs.NodeLabel.INTERFACE, + cs.NodeLabel.TYPE, +) +GO_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in GO_SCORED_NODE_KINDS +) +GO_ORACLE_DIRNAME = "oracles" +GO_ORACLE_GO_FILE = "go_ast.go" +GO_BIN = "go" +GO_RUN = "run" +GO_MODULE_ENV = "GO111MODULE" +GO_MODULE_OFF = "off" +GO_DEFAULT_TARGET = "." +GO_SCORES_FILENAME = "go_scores.csv" +GO_DIFF_FILENAME = "go_diff.json" +ORACLE_KEY_KIND = "kind" +ORACLE_KEY_FILE = "file" +ORACLE_KEY_LINE = "line" +ORACLE_KEY_END_LINE = "end_line" +ORACLE_KEY_NAME = "name" +# (H) Edge-payload keys: an oracle that grades containment edges emits a +# (H) {nodes: [...], edges: [...]} object, each edge carrying rel + parent/child +# (H) node references joined against cgr on (kind, file, line). +ORACLE_KEY_NODES = "nodes" +ORACLE_KEY_EDGES = "edges" +ORACLE_KEY_REL = "rel" +ORACLE_KEY_PARENT = "parent" +ORACLE_KEY_CHILD = "child" +# (H) Name-edge payload keys: an inheritance edge carries its source node ref and +# (H) the base type's SIMPLE NAME (cgr resolves bases by simple name, not qn). 
+ORACLE_KEY_NAME_EDGES = "name_edges" +ORACLE_KEY_SOURCE = "source" +ORACLE_KEY_TARGET_NAME = "target_name" + +# (H) Inheritance edges graded by base simple name: INHERITS (extends/superclass +# (H) and superinterface) and IMPLEMENTS (a class implementing an interface). +INHERITANCE_NAME_EDGE_TYPES: tuple[cs.RelationshipType, ...] = ( + cs.RelationshipType.INHERITS, + cs.RelationshipType.IMPLEMENTS, +) +INHERITANCE_NAME_EDGE_TYPE_VALUES: frozenset[str] = frozenset( + e.value for e in INHERITANCE_NAME_EDGE_TYPES +) + +# (H) Rust structure eval: cgr nodes graded against the syn oracle +# (H) (evals/oracles/rs_oracle), joined on (kind, file, start_line). +RS_SUFFIX = ".rs" +RS_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, + cs.NodeLabel.INTERFACE, + cs.NodeLabel.ENUM, + cs.NodeLabel.UNION, + cs.NodeLabel.TYPE, +) +RS_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in RS_SCORED_NODE_KINDS +) +RS_ORACLE_DIRNAME = "rs_oracle" +CARGO_BIN = "cargo" +CARGO_RUN = "run" +CARGO_RELEASE = "--release" +CARGO_MANIFEST = "--manifest-path" +CARGO_QUIET = "-q" +CARGO_ARG_SEP = "--" +RS_SCORES_FILENAME = "rs_scores.csv" +RS_DIFF_FILENAME = "rs_diff.json" + +# (H) TypeScript structure eval: cgr nodes graded against the TS-compiler-API +# (H) oracle (evals/oracles/ts_oracle), joined on (kind, file, start_line). +TS_SUFFIXES: tuple[str, ...] = (".ts", ".tsx") +TS_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, + cs.NodeLabel.INTERFACE, + cs.NodeLabel.ENUM, + cs.NodeLabel.TYPE, +) +TS_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in TS_SCORED_NODE_KINDS +) +TS_ORACLE_DIRNAME = "ts_oracle" +TS_ORACLE_SCRIPT = "ts_ast.js" +NODE_BIN = "node" +NPM_BIN = "npm" +NPM_INSTALL = "install" +NPM_FLAGS: tuple[str, ...] 
= ("--no-audit", "--no-fund") +NODE_MODULES_DIRNAME = "node_modules" +TS_DTS_SUFFIX = ".d.ts" +TS_SCORES_FILENAME = "ts_scores.csv" +TS_DIFF_FILENAME = "ts_diff.json" + +# (H) JavaScript structure eval: same TS-compiler-API oracle, run over .js/.jsx. +JS_SUFFIXES: tuple[str, ...] = (".js", ".jsx") +JS_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, +) +JS_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in JS_SCORED_NODE_KINDS +) +JS_SCORES_FILENAME = "js_scores.csv" +JS_DIFF_FILENAME = "js_diff.json" + +# (H) Java structure eval: cgr nodes graded against the JDK Compiler Tree API +# (H) oracle (evals/oracles/java_oracle/Oracle.java), joined on (kind, file, line). +JAVA_SUFFIX = ".java" +JAVA_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, + cs.NodeLabel.INTERFACE, + cs.NodeLabel.ENUM, +) +JAVA_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in JAVA_SCORED_NODE_KINDS +) +JAVA_ORACLE_DIRNAME = "java_oracle" +JAVA_ORACLE_SOURCE = "Oracle.java" +JAVA_ORACLE_CLASS = "Oracle" +JAVAC_BIN = "javac" +JAVA_BIN = "java" +JAVA_CP_FLAG = "-cp" +JAVA_SCORES_FILENAME = "java_scores.csv" +JAVA_DIFF_FILENAME = "java_diff.json" + +# (H) Lua structure eval: cgr nodes graded against a luaparse oracle. Lua has no +# (H) classes, so every function (global/local/table/method/anonymous) is Function. +LUA_SUFFIX = ".lua" +LUA_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = (cs.NodeLabel.FUNCTION,) +LUA_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in LUA_SCORED_NODE_KINDS +) +LUA_ORACLE_DIRNAME = "lua_oracle" +LUA_ORACLE_SCRIPT = "lua_ast.js" +LUA_SCORES_FILENAME = "lua_scores.csv" +LUA_DIFF_FILENAME = "lua_diff.json" + +# (H) PHP structure eval: cgr nodes graded against a php-parser oracle. +PHP_SUFFIX = ".php" +PHP_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] 
= ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, + cs.NodeLabel.INTERFACE, + cs.NodeLabel.ENUM, +) +PHP_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in PHP_SCORED_NODE_KINDS +) +PHP_ORACLE_DIRNAME = "php_oracle" +PHP_ORACLE_SCRIPT = "php_ast.js" +PHP_SCORES_FILENAME = "php_scores.csv" +PHP_DIFF_FILENAME = "php_diff.json" + +# (H) C/C++ structure eval: cgr nodes graded against a libclang oracle driven by a +# (H) compile_commands.json, so includes and macros resolve to the true AST (which +# (H) tree-sitter cannot do). Joined on (kind, file, start_line). +CPP_SUFFIXES: tuple[str, ...] = ( + ".cpp", + ".cc", + ".cxx", + ".c", + ".hpp", + ".hh", + ".hxx", + ".h", +) +CPP_SCORED_NODE_KINDS: tuple[cs.NodeLabel, ...] = ( + cs.NodeLabel.FUNCTION, + cs.NodeLabel.METHOD, + cs.NodeLabel.CLASS, +) +CPP_SCORED_NODE_KIND_VALUES: frozenset[str] = frozenset( + k.value for k in CPP_SCORED_NODE_KINDS +) +CPP_COMPDB_FILENAME = "compile_commands.json" +CPP_SCORES_FILENAME = "cpp_scores.csv" +CPP_DIFF_FILENAME = "cpp_diff.json" +CPP_DEFAULT_TARGET = "." diff --git a/evals/cpp_l1.py b/evals/cpp_l1.py new file mode 100644 index 000000000..840bf3ff3 --- /dev/null +++ b/evals/cpp_l1.py @@ -0,0 +1,61 @@ +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from . import constants as ec +from . 
import logs as ls +from .cgr_graph import extract_cgr_cpp_graph, restrict_to_files +from .oracles import cpp_available, run_cpp_oracle +from .score import score_structure +from .structure_report import render, write_outputs + +_TITLE = "cgr L1 structure eval (C/C++ vs libclang)" + + +def main( + target: Annotated[ + Path, + typer.Option(help="Directory of C/C++ sources with a compile_commands.json."), + ] = Path(ec.CPP_DEFAULT_TARGET), + project_name: Annotated[ + str, typer.Option(help="cgr project name; defaults to target dir name.") + ] = "", + out_dir: Annotated[ + Path, typer.Option(help="Directory for cpp_scores.csv and cpp_diff.json.") + ] = Path(ec.DEFAULT_OUT_DIR), +) -> None: + target = target.resolve() + if not cpp_available() or not (target / ec.CPP_COMPDB_FILENAME).is_file(): + logger.error( + ls.CPP_ORACLE_MISSING.format(compdb=ec.CPP_COMPDB_FILENAME, target=target) + ) + raise typer.Exit(code=1) + + project = project_name or target.name + + logger.info(ls.CPP_EXTRACTING_CGR.format(target=target, project=project)) + cgr = extract_cgr_cpp_graph(target, project) + logger.success(ls.CPP_CGR_DONE.format(count=len(cgr.nodes))) + + logger.info(ls.CPP_EXTRACTING_ORACLE.format(target=target)) + oracle = run_cpp_oracle(target) + logger.success(ls.CPP_ORACLE_DONE.format(count=len(oracle.nodes))) + + # (H) The compile_commands.json defines the gradeable universe: the oracle only + # (H) sees files its compiled TUs reach, so scope cgr to those files before + # (H) scoring. Without this, cgr's whole-tree index (bundled test deps, + # (H) uncompiled sources) is graded as false positives against a partial oracle. 
+ cgr = restrict_to_files(cgr, {key.file for key in oracle.nodes}) + logger.success(ls.CPP_CGR_SCOPED.format(count=len(cgr.nodes))) + + result = score_structure( + cgr, oracle, ec.CPP_SCORED_NODE_KINDS, ec.SCORED_EDGE_TYPES, grade_spans=True + ) + write_outputs(result, out_dir, ec.CPP_SCORES_FILENAME, ec.CPP_DIFF_FILENAME) + render(result, _TITLE) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/go_l1.py b/evals/go_l1.py new file mode 100644 index 000000000..58294bdf7 --- /dev/null +++ b/evals/go_l1.py @@ -0,0 +1,51 @@ +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from . import constants as ec +from . import logs as ls +from .cgr_graph import extract_cgr_go_graph +from .oracles import go_available, run_go_oracle +from .score import score_structure +from .structure_report import render, write_outputs + +_TITLE = "cgr L1 structure eval (Go vs go/ast)" + + +def main( + target: Annotated[ + Path, typer.Option(help="Directory of Go sources to evaluate.") + ] = Path(ec.GO_DEFAULT_TARGET), + project_name: Annotated[ + str, typer.Option(help="cgr project name; defaults to target dir name.") + ] = "", + out_dir: Annotated[ + Path, typer.Option(help="Directory for go_scores.csv and go_diff.json.") + ] = Path(ec.DEFAULT_OUT_DIR), +) -> None: + if not go_available(): + logger.error(ls.GO_ORACLE_MISSING.format(binary=ec.GO_BIN)) + raise typer.Exit(code=1) + + target = target.resolve() + project = project_name or target.name + + logger.info(ls.GO_EXTRACTING_CGR.format(target=target, project=project)) + cgr = extract_cgr_go_graph(target, project) + logger.success(ls.GO_CGR_DONE.format(count=len(cgr.nodes))) + + logger.info(ls.GO_EXTRACTING_ORACLE.format(binary=ec.GO_BIN, target=target)) + oracle = run_go_oracle(target) + logger.success(ls.GO_ORACLE_DONE.format(count=len(oracle.nodes))) + + result = score_structure( + cgr, oracle, ec.GO_SCORED_NODE_KINDS, ec.SCORED_EDGE_TYPES, grade_spans=True + ) + 
write_outputs(result, out_dir, ec.GO_SCORES_FILENAME, ec.GO_DIFF_FILENAME) + render(result, _TITLE) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/java_l1.py b/evals/java_l1.py new file mode 100644 index 000000000..e9afc0aa7 --- /dev/null +++ b/evals/java_l1.py @@ -0,0 +1,51 @@ +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from . import constants as ec +from . import logs as ls +from .cgr_graph import extract_cgr_java_graph +from .oracles import java_available, run_java_oracle +from .score import score_structure +from .structure_report import render, write_outputs + +_TITLE = "cgr L1 structure eval (Java vs JDK Compiler Tree API)" + + +def main( + target: Annotated[ + Path, typer.Option(help="Directory of Java sources to evaluate.") + ] = Path(ec.GO_DEFAULT_TARGET), + project_name: Annotated[ + str, typer.Option(help="cgr project name; defaults to target dir name.") + ] = "", + out_dir: Annotated[ + Path, typer.Option(help="Directory for java_scores.csv and java_diff.json.") + ] = Path(ec.DEFAULT_OUT_DIR), +) -> None: + if not java_available(): + logger.error(ls.JAVA_ORACLE_MISSING) + raise typer.Exit(code=1) + + target = target.resolve() + project = project_name or target.name + + logger.info(ls.JAVA_EXTRACTING_CGR.format(target=target, project=project)) + cgr = extract_cgr_java_graph(target, project) + logger.success(ls.JAVA_CGR_DONE.format(count=len(cgr.nodes))) + + logger.info(ls.JAVA_EXTRACTING_ORACLE.format(binary=ec.JAVA_BIN, target=target)) + oracle = run_java_oracle(target) + logger.success(ls.JAVA_ORACLE_DONE.format(count=len(oracle.nodes))) + + result = score_structure( + cgr, oracle, ec.JAVA_SCORED_NODE_KINDS, ec.SCORED_EDGE_TYPES, grade_spans=True + ) + write_outputs(result, out_dir, ec.JAVA_SCORES_FILENAME, ec.JAVA_DIFF_FILENAME) + render(result, _TITLE) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/js_l1.py b/evals/js_l1.py new file mode 100644 index 
000000000..10380f58a --- /dev/null +++ b/evals/js_l1.py @@ -0,0 +1,51 @@ +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from . import constants as ec +from . import logs as ls +from .cgr_graph import extract_cgr_js_graph +from .oracles import run_javascript_oracle, typescript_available +from .score import score_structure +from .structure_report import render, write_outputs + +_TITLE = "cgr L1 structure eval (JavaScript vs tsc)" + + +def main( + target: Annotated[ + Path, typer.Option(help="Directory of JavaScript sources to evaluate.") + ] = Path(ec.GO_DEFAULT_TARGET), + project_name: Annotated[ + str, typer.Option(help="cgr project name; defaults to target dir name.") + ] = "", + out_dir: Annotated[ + Path, typer.Option(help="Directory for js_scores.csv and js_diff.json.") + ] = Path(ec.DEFAULT_OUT_DIR), +) -> None: + if not typescript_available(): + logger.error(ls.TS_ORACLE_MISSING) + raise typer.Exit(code=1) + + target = target.resolve() + project = project_name or target.name + + logger.info(ls.JS_EXTRACTING_CGR.format(target=target, project=project)) + cgr = extract_cgr_js_graph(target, project) + logger.success(ls.JS_CGR_DONE.format(count=len(cgr.nodes))) + + logger.info(ls.JS_EXTRACTING_ORACLE.format(binary=ec.NODE_BIN, target=target)) + oracle = run_javascript_oracle(target) + logger.success(ls.JS_ORACLE_DONE.format(count=len(oracle.nodes))) + + result = score_structure( + cgr, oracle, ec.JS_SCORED_NODE_KINDS, ec.SCORED_EDGE_TYPES, grade_spans=True + ) + write_outputs(result, out_dir, ec.JS_SCORES_FILENAME, ec.JS_DIFF_FILENAME) + render(result, _TITLE) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/l3.py b/evals/l3.py new file mode 100644 index 000000000..20d416bd7 --- /dev/null +++ b/evals/l3.py @@ -0,0 +1,532 @@ +import json +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger +from rich.console import Console +from rich.table import 
Table + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.types_defs import PropertyDict, PropertyValue, ResultRow + +from . import constants as ec +from . import logs as ls +from .calls_trace import trace_calls +from .cgr_graph import extract_cgr_calls + +console = Console() + +FIXTURE_A = """class Animal: + def speak(self) -> str: + return self.sound() + + def sound(self) -> str: + return "..." + + +class Dog(Animal): + def sound(self) -> str: + return "woof" + + +def make(kind: str) -> Animal: + return Dog() if kind == "dog" else Animal() +""" + +FIXTURE_B = """from .a import Animal, Dog, make + + +def greet(kind: str) -> str: + animal = make(kind) + return describe(animal) + + +def describe(animal: Animal) -> str: + return animal.speak() + + +def run() -> str: + d = Dog() + return d.speak() + greet("dog") +""" + + +FIXTURE_C = """import asyncio +from dataclasses import dataclass +from functools import wraps +from typing import Iterator + +from .a import Animal, Dog + + +def trace(fn): + @wraps(fn) + def wrapper(*args, **kwargs): + return fn(*args, **kwargs) + + return wrapper + + +@dataclass +class Counter: + total: int = 0 + + def add(self, value: int) -> int: + self.total += value + return self.total + + @property + def doubled(self) -> int: + return self.total * 2 + + @staticmethod + def zero() -> int: + return 0 + + @classmethod + def start(cls) -> "Counter": + return cls(total=cls.zero()) + + +class Shelter(Animal): + def __init__(self) -> None: + self.pets: list[Animal] = [] + + def admit(self, pet: Animal) -> None: + self.pets.append(pet) + + def noises(self) -> list[str]: + return [pet.sound() for pet in self.pets] + + def loud(self) -> dict[str, str]: + return {pet.sound(): pet.speak() for pet in self.pets} + + +@trace +def build_shelter(count: int) -> Shelter: + shelter = Shelter() + for _ in range(count): + shelter.admit(Dog()) + return shelter + + +def categorize(value: int) 
-> str: + match value: + case 0: + return Counter.zero.__name__ + case n if n > 0: + return "positive" + case _: + return "negative" + + +def stream(limit: int) -> Iterator[int]: + counter = Counter.start() + for i in range(limit): + yield counter.add(i) + + +async def gather(limit: int) -> int: + counter = Counter() + await asyncio.sleep(0) + return counter.add(limit) + + +def run_rich() -> int: + shelter = build_shelter(2) + total = sum(len(noise) for noise in shelter.noises()) + apply = lambda c: c.doubled + return total + apply(Counter.start()) +""" + + +FIXTURE_JS_UTIL = """export function greet(name) { + return "hi " + name; +} + + +export class Base { + speak() { + return this.sound(); + } + + sound() { + return "..."; + } +} +""" + +FIXTURE_JS_APP = """import { greet, Base } from "./util.js"; + + +class Dog extends Base { + sound() { + return "woof"; + } +} + + +function run() { + const d = new Dog(); + return d.speak() + greet("dog"); +} + + +const handler = () => run(); + +export { run, handler }; +""" + + +FIXTURE_TS_SHAPES = """export interface Shape { + area(): number; +} + + +export abstract class Base implements Shape { + abstract area(): number; + + describe(): string { + return `area=${this.area()}`; + } +} +""" + +FIXTURE_TS_MAIN = """import { Base, Shape } from "./shapes"; + + +class Square extends Base { + constructor(private side: number) { + super(); + } + + area(): number { + return this.side * this.side; + } +} + + +function total(shapes: Shape[]): number { + return shapes.reduce((acc, s) => acc + s.area(), 0); +} + + +function run(): string { + const sq = new Square(3); + return sq.describe() + total([sq]); +} + +export { run }; +""" + +FIXTURE_RS_SHAPES = """pub trait Shape { + fn area(&self) -> f64; +} + +pub struct Square { + pub side: f64, +} + +impl Square { + pub fn new(side: f64) -> Square { + Square { side } + } +} + +impl Shape for Square { + fn area(&self) -> f64 { + self.side * self.side + } +} + +pub fn describe(s: &dyn Shape) 
-> f64 { + s.area() +} +""" + +FIXTURE_RS_MAIN = """mod shapes; + +use shapes::{describe, Shape, Square}; + +fn run() -> f64 { + let sq = Square::new(3.0); + describe(&sq) + sq.area() +} + +fn main() { + run(); +} +""" + +FIXTURE_GO_MAIN = """package fixture + +type Shape interface { + Area() float64 +} + +type Square struct { + Side float64 +} + +func (s Square) Area() float64 { + return s.Side * s.Side +} + +func describe(s Shape) float64 { + return s.Area() +} + +func Run() float64 { + sq := Square{Side: 3.0} + return describe(sq) + sq.Area() +} +""" + + +FIXTURE_JAVA = """package fixture; + +interface Shape { + double area(); +} + +class Square implements Shape { + private double side; + + Square(double side) { + this.side = side; + } + + public double area() { + return this.side * this.side; + } +} + +public class Service { + double describe(Shape s) { + return s.area(); + } + + double run() { + Square sq = new Square(3.0); + return describe(sq) + sq.area(); + } +} +""" + +FIXTURE_C_HEADER = """int square(int x); +int compute(int n); +""" + +FIXTURE_C_SRC = """#include "calc.h" + +int square(int x) { + return x * x; +} + +int compute(int n) { + return square(n) + square(n + 1); +} +""" + +FIXTURE_CPP = """class Shape { +public: + virtual double area() const = 0; + double describe() const { return area(); } +}; + +class Square : public Shape { + double side; + +public: + Square(double s) : side(s) {} + double area() const override { return side * side; } +}; + +double run() { + Square sq(3.0); + return sq.describe() + sq.area(); +} +""" + +FIXTURE_LUA = """local M = {} + +function M.square(x) + return x * x +end + +function M.compute(n) + return M.square(n) + M.square(n + 1) +end + +return M +""" + +FIXTURE_PHP = """side = $side; + } + + public function area(): float { + return $this->side * $this->side; + } +} + +function describe(Shape $s): float { + return $s->area(); +} + +function run(): float { + $sq = new Square(3.0); + return describe($sq) + 
$sq->area(); +} +""" + +FIXTURE_SCALA = """package fixture + +trait Shape { + def area(): Double +} + +class Square(side: Double) extends Shape { + def area(): Double = side * side +} + +object Service { + def describe(s: Shape): Double = s.area() + + def run(): Double = { + val sq = new Square(3.0) + describe(sq) + sq.area() + } +} +""" + + +class _NullIngestor: + def ensure_node_batch(self, label: str, properties: PropertyDict) -> None: + return None + + def ensure_relationship_batch( + self, + from_spec: tuple[str, str, PropertyValue], + rel_type: str, + to_spec: tuple[str, str, PropertyValue], + properties: PropertyDict | None = None, + ) -> None: + return None + + def flush_all(self) -> None: + return None + + def fetch_all( + self, query: str, params: PropertyDict | None = None + ) -> list[ResultRow]: + return [] + + def execute_write(self, query: str, params: PropertyDict | None = None) -> None: + return None + + +def _is_dunder_callee(qn: str) -> bool: + name = qn.rsplit(ec.SEP, 1)[-1] + return name.startswith("__") and name.endswith("__") + + +def _write_fixture(root: Path) -> None: + pkg = root / "fixture" + pkg.mkdir(parents=True, exist_ok=True) + (pkg / "__init__.py").touch() + (pkg / "a.py").write_text(FIXTURE_A) + (pkg / "b.py").write_text(FIXTURE_B) + (pkg / "c.py").write_text(FIXTURE_C) + (pkg / "util.js").write_text(FIXTURE_JS_UTIL) + (pkg / "app.js").write_text(FIXTURE_JS_APP) + (pkg / "shapes.ts").write_text(FIXTURE_TS_SHAPES) + (pkg / "main.ts").write_text(FIXTURE_TS_MAIN) + (pkg / "shapes.rs").write_text(FIXTURE_RS_SHAPES) + (pkg / "main.rs").write_text(FIXTURE_RS_MAIN) + (pkg / "service.go").write_text(FIXTURE_GO_MAIN) + (pkg / "Service.java").write_text(FIXTURE_JAVA) + (pkg / "calc.h").write_text(FIXTURE_C_HEADER) + (pkg / "calc.c").write_text(FIXTURE_C_SRC) + (pkg / "shapes.cpp").write_text(FIXTURE_CPP) + (pkg / "module.lua").write_text(FIXTURE_LUA) + (pkg / "service.php").write_text(FIXTURE_PHP) + (pkg / 
"Shapes.scala").write_text(FIXTURE_SCALA) + + +def main( + target: Annotated[ + Path, typer.Option(help="cgr source to evaluate CALLS recall for.") + ] = Path(ec.DEFAULT_TARGET), + project_name: Annotated[str, typer.Option(help="cgr project name.")] = "", + out_dir: Annotated[Path, typer.Option(help="Directory for the calls diff.")] = Path( + ec.DEFAULT_OUT_DIR + ), +) -> None: + target = target.resolve() + project = project_name or target.name + + logger.info(ls.L3_STATIC.format(target=target, project=project)) + static_calls = extract_cgr_calls(target, project) + logger.success(ls.L3_STATIC_DONE.format(count=len(static_calls))) + + workspace = out_dir / ec.L3_WORKSPACE + _write_fixture(workspace) + parsers, queries = load_parsers() + + def workload() -> None: + GraphUpdater( + ingestor=_NullIngestor(), + repo_path=workspace / "fixture", + parsers=parsers, + queries=queries, + project_name=project, + ).run(force=True) + + logger.info(ls.L3_TRACING.format(target=target)) + traced = trace_calls(workload, target, project) + logger.success(ls.L3_TRACED_DONE.format(count=len(traced))) + + missed = sorted(traced - static_calls) + + out_dir.mkdir(parents=True, exist_ok=True) + diff_path = out_dir / ec.L3_DIFF_FILENAME + diff_path.write_text( + json.dumps({"missing": [f"{a} -> {b}" for a, b in missed]}, indent=2), + encoding="utf-8", + ) + logger.success(ls.WROTE_DIFF.format(path=diff_path)) + + explicit = {(a, b) for (a, b) in traced if not _is_dunder_callee(b)} + table = Table(title="cgr L3 CALLS recall (execution-traced ground truth)") + table.add_column("scope") + table.add_column("traced", justify="right") + table.add_column("captured", justify="right") + table.add_column("missed", justify="right") + table.add_column("recall", justify="right") + for label, edges in (("all calls", traced), ("explicit (no dunders)", explicit)): + captured = edges & static_calls + recall = len(captured) / len(edges) if edges else 1.0 + table.add_row( + label, + str(len(edges)), + 
str(len(captured)), + str(len(edges) - len(captured)), + f"{recall:.4f}", + ) + console.print(table) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/logs.py b/evals/logs.py new file mode 100644 index 000000000..c007f730c --- /dev/null +++ b/evals/logs.py @@ -0,0 +1,51 @@ +EXTRACTING_CGR = "Building cgr graph for {target} (project={project})" +CGR_GRAPH_DONE = "cgr graph: {nodes} python nodes, {edges} scored edges" +EXTRACTING_ORACLE = "Building ast oracle for {target}" +ORACLE_GRAPH_DONE = "ast oracle: {nodes} python nodes, {edges} scored edges" +WROTE_SCORES = "Wrote scores to {path}" +WROTE_DIFF = "Wrote diff to {path}" +ORACLE_PARSE_FAILED = "Skipped unparseable file {path}: {error}" +L3_STATIC = "Extracting cgr static CALLS for {target} (project={project})" +L3_STATIC_DONE = "cgr static CALLS: {count} edges" +L3_TRACING = "Tracing a workload through {target} to collect runtime call edges" +L3_TRACED_DONE = "traced runtime call edges (first-party): {count}" +GO_EXTRACTING_CGR = "Building cgr Go nodes for {target} (project={project})" +GO_CGR_DONE = "cgr Go nodes: {count}" +GO_EXTRACTING_ORACLE = "Running go/ast oracle ({binary}) over {target}" +GO_ORACLE_DONE = "go/ast oracle nodes: {count}" +GO_ORACLE_MISSING = "Go toolchain '{binary}' not found on PATH; cannot run the oracle" +CPP_EXTRACTING_CGR = "Building cgr C/C++ nodes for {target} (project={project})" +CPP_CGR_DONE = "cgr C/C++ nodes: {count}" +CPP_CGR_SCOPED = "cgr C/C++ nodes scoped to compiled universe: {count}" +CPP_EXTRACTING_ORACLE = "Running libclang oracle over {target} (compile_commands.json)" +CPP_ORACLE_DONE = "libclang oracle nodes: {count}" +CPP_ORACLE_MISSING = "libclang unavailable, or no {compdb} found in {target}" +RS_EXTRACTING_CGR = "Building cgr Rust nodes for {target} (project={project})" +RS_CGR_DONE = "cgr Rust nodes: {count}" +RS_EXTRACTING_ORACLE = "Running syn oracle ({binary}) over {target}" +RS_ORACLE_DONE = "syn oracle nodes: {count}" +RS_ORACLE_MISSING = 
"Rust toolchain '{binary}' not found on PATH; cannot run the oracle" +TS_EXTRACTING_CGR = "Building cgr TypeScript nodes for {target} (project={project})" +TS_CGR_DONE = "cgr TypeScript nodes: {count}" +TS_EXTRACTING_ORACLE = "Running TypeScript compiler oracle ({binary}) over {target}" +TS_ORACLE_DONE = "TypeScript oracle nodes: {count}" +TS_ORACLE_MISSING = "node/npm not found on PATH; cannot run the TypeScript oracle" +JS_EXTRACTING_CGR = "Building cgr JavaScript nodes for {target} (project={project})" +JS_CGR_DONE = "cgr JavaScript nodes: {count}" +JS_EXTRACTING_ORACLE = "Running TypeScript compiler oracle ({binary}) over {target}" +JS_ORACLE_DONE = "JavaScript oracle nodes: {count}" +JAVA_EXTRACTING_CGR = "Building cgr Java nodes for {target} (project={project})" +JAVA_CGR_DONE = "cgr Java nodes: {count}" +JAVA_EXTRACTING_ORACLE = "Running JDK Compiler Tree API oracle ({binary}) over {target}" +JAVA_ORACLE_DONE = "Java oracle nodes: {count}" +JAVA_ORACLE_MISSING = "javac/java not found on PATH; cannot run the Java oracle" +LUA_EXTRACTING_CGR = "Building cgr Lua nodes for {target} (project={project})" +LUA_CGR_DONE = "cgr Lua nodes: {count}" +LUA_EXTRACTING_ORACLE = "Running luaparse oracle ({binary}) over {target}" +LUA_ORACLE_DONE = "luaparse oracle nodes: {count}" +LUA_ORACLE_MISSING = "node/npm not found on PATH; cannot run the Lua oracle" +PHP_EXTRACTING_CGR = "Building cgr PHP nodes for {target} (project={project})" +PHP_CGR_DONE = "cgr PHP nodes: {count}" +PHP_EXTRACTING_ORACLE = "Running php-parser oracle ({binary}) over {target}" +PHP_ORACLE_DONE = "php-parser oracle nodes: {count}" +PHP_ORACLE_MISSING = "node/npm not found on PATH; cannot run the PHP oracle" diff --git a/evals/lua_l1.py b/evals/lua_l1.py new file mode 100644 index 000000000..57af56320 --- /dev/null +++ b/evals/lua_l1.py @@ -0,0 +1,51 @@ +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from . import constants as ec +from . 
import logs as ls +from .cgr_graph import extract_cgr_lua_graph +from .oracles import lua_oracle_available, run_lua_oracle +from .score import score_structure +from .structure_report import render, write_outputs + +_TITLE = "cgr L1 structure eval (Lua vs luaparse)" + + +def main( + target: Annotated[ + Path, typer.Option(help="Directory of Lua sources to evaluate.") + ] = Path(ec.GO_DEFAULT_TARGET), + project_name: Annotated[ + str, typer.Option(help="cgr project name; defaults to target dir name.") + ] = "", + out_dir: Annotated[ + Path, typer.Option(help="Directory for lua_scores.csv and lua_diff.json.") + ] = Path(ec.DEFAULT_OUT_DIR), +) -> None: + if not lua_oracle_available(): + logger.error(ls.LUA_ORACLE_MISSING) + raise typer.Exit(code=1) + + target = target.resolve() + project = project_name or target.name + + logger.info(ls.LUA_EXTRACTING_CGR.format(target=target, project=project)) + cgr = extract_cgr_lua_graph(target, project) + logger.success(ls.LUA_CGR_DONE.format(count=len(cgr.nodes))) + + logger.info(ls.LUA_EXTRACTING_ORACLE.format(binary=ec.NODE_BIN, target=target)) + oracle = run_lua_oracle(target) + logger.success(ls.LUA_ORACLE_DONE.format(count=len(oracle.nodes))) + + result = score_structure( + cgr, oracle, ec.LUA_SCORED_NODE_KINDS, ec.SCORED_EDGE_TYPES, grade_spans=True + ) + write_outputs(result, out_dir, ec.LUA_SCORES_FILENAME, ec.LUA_DIFF_FILENAME) + render(result, _TITLE) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/module_calls.py b/evals/module_calls.py new file mode 100644 index 000000000..b75a54294 --- /dev/null +++ b/evals/module_calls.py @@ -0,0 +1,246 @@ +# (H) L2 module-call attribution: does cgr attribute the right calls to the +# (H) module? The L3 trace records the innermost function frame as the caller and +# (H) drops module-level frames, so it is structurally blind to module-level call +# (H) attribution. This eval fills that gap with an AST oracle that models +# (H) import-time execution.
Both sides are compared as (module_file, +# (H) callee_simple_name) name-edges, restricted to first-party callees and +# (H) excluding dunders, since cgr only emits first-party CALLS. +import ast +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger +from rich.console import Console +from rich.table import Table + +from codebase_rag import constants as cs + +from . import constants as ec +from .ast_oracle import _iter_py_files +from .cgr_graph import _capture +from .types_defs import NameEdge, NodeKey + +console = Console() + +_CALLS = cs.RelationshipType.CALLS.value +_INSTANTIATES = cs.RelationshipType.INSTANTIATES.value + + +def _is_dunder(name: str) -> bool: + return name.startswith("__") and name.endswith("__") + + +def _callee_name(func: ast.expr) -> str | None: + if isinstance(func, ast.Name): + return func.id + if isinstance(func, ast.Attribute): + return func.attr + return None + + +def _has_future_annotations(tree: ast.Module) -> bool: + for node in tree.body: + if isinstance(node, ast.ImportFrom) and node.module == "__future__": + if any(alias.name == "annotations" for alias in node.names): + return True + return False + + +class _ModuleCallVisitor(ast.NodeVisitor): + # (H) Collect callee names of calls that execute at module-load time. A + # (H) function's decorators, argument defaults, and (unless postponed) + # (H) annotations run in the enclosing scope, so they are visited at the + # (H) current depth; only its body is function scope. Class bodies execute at + # (H) definition time, so they stay at the enclosing depth. Lambda bodies and + # (H) generator expressions are deferred (run when called/consumed), so their + # (H) calls are not import-time and are entered as a nested (function) scope. 
+ def __init__(self, count_annotations: bool) -> None: + self.names: set[str] = set() + self._func_depth = 0 + self._count_annotations = count_annotations + + def visit_Call(self, node: ast.Call) -> None: + if self._func_depth == 0 and (name := _callee_name(node.func)): + self.names.add(name) + self.generic_visit(node) + + def _visit_function(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> None: + for decorator in node.decorator_list: + if self._func_depth == 0: + # (H) a bare decorator `@task` is a Name (not a Call), so record + # (H) its callee name explicitly; applying it runs at module load. + target = ( + decorator.func if isinstance(decorator, ast.Call) else decorator + ) + if name := _callee_name(target): + self.names.add(name) + self.visit(decorator) + if self._count_annotations: + args = node.args + for arg in ( + *args.posonlyargs, + *args.args, + *args.kwonlyargs, + args.vararg, + args.kwarg, + ): + if arg is not None and arg.annotation is not None: + self.visit(arg.annotation) + if node.returns is not None: + self.visit(node.returns) + for default in (*node.args.defaults, *node.args.kw_defaults): + if default is not None: + self.visit(default) + self._func_depth += 1 + for stmt in node.body: + self.visit(stmt) + self._func_depth -= 1 + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._visit_function(node) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._visit_function(node) + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + # (H) a class decorator runs at definition (module-load) time too; the + # (H) class body stays at the current depth (eager at import). 
+ if self._func_depth == 0: + for decorator in node.decorator_list: + target = ( + decorator.func if isinstance(decorator, ast.Call) else decorator + ) + if name := _callee_name(target): + self.names.add(name) + self.generic_visit(node) + + def visit_Lambda(self, node: ast.Lambda) -> None: + for default in (*node.args.defaults, *node.args.kw_defaults): + if default is not None: + self.visit(default) + self._func_depth += 1 + self.visit(node.body) + self._func_depth -= 1 + + def visit_GeneratorExp(self, node: ast.GeneratorExp) -> None: + # (H) the outermost iterable is evaluated eagerly when the generator is + # (H) created (enclosing scope); the element, conditions, and any further + # (H) iterables are lazy (run during consumption). + if node.generators: + self.visit(node.generators[0].iter) + self._func_depth += 1 + self.visit(node.elt) + for index, comprehension in enumerate(node.generators): + if index > 0: + self.visit(comprehension.iter) + for condition in comprehension.ifs: + self.visit(condition) + self._func_depth -= 1 + + +def _first_party_names(trees: list[ast.Module]) -> set[str]: + names: set[str] = set() + for tree in trees: + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef): + names.add(node.name) + return names + + +def oracle_module_calls(target: Path, project_name: str) -> set[NameEdge]: + parsed: list[tuple[str, ast.Module]] = [] + for path in _iter_py_files(target): + rel = path.relative_to(target).as_posix() + try: + parsed.append((rel, ast.parse(path.read_text(encoding=cs.ENCODING_UTF8)))) + except (SyntaxError, UnicodeDecodeError, ValueError): + continue + first_party = _first_party_names([tree for _rel, tree in parsed]) + + edges: set[NameEdge] = set() + for rel, tree in parsed: + visitor = _ModuleCallVisitor( + count_annotations=not _has_future_annotations(tree) + ) + visitor.visit(tree) + module_key = NodeKey(cs.NodeLabel.MODULE.value, rel, ec.MODULE_START_LINE) + for name in 
visitor.names: + if name in first_party and not _is_dunder(name): + edges.add(NameEdge(_CALLS, module_key, name)) + return edges + + +def cgr_module_calls(target: Path, project_name: str) -> set[NameEdge]: + ingestor = _capture(target, project_name) + module_label = cs.NodeLabel.MODULE.value + module_paths: dict[str, str] = { + str(uid): str(props[cs.KEY_PATH]) + for (label, uid), props in ingestor.nodes.items() + if label == module_label + and props.get(cs.KEY_PATH) + and str(props[cs.KEY_PATH]).endswith(ec.PY_SUFFIX) + } + + method_label = cs.NodeLabel.METHOD.value + edges: set[NameEdge] = set() + for from_label, from_val, rel_type, to_label, to_val in ingestor.rels: + # (H) A module-scope construction `X()` is an INSTANTIATES edge to the + # (H) class node (callee is the class name directly); a function/method + # (H) call is a CALLS edge. The oracle records both as a bare callee name, + # (H) so credit both kinds of module-caller edge. + if rel_type not in (_CALLS, _INSTANTIATES) or from_label != module_label: + continue + path = module_paths.get(str(from_val)) + if path is None: + continue + segments = str(to_val).split(ec.SEP) + name = segments[-1] + # (H) A constructor call `X()` on a class WITH __init__ resolves to the + # (H) `X.__init__` METHOD via CALLS; the oracle sees the class name `X`, so + # (H) credit it to the class. A bare first-party FUNCTION named `__init__` + # (H) is left as a dunder (filtered below), not remapped to its segment. 
+ if name == ec.INIT_STEM and to_label == method_label and len(segments) >= 2: + name = segments[-2] + if _is_dunder(name): + continue + module_key = NodeKey(module_label, path, ec.MODULE_START_LINE) + edges.add(NameEdge(_CALLS, module_key, name)) + return edges + + +def score_module_calls( + cgr: set[NameEdge], oracle: set[NameEdge] +) -> tuple[int, int, int, float, float]: + tp = len(cgr & oracle) + fp = len(cgr - oracle) + fn = len(oracle - cgr) + precision = tp / (tp + fp) if tp + fp else 1.0 + recall = tp / (tp + fn) if tp + fn else 1.0 + return tp, fp, fn, precision, recall + + +def main( + target: Annotated[ + Path, typer.Option(help="cgr source to evaluate module-call attribution for.") + ] = Path(ec.DEFAULT_TARGET), + project_name: Annotated[str, typer.Option(help="cgr project name.")] = "", +) -> None: + target = target.resolve() + project = project_name or target.name + + logger.info("Building cgr module-call edges for {}", target) + cgr = cgr_module_calls(target, project) + logger.info("Building oracle module-call edges for {}", target) + oracle = oracle_module_calls(target, project) + + tp, fp, fn, precision, recall = score_module_calls(cgr, oracle) + table = Table(title="cgr L2 module-call attribution (ast oracle ground truth)") + for col in ("tp", "fp", "fn", "precision", "recall"): + table.add_column(col, justify="right") + table.add_row(str(tp), str(fp), str(fn), f"{precision:.4f}", f"{recall:.4f}") + console.print(table) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/oracles/__init__.py b/evals/oracles/__init__.py new file mode 100644 index 000000000..7e0c2d2eb --- /dev/null +++ b/evals/oracles/__init__.py @@ -0,0 +1,29 @@ +from .cpp_oracle import cpp_available, run_cpp_oracle +from .go_oracle import go_available, run_go_oracle +from .java_oracle import java_available, run_java_oracle +from .lua_oracle import lua_oracle_available, run_lua_oracle +from .php_oracle import php_oracle_available, run_php_oracle +from .rust_oracle 
import run_rust_oracle, rust_available +from .typescript_oracle import ( + run_javascript_oracle, + run_typescript_oracle, + typescript_available, +) + +__all__ = [ + "cpp_available", + "run_cpp_oracle", + "go_available", + "run_go_oracle", + "java_available", + "run_java_oracle", + "lua_oracle_available", + "run_lua_oracle", + "php_oracle_available", + "run_php_oracle", + "run_rust_oracle", + "rust_available", + "run_javascript_oracle", + "run_typescript_oracle", + "typescript_available", +] diff --git a/evals/oracles/_common.py b/evals/oracles/_common.py new file mode 100644 index 000000000..7ea487d29 --- /dev/null +++ b/evals/oracles/_common.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from pathlib import PurePosixPath + +from codebase_rag import constants as cs + +from .. import constants as ec +from ..types_defs import ( + DefNode, + EdgeKey, + GraphData, + NameEdge, + NodeKey, + OracleEdge, + OracleNameEdge, + OracleNodeRef, + OraclePayload, + OracleRecord, +) + + +def is_ignored(rel_file: str) -> bool: + # (H) Mirror cgr's directory-component ignore (path_utils.should_skip_path) + # (H) so an oracle grades the same file set cgr indexes. 
def records_to_edges(edges: list[OracleEdge]) -> set[EdgeKey]:
    # (H) Convert raw oracle edge records into EdgeKeys, dropping any edge with
    # (H) an endpoint in an ignored directory so both sides grade the same files.
    kept: set[EdgeKey] = set()
    for record in edges:
        endpoints = (record[ec.ORACLE_KEY_PARENT], record[ec.ORACLE_KEY_CHILD])
        if any(is_ignored(ref[ec.ORACLE_KEY_FILE]) for ref in endpoints):
            continue
        parent_ref, child_ref = endpoints
        kept.add(
            EdgeKey(
                record[ec.ORACLE_KEY_REL],
                _ref_to_key(parent_ref),
                _ref_to_key(child_ref),
            )
        )
    return kept
pathlib import Path +from typing import TYPE_CHECKING + +from codebase_rag import constants as cs + +from .. import constants as ec +from ..types_defs import ( + GraphData, + OracleEdge, + OracleNameEdge, + OracleNodeRef, + OraclePayload, + OracleRecord, +) +from ._common import payload_to_graph + +if TYPE_CHECKING: + from clang.cindex import Cursor + +# (H) The libclang oracle is authoritative C/C++ ground truth: driven by a +# (H) compile_commands.json it resolves #includes and expands macros to the true +# (H) translation-unit AST, which tree-sitter (cgr's parser) cannot do. cgr's +# (H) C/C++ nodes are graded against it on (kind, file, start_line). + +_CLASS = cs.NodeLabel.CLASS.value +_FUNCTION = cs.NodeLabel.FUNCTION.value +_METHOD = cs.NodeLabel.METHOD.value +_MODULE = cs.NodeLabel.MODULE.value +_DEFINES = cs.RelationshipType.DEFINES.value +_DEFINES_METHOD = cs.RelationshipType.DEFINES_METHOD.value +_INHERITS = cs.RelationshipType.INHERITS.value +_BASE_SPECIFIER = "CXX_BASE_SPECIFIER" + +_NodeId = tuple[str, str, int] +_EdgeId = tuple[str, str, int, str, int] +_NameEdgeId = tuple[str, str, int, str] + +# (H) libclang CursorKind members are registered dynamically (not static class +# (H) attributes), so map by the kind's stable NAME string — exactly what +# (H) `cursor.kind.name` yields at runtime — instead of `ci.CursorKind.CLASS_DECL`. 
def run_cpp_oracle(target: Path) -> GraphData:
    # (H) Build the libclang ground-truth graph for the C/C++ sources under
    # (H) `target`, driven by the compile_commands.json found in that directory.
    # (H) Translation units libclang cannot load are skipped.
    import os

    import clang.cindex as ci

    root = target.resolve()
    db = ci.CompilationDatabase.fromDirectory(str(root))
    index = ci.Index.create()
    nodes: dict[_NodeId, OracleRecord] = {}
    edges: dict[_EdgeId, OracleEdge] = {}
    name_edges: dict[_NameEdgeId, OracleNameEdge] = {}

    # (H) A compile command's relative paths (the source file, -I directories)
    # (H) are relative to that command's own `directory`, not to our cwd. Parse
    # (H) and walk each TU from there: _rel() resolves cursor file names against
    # (H) the cwd too, so both steps must agree on it.
    start_dir = Path.cwd()
    try:
        for command in db.getAllCompileCommands():
            try:
                # (H) `directory` is normally absolute; joining keeps an absolute
                # (H) value as-is. A relative one is taken relative to the
                # (H) database's directory (assumption, the format does not say).
                os.chdir(root / command.directory)
            except OSError:
                # (H) Unusable directory: fall back to the original cwd, which
                # (H) is where every command used to be parsed from.
                os.chdir(start_dir)
            # (H) Drop argv[0] (the compiler executable); libclang wants flags only.
            args = list(command.arguments)[1:]
            try:
                tu = index.parse(None, args=args)
            except ci.TranslationUnitLoadError:
                continue
            _walk(tu.cursor, root, nodes, edges, name_edges)
    finally:
        # (H) chdir is process-global; always hand the caller its cwd back.
        os.chdir(start_dir)

    payload = OraclePayload(
        nodes=list(nodes.values()),
        edges=list(edges.values()),
        name_edges=list(name_edges.values()),
    )
    return payload_to_graph(payload)
def _emit(
    cursor: Cursor,
    root: Path,
    nodes: dict[_NodeId, OracleRecord],
    edges: dict[_EdgeId, OracleEdge],
    name_edges: dict[_NameEdgeId, OracleNameEdge],
) -> None:
    # (H) Record one definition cursor: its node, its containment edge and, for
    # (H) a class, one INHERITS name edge per base specifier. Declarations,
    # (H) unmapped cursor kinds and cursors outside `root` are ignored.
    if not cursor.is_definition():
        return
    label = _KIND_BY_NAME.get(cursor.kind.name)
    if label is None:
        return
    source_file = cursor.location.file
    if source_file is None:
        return
    rel_path = _rel(source_file.name, root)
    if rel_path is None:
        return

    start = cursor.location.line
    node_id: _NodeId = (label, rel_path, start)
    # (H) A header is seen once per including TU; the first sighting wins.
    if node_id not in nodes:
        nodes[node_id] = OracleRecord(
            kind=label,
            file=rel_path,
            line=start,
            name=cursor.spelling,
            end_line=cursor.extent.end.line,
        )

    if label == _METHOD:
        # (H) A method hangs off its semantic parent (the class), which may live
        # (H) in another file for an out-of-line definition.
        owner = cursor.semantic_parent
        owner_file = None if owner is None else owner.location.file
        if owner_file is None:
            return
        owner_rel = _rel(owner_file.name, root)
        if owner_rel is not None:
            _add_edge(
                edges, _DEFINES_METHOD, _CLASS, owner_rel, owner.location.line, node_id
            )
        return

    _add_edge(edges, _DEFINES, _MODULE, rel_path, ec.MODULE_START_LINE, node_id)
    if label != _CLASS:
        return
    base_specs = (
        child for child in cursor.get_children() if child.kind.name == _BASE_SPECIFIER
    )
    for spec in base_specs:
        base = _base_simple_name(spec.type.spelling)
        edge_id: _NameEdgeId = (_INHERITS, rel_path, start, base)
        if edge_id in name_edges:
            continue
        name_edges[edge_id] = OracleNameEdge(
            rel=_INHERITS,
            source=OracleNodeRef(kind=_CLASS, file=rel_path, line=start),
            target_name=base,
        )
// Run: GO111MODULE=off go run go_ast.go <root-dir>
+type Edge struct { + Rel string `json:"rel"` + Parent NodeRef `json:"parent"` + Child NodeRef `json:"child"` +} + +// Payload is the oracle's stdout shape. +type Payload struct { + Nodes []Def `json:"nodes"` + Edges []Edge `json:"edges"` +} + +// ignoredDirs are skipped during the walk; they never hold first-party sources. +var ignoredDirs = map[string]bool{ + ".git": true, + "vendor": true, + "node_modules": true, + "testdata": true, +} + +const ( + kindFunction = "Function" + kindMethod = "Method" + kindClass = "Class" + kindInterface = "Interface" + kindType = "Type" + kindModule = "Module" + relDefines = "DEFINES" + relDefinesMeth = "DEFINES_METHOD" + moduleLine = 0 + goSuffix = ".go" +) + +func typeSpecKind(spec *ast.TypeSpec) string { + switch spec.Type.(type) { + case *ast.StructType: + return kindClass + case *ast.InterfaceType: + return kindInterface + default: + return kindType + } +} + +// baseTypeName strips pointer and generic instantiation wrappers off a receiver +// type expression, leaving the bare type name (e.g. *Point[T] -> "Point"). +func baseTypeName(expr ast.Expr) string { + switch t := expr.(type) { + case *ast.StarExpr: + return baseTypeName(t.X) + case *ast.IndexExpr: + return baseTypeName(t.X) + case *ast.IndexListExpr: + return baseTypeName(t.X) + case *ast.Ident: + return t.Name + } + return "" +} + +func recvTypeName(recv *ast.FieldList) string { + if recv == nil || len(recv.List) == 0 { + return "" + } + return baseTypeName(recv.List[0].Type) +} + +// parsedFile bundles a parsed source with its location data for the two passes. +type parsedFile struct { + fset *token.FileSet + file *ast.File + rel string + dir string +} + +// collectNodes records every declaration (including function-local types) so the +// node set is an apples-to-apples ground truth for cgr's node capture. 
+func collectNodes(pf parsedFile, defs *[]Def) { + ast.Inspect(pf.file, func(n ast.Node) bool { + switch d := n.(type) { + case *ast.FuncDecl: + kind := kindFunction + if d.Recv != nil { + kind = kindMethod + } + line := pf.fset.Position(d.Name.Pos()).Line + end := pf.fset.Position(d.End()).Line + *defs = append(*defs, Def{kind, pf.rel, line, end, d.Name.Name}) + case *ast.TypeSpec: + line := pf.fset.Position(d.Name.Pos()).Line + end := pf.fset.Position(d.End()).Line + *defs = append(*defs, Def{typeSpecKind(d), pf.rel, line, end, d.Name.Name}) + } + return true + }) +} + +// typeKey scopes a type name to its package directory; methods resolve their +// receiver type within the same package, which Go keeps in one directory. +func typeKey(dir, name string) string { + return dir + "\x00" + name +} + +// collectTypes records each top-level type's node so receiver methods can later +// point DEFINES_METHOD at the right (kind, file, line). +func collectTypes(pf parsedFile, types map[string]Def) { + for _, decl := range pf.file.Decls { + gen, ok := decl.(*ast.GenDecl) + if !ok || gen.Tok != token.TYPE { + continue + } + for _, spec := range gen.Specs { + ts, ok := spec.(*ast.TypeSpec) + if !ok { + continue + } + line := pf.fset.Position(ts.Name.Pos()).Line + end := pf.fset.Position(ts.End()).Line + types[typeKey(pf.dir, ts.Name.Name)] = Def{typeSpecKind(ts), pf.rel, line, end, ts.Name.Name} + } + } +} + +// collectEdges emits DEFINES for top-level funcs/types and DEFINES_METHOD for +// receiver methods, mirroring cgr's per-file module containment. 
+func collectEdges(pf parsedFile, types map[string]Def, edges *[]Edge) { + module := NodeRef{kindModule, pf.rel, moduleLine} + for _, decl := range pf.file.Decls { + switch d := decl.(type) { + case *ast.FuncDecl: + line := pf.fset.Position(d.Name.Pos()).Line + if d.Recv == nil { + child := NodeRef{kindFunction, pf.rel, line} + *edges = append(*edges, Edge{relDefines, module, child}) + continue + } + owner, ok := types[typeKey(pf.dir, recvTypeName(d.Recv))] + if !ok { + continue + } + parent := NodeRef{owner.Kind, owner.File, owner.Line} + child := NodeRef{kindMethod, pf.rel, line} + *edges = append(*edges, Edge{relDefinesMeth, parent, child}) + case *ast.GenDecl: + if d.Tok != token.TYPE { + continue + } + for _, spec := range d.Specs { + ts, ok := spec.(*ast.TypeSpec) + if !ok { + continue + } + line := pf.fset.Position(ts.Name.Pos()).Line + child := NodeRef{typeSpecKind(ts), pf.rel, line} + *edges = append(*edges, Edge{relDefines, module, child}) + } + } + } +} + +func main() { + root := os.Args[1] + var parsed []parsedFile + _ = filepath.Walk(root, func(path string, info os.FileInfo, err error) error { + if err != nil { + return nil + } + if info.IsDir() { + if ignoredDirs[info.Name()] { + return filepath.SkipDir + } + return nil + } + if !strings.HasSuffix(path, goSuffix) { + return nil + } + fset := token.NewFileSet() + file, perr := parser.ParseFile(fset, path, nil, 0) + if perr != nil { + return nil + } + rel, rerr := filepath.Rel(root, path) + if rerr != nil { + rel = path + } + rel = filepath.ToSlash(rel) + parsed = append(parsed, parsedFile{fset, file, rel, filepath.ToSlash(filepath.Dir(rel))}) + return nil + }) + + types := map[string]Def{} + for _, pf := range parsed { + collectTypes(pf, types) + } + + defs := []Def{} + edges := []Edge{} + for _, pf := range parsed { + collectNodes(pf, &defs) + collectEdges(pf, types, &edges) + } + _ = json.NewEncoder(os.Stdout).Encode(Payload{Nodes: defs, Edges: edges}) +} diff --git a/evals/oracles/go_oracle.py 
def run_go_oracle(target: Path) -> GraphData:
    # (H) Run the bundled Go AST oracle over `target` via `go run` with module
    # (H) mode switched off, then convert its JSON stdout into GraphData. A
    # (H) non-zero exit raises CalledProcessError (check=True).
    command = [ec.GO_BIN, ec.GO_RUN, str(_ORACLE_GO), str(target)]
    environment = dict(os.environ)
    environment[ec.GO_MODULE_ENV] = ec.GO_MODULE_OFF
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
        env=environment,
    )
    # (H) Empty stdout is read as an empty payload instead of a JSON error.
    raw = completed.stdout if completed.stdout else "{}"
    payload: OraclePayload = json.loads(raw)
    return payload_to_graph(payload)
def _ensure_compiled() -> None:
    # (H) Compile Oracle.java in place unless an up-to-date class file exists.
    # (H) A class file older than the source is stale (the oracle was edited
    # (H) after the last build) and is rebuilt instead of being silently reused.
    # (H) Does nothing when javac is not on PATH; a failed compile raises
    # (H) CalledProcessError (check=True).
    if _CLASS.is_file():
        try:
            fresh = _CLASS.stat().st_mtime >= _SOURCE.stat().st_mtime
        except OSError:
            # (H) Source missing or unreadable: keep the existing class file.
            fresh = True
        if fresh:
            return
    javac = shutil.which(ec.JAVAC_BIN)
    if javac is None:
        return
    subprocess.run(
        [javac, str(_SOURCE)],
        cwd=str(_ORACLE_DIR),
        capture_output=True,
        text=True,
        check=True,
    )
// Mapping (Java construct -> cgr NodeLabel):
//
//   class / @interface   -> Class      (cgr models an annotation type as a Class)
//   interface            -> Interface  (its method signatures -> Method)
//   enum                 -> Enum
//   method / constructor -> Method
static final List edges = new ArrayList<>(); + static final List nameEdges = new ArrayList<>(); + static final long MODULE_LINE = 0; + + static String esc(String s) { + return s.replace("\\", "\\\\").replace("\"", "\\\""); + } + + // (H) Simple name of an extends/implements type: drop generics and any + // (H) package/outer qualifier, matching how cgr resolves bases by simple name. + static String simpleName(Object typeTree) { + String s = typeTree.toString(); + int lt = s.indexOf('<'); + if (lt >= 0) { + s = s.substring(0, lt); + } + int dot = s.lastIndexOf('.'); + if (dot >= 0) { + s = s.substring(dot + 1); + } + return s.trim(); + } + + static void emitNameEdge( + String rel, String file, String skind, long sline, String targetName) { + nameEdges.add("{\"rel\":\"" + rel + "\",\"source\":{\"kind\":\"" + skind + + "\",\"file\":\"" + esc(file) + "\",\"line\":" + sline + + "},\"target_name\":\"" + esc(targetName) + "\"}"); + } + + static void emit(String kind, String file, long line, long endLine, String name) { + recs.add("{\"kind\":\"" + kind + "\",\"file\":\"" + esc(file) + + "\",\"line\":" + line + ",\"end_line\":" + endLine + + ",\"name\":\"" + esc(name) + "\"}"); + } + + static void emitEdge( + String rel, String file, String pkind, long pline, String ckind, long cline) { + edges.add("{\"rel\":\"" + rel + "\",\"parent\":{\"kind\":\"" + pkind + + "\",\"file\":\"" + esc(file) + "\",\"line\":" + pline + + "},\"child\":{\"kind\":\"" + ckind + "\",\"file\":\"" + esc(file) + + "\",\"line\":" + cline + "}}"); + } + + static String classKind(ClassTree node) { + switch (node.getKind()) { + case INTERFACE: + return "Interface"; + case ENUM: + return "Enum"; + // (H) cgr models an annotation type (@interface) as a Class. 
+ default: + return "Class"; + } + } + + public static void main(String[] args) throws Exception { + Path root = Paths.get(args[0]).toAbsolutePath().normalize(); + List files = new ArrayList<>(); + Files.walkFileTree(root, new SimpleFileVisitor() { + public FileVisitResult preVisitDirectory(Path d, BasicFileAttributes a) { + Path name = d.getFileName(); + if (name != null && IGNORED.contains(name.toString())) { + return FileVisitResult.SKIP_SUBTREE; + } + return FileVisitResult.CONTINUE; + } + + public FileVisitResult visitFile(Path f, BasicFileAttributes a) { + if (f.toString().endsWith(".java")) { + files.add(f); + } + return FileVisitResult.CONTINUE; + } + }); + if (files.isEmpty()) { + System.out.print("{\"nodes\":[],\"edges\":[],\"name_edges\":[]}"); + return; + } + + JavaCompiler compiler = ToolProvider.getSystemJavaCompiler(); + StandardJavaFileManager fm = compiler.getStandardFileManager(null, null, null); + Iterable units = fm.getJavaFileObjectsFromPaths(files); + JavacTask task = (JavacTask) compiler.getTask(null, fm, d -> {}, null, null, units); + SourcePositions sp = Trees.instance(task).getSourcePositions(); + + for (CompilationUnitTree unit : task.parse()) { + Path abs = Paths.get(unit.getSourceFile().toUri()); + String rel = root.relativize(abs).toString().replace('\\', '/'); + LineMap lm = unit.getLineMap(); + new TreePathScanner() { + public Void visitClass(ClassTree node, Void p) { + long pos = sp.getStartPosition(unit, node); + // (H) Anonymous classes have an empty name and no cgr node. + if (pos >= 0 && node.getSimpleName().length() > 0) { + long line = lm.getLineNumber(pos); + long endLine = lm.getLineNumber(sp.getEndPosition(unit, node)); + String kind = classKind(node); + emit(kind, rel, line, endLine, node.getSimpleName().toString()); + // (H) Every named type is DEFINEd by the file module, + // (H) including nested types (cgr keeps this flat). 
+ emitEdge("DEFINES", rel, "Module", MODULE_LINE, kind, line); + // (H) extends superclass -> INHERITS (a class only). + if (node.getExtendsClause() != null) { + emitNameEdge("INHERITS", rel, kind, line, + simpleName(node.getExtendsClause())); + } + // (H) The implements clause holds a class/enum's interfaces + // (H) (-> IMPLEMENTS) but an interface's superinterfaces + // (H) (-> INHERITS, like cgr). + String hrel = node.getKind() == Tree.Kind.INTERFACE + ? "INHERITS" : "IMPLEMENTS"; + for (Tree it : node.getImplementsClause()) { + emitNameEdge(hrel, rel, kind, line, simpleName(it)); + } + } + return super.visitClass(node, p); + } + + public Void visitMethod(MethodTree node, Void p) { + long pos = sp.getStartPosition(unit, node); + if (pos >= 0) { + // (H) cgr labels a member a Method only when its nearest + // (H) enclosing named class precedes any enclosing method or + // (H) lambda body; members of an anonymous class (declared in + // (H) a method body) are modelled as standalone Functions. + String kind = "Function"; + ClassTree owner = null; + for (TreePath up = getCurrentPath().getParentPath(); + up != null; up = up.getParentPath()) { + Tree t = up.getLeaf(); + if (t instanceof ClassTree + && ((ClassTree) t).getSimpleName().length() > 0) { + kind = "Method"; + owner = (ClassTree) t; + break; + } + if (t instanceof MethodTree || t instanceof LambdaExpressionTree) { + break; + } + } + long line = lm.getLineNumber(pos); + long endLine = lm.getLineNumber(sp.getEndPosition(unit, node)); + emit(kind, rel, line, endLine, node.getName().toString()); + // (H) A Method binds to its enclosing named type; an + // (H) anonymous-class member (Function) has no such edge. 
+ if (owner != null) { + long opos = sp.getStartPosition(unit, owner); + if (opos >= 0) { + emitEdge("DEFINES_METHOD", rel, classKind(owner), + lm.getLineNumber(opos), "Method", line); + } + } + } + return super.visitMethod(node, p); + } + }.scan(unit, null); + } + System.out.print("{\"nodes\":[" + String.join(",", recs) + + "],\"edges\":[" + String.join(",", edges) + + "],\"name_edges\":[" + String.join(",", nameEdges) + "]}"); + } +} diff --git a/evals/oracles/lua_oracle.py b/evals/oracles/lua_oracle.py new file mode 100644 index 000000000..1cd331316 --- /dev/null +++ b/evals/oracles/lua_oracle.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import json +import shutil +import subprocess +from pathlib import Path + +from .. import constants as ec +from ..types_defs import GraphData, OraclePayload +from ._common import payload_to_graph + +_ORACLE_DIR = Path(__file__).parent / ec.LUA_ORACLE_DIRNAME +_SCRIPT = _ORACLE_DIR / ec.LUA_ORACLE_SCRIPT +_NODE_MODULES = _ORACLE_DIR / ec.NODE_MODULES_DIRNAME + + +def lua_oracle_available() -> bool: + return ( + shutil.which(ec.NODE_BIN) is not None and shutil.which(ec.NPM_BIN) is not None + ) + + +def _ensure_deps() -> None: + if _NODE_MODULES.is_dir(): + return + npm = shutil.which(ec.NPM_BIN) + if npm is None: + return + subprocess.run( + [npm, ec.NPM_INSTALL, *ec.NPM_FLAGS], + cwd=str(_ORACLE_DIR), + capture_output=True, + text=True, + check=True, + ) + + +def run_lua_oracle(target: Path) -> GraphData: + _ensure_deps() + node = shutil.which(ec.NODE_BIN) + if node is None: + return GraphData(nodes={}, edges=set(), name_edges=set()) + proc = subprocess.run( + [node, str(_SCRIPT), str(target)], + capture_output=True, + text=True, + check=True, + ) + payload: OraclePayload = json.loads(proc.stdout or "{}") + return payload_to_graph(payload) diff --git a/evals/oracles/lua_oracle/lua_ast.js b/evals/oracles/lua_oracle/lua_ast.js new file mode 100644 index 000000000..521ebbae9 --- /dev/null +++ 
b/evals/oracles/lua_oracle/lua_ast.js @@ -0,0 +1,77 @@ +// Authoritative Lua structure oracle for the cgr eval harness. +// +// Parses every .lua file with luaparse and emits one JSON record per function +// declaration/expression, in cgr's NodeLabel vocabulary. Lua has no classes, so +// cgr models every function (global, local, table `t.f`, method `t:m`, and +// anonymous function expressions) as a Function node, joined on (kind, file, line). +// +// Containment edges: Lua has no classes/methods, so the only edge is DEFINES, +// from the enclosing function (for a nested function) else the file module +// (keyed at line 0) -> Function. Output is a {nodes, edges} payload. +// +// Run: node lua_ast.js + +const luaparse = require("luaparse"); +const fs = require("fs"); +const path = require("path"); + +const IGNORED = new Set([".git", "node_modules", "vendor"]); +const MODULE_LINE = 0; +const nodes = []; +const edges = []; + +function walk(node, file, parentRef) { + if (node === null || typeof node !== "object") return; + if (Array.isArray(node)) { + for (const c of node) walk(c, file, parentRef); + return; + } + if (node.type === "FunctionDeclaration" && node.loc) { + const line = node.loc.start.line; + nodes.push({ kind: "Function", file, line, end_line: node.loc.end.line, name: "fn" }); + edges.push({ + rel: "DEFINES", + parent: { kind: parentRef.kind, file, line: parentRef.line }, + child: { kind: "Function", file, line }, + }); + // (H) Functions nested in this one bind to it (its lexical parent). 
// (H) Recursively visit `dir`, parsing every .lua file outside the IGNORED
// (H) directories and feeding its AST to walk() under a `root`-relative,
// (H) forward-slash path. A file that cannot be read or parsed is skipped so
// (H) one bad entry does not abort the whole oracle run.
function visitDir(dir, root) {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const p = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (!IGNORED.has(entry.name)) visitDir(p, root);
      continue;
    }
    if (!entry.name.endsWith(".lua")) continue;
    try {
      // (H) Read inside the try: a dangling symlink or permission error must
      // (H) be skipped just like a parse failure.
      const src = fs.readFileSync(p, "utf8");
      // luaVersion 5.3 enables bitwise operators / integer division so the
      // oracle parses the same modern Lua that cgr's tree-sitter grammar does.
      const ast = luaparse.parse(src, {
        locations: true,
        comments: false,
        luaVersion: "5.3",
      });
      const rel = path.relative(root, p).split(path.sep).join("/");
      walk(ast, rel, { kind: "Module", line: MODULE_LINE });
    } catch (e) {
      // skip files that cannot be read or that luaparse cannot parse
    }
  }
}
def run_php_oracle(target: Path) -> GraphData:
    # (H) Install the oracle's npm dependencies if needed, run the PHP AST script
    # (H) under node against `target`, and convert its JSON stdout to GraphData.
    # (H) Without node on PATH an empty graph is returned; a non-zero exit
    # (H) raises CalledProcessError (check=True).
    _ensure_deps()
    node_bin = shutil.which(ec.NODE_BIN)
    if node_bin is not None:
        result = subprocess.run(
            [node_bin, str(_SCRIPT), str(target)],
            capture_output=True,
            text=True,
            check=True,
        )
        # (H) Empty stdout is read as an empty payload instead of a JSON error.
        payload: OraclePayload = json.loads(result.stdout or "{}")
        return payload_to_graph(payload)
    return GraphData(nodes={}, edges=set(), name_edges=set())
b/evals/oracles/php_oracle/package-lock.json @@ -0,0 +1,24 @@ +{ + "name": "php_oracle", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "php_oracle", + "version": "0.1.0", + "dependencies": { + "php-parser": "^3.2.5" + }, + "bin": { + "php_oracle": "php_ast.js" + } + }, + "node_modules/php-parser": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/php-parser/-/php-parser-3.7.0.tgz", + "integrity": "sha512-JRc1t78GZAEa+MuzVC5A5RJS1NDFTS4UnprUEu/NnsN9cyHbGZLUqghO9IQZUSCay62HYQiWd3PxyWAEF45zmA==", + "license": "BSD-3-Clause" + } + } +} diff --git a/evals/oracles/php_oracle/package.json b/evals/oracles/php_oracle/package.json new file mode 100644 index 000000000..7cd287fdd --- /dev/null +++ b/evals/oracles/php_oracle/package.json @@ -0,0 +1,10 @@ +{ + "name": "php_oracle", + "version": "0.1.0", + "private": true, + "description": "Authoritative PHP structure oracle for the cgr eval harness", + "bin": { "php_oracle": "php_ast.js" }, + "dependencies": { + "php-parser": "^3.2.5" + } +} diff --git a/evals/oracles/php_oracle/php_ast.js b/evals/oracles/php_oracle/php_ast.js new file mode 100644 index 000000000..a62b54da9 --- /dev/null +++ b/evals/oracles/php_oracle/php_ast.js @@ -0,0 +1,220 @@ +// Authoritative PHP structure oracle for the cgr eval harness. +// +// Parses every .php file with php-parser (a pure-JS PHP parser) and emits one +// JSON record per declaration, in cgr's NodeLabel vocabulary, joined on +// (kind, file, line). 
+// +// Mapping (PHP construct -> cgr NodeLabel), matching how cgr models PHP: +// +// class -> Class +// interface -> Interface (+ its methods -> Method) +// trait -> Class (cgr models a trait as a Class) +// enum -> Enum +// method (in named type) -> Method +// method (in anonymous class) -> Function (cgr models these as Functions) +// function -> Function +// closure / arrow fn -> Function (anonymous) +// +// A declaration's line is the line of its first attribute (`#[Attr]`) when +// present, matching cgr's node span; anonymous classes (`new class {...}`) get +// no Class node, like cgr. +// +// Containment edges (matching how cgr models PHP containment): +// +// DEFINES : the file module -> every named type and top-level function +// DEFINES_METHOD : the enclosing named type -> Method +// +// cgr keeps type containment flat (the file module DEFINES every named type, +// keyed at line 0); a Method binds to its enclosing class/interface/trait/enum; +// a Function/closure binds to its nearest enclosing function, else the module. +// An anonymous-class member is a Function (no DEFINES_METHOD). Output is a +// {nodes, edges} payload joining cgr on (kind, file, line). 
// Queue one declaration node; every PHP node carries the placeholder name "decl".
function emit(kind, file, line, endLine) {
  const record = { kind, file, line, end_line: endLine, name: "decl" };
  nodes.push(record);
}

// Queue one containment edge between two (kind, file, line) keys in the same file.
function emitEdge(rel, file, pkind, pline, ckind, cline) {
  const parent = { kind: pkind, file, line: pline };
  const child = { kind: ckind, file, line: cline };
  edges.push({ rel, parent, child });
}

// Queue one edge whose target is known only by its simple name.
function emitNameEdge(rel, file, skind, sline, targetName) {
  const source = { kind: skind, file, line: sline };
  nameEdges.push({ rel, source, target_name: targetName });
}

// (H) Reduce a php-parser Name ref to its last namespace segment, matching
// (H) how cgr resolves bases by simple name (e.g. \App\Base -> Base).
// (H) A missing ref or an empty name yields "".
function phpSimpleName(ref) {
  if (!ref || !ref.name) return "";
  const segments = ref.name.split("\\");
  return segments[segments.length - 1];
}

// Normalise "nothing / one ref / array of refs" into an array.
function asList(refs) {
  if (Array.isArray(refs)) return refs;
  return refs ? [refs] : [];
}

// (H) class extends -> INHERITS, implements -> IMPLEMENTS; interface extends
// (H) (an array) -> INHERITS (cgr models superinterfaces as inheritance).
// Emit one name edge per base: `extends` entries become INHERITS and
// `implements` entries become IMPLEMENTS, all sourced at (kind, file, line).
function emitInheritance(node, file, kind, line) {
  asList(node.extends).forEach((ref) => {
    emitNameEdge("INHERITS", file, kind, line, phpSimpleName(ref));
  });
  asList(node.implements).forEach((ref) => {
    emitNameEdge("IMPLEMENTS", file, kind, line, phpSimpleName(ref));
  });
}

// Line a declaration is keyed on: its own start line, or the earliest
// attribute-group line when one starts above it.
function declLine(node) {
  const own = node.loc.start.line;
  if (!Array.isArray(node.attrGroups)) return own;
  return node.attrGroups.reduce(
    (best, group) =>
      group.loc && group.loc.start.line < best ? group.loc.start.line : best,
    own
  );
}

// A class counts as anonymous when flagged so or when it has a null name.
function isAnonymous(node) {
  return node.name === null || node.isAnonymous === true;
}

// Recurse into every property of `node` except its position info.
function walkChildren(node, file, ctx) {
  Object.keys(node)
    .filter((key) => key !== "loc")
    .forEach((key) => walk(node[key], file, ctx));
}

// ctx: { container, typeRef, funcRef }
//   container: "module" | "class" | "anon" | "function"
//   typeRef:   enclosing named type {kind,line} (DEFINES_METHOD parent)
//   funcRef:   enclosing function {kind,line}   (DEFINES parent for nested fns)
function defineFunctionEdge(file, ctx, kind, line) {
  if (kind !== "Method") {
    const owner = ctx.funcRef || { kind: "Module", line: MODULE_LINE };
    emitEdge("DEFINES", file, owner.kind, owner.line, "Function", line);
    return;
  }
  // A Method with no enclosing named type gets no edge at all.
  if (!ctx.typeRef) return;
  emitEdge("DEFINES_METHOD", file, ctx.typeRef.kind, ctx.typeRef.line, "Method", line);
}

// Record a named type (class/interface/trait/enum) under the file module and
// descend into its members with that type as the DEFINES_METHOD parent.
function walkNamedType(node, file, label, withInheritance) {
  const line = declLine(node);
  emit(label, file, line, node.loc.end.line);
  emitEdge("DEFINES", file, "Module", MODULE_LINE, label, line);
  if (withInheritance) emitInheritance(node, file, label, line);
  walkChildren(node, file, {
    container: "class",
    typeRef: { kind: label, line },
    funcRef: null,
  });
}

// Record a function-like node and descend with it as the enclosing function.
function walkCallable(node, file, ctx, kind, line) {
  emit(kind, file, line, node.loc.end.line);
  defineFunctionEdge(file, ctx, kind, line);
  walkChildren(node, file, {
    container: "function",
    typeRef: null,
    funcRef: { kind, line },
  });
}

// Depth-first traversal of a php-parser AST, dispatching on `node.kind`.
function walk(node, file, ctx) {
  if (node === null || typeof node !== "object") return;
  if (Array.isArray(node)) {
    node.forEach((child) => walk(child, file, ctx));
    return;
  }
  const tag = node.kind;
  if (tag === "class") {
    if (isAnonymous(node)) {
      // (H) Anonymous class: no node; its methods are Functions bound to the
      // (H) enclosing function/module, so keep funcRef and mark the container.
      walkChildren(node, file, { container: "anon", typeRef: null, funcRef: ctx.funcRef });
    } else {
      walkNamedType(node, file, "Class", true);
    }
  } else if (tag === "interface") {
    walkNamedType(node, file, "Interface", true);
  } else if (tag === "trait") {
    // Traits are recorded as Class and emit no inheritance edges.
    walkNamedType(node, file, "Class", false);
  } else if (tag === "enum") {
    walkNamedType(node, file, "Enum", true);
  } else if (tag === "method") {
    const label = ctx.container === "anon" ? "Function" : "Method";
    walkCallable(node, file, ctx, label, declLine(node));
  } else if (tag === "function") {
    walkCallable(node, file, ctx, "Function", declLine(node));
  } else if (tag === "closure" || tag === "arrowfunc") {
    // Closures and arrow functions key on their own start line, not on attributes.
    walkCallable(node, file, ctx, "Function", node.loc.start.line);
  } else {
    walkChildren(node, file, ctx);
  }
}

// Recursively parse every .php file under `dir`, skipping IGNORED directories.
function visitDir(dir, root, parser) {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  entries.forEach((entry) => {
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      if (IGNORED.has(entry.name)) return;
      visitDir(full, root, parser);
      return;
    }
    if (!entry.name.endsWith(".php")) return;
    try {
      const source = fs.readFileSync(full, "utf8");
      const ast = parser.parseCode(source);
      // Paths are reported relative to the root with forward slashes.
      const rel = path.relative(root, full).split(path.sep).join("/");
      walk(ast, rel, { container: "module", typeRef: null, funcRef: null });
    } catch (e) {
      // skip files php-parser cannot parse
    }
  });
}
+# It is not intended for manual editing. +version = 4 + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rs_oracle" +version = "0.1.0" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" diff --git a/evals/oracles/rs_oracle/Cargo.toml b/evals/oracles/rs_oracle/Cargo.toml new file mode 100644 index 000000000..6381c7979 --- /dev/null +++ b/evals/oracles/rs_oracle/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "rs_oracle" +version = "0.1.0" +edition = "2021" + +[[bin]] +name = "rs_oracle" +path = "src/main.rs" + +[dependencies] +syn = { version = "2", features = ["full", "visit"] } +proc-macro2 = { version = "1", features = ["span-locations"] } diff --git a/evals/oracles/rs_oracle/src/main.rs b/evals/oracles/rs_oracle/src/main.rs new file mode 100644 index 000000000..23b290378 --- /dev/null +++ b/evals/oracles/rs_oracle/src/main.rs @@ -0,0 +1,484 @@ +// Authoritative Rust structure oracle for the cgr eval harness. +// +// Parses every .rs file under a directory with `syn` (the de-facto standard Rust +// parser) and emits a JSON payload {nodes, edges}. 
Node "kind" fields use cgr's +// NodeLabel vocabulary and edges use cgr's RelationshipType vocabulary, so both +// join cgr's graph on (kind, file, line). +// +// Mapping (Rust item -> cgr NodeLabel): +// +// struct -> Class +// enum -> Enum +// union -> Union +// trait -> Interface (its methods -> Method) +// type alias -> Type +// fn -> Function (free fns, including those nested in fn bodies) +// impl method -> Method +// +// Containment edges (matching how cgr models Rust containment): +// +// DEFINES : enclosing module -> item / nested module +// DEFINES_METHOD : the method's owner type (or trait) -> Method +// +// cgr models a Rust module per file (keyed at line 0) plus a Module node per +// inline `mod` (keyed at its declaration line). An item inside `mod inner` is +// DEFINEd by the inner module; an impl method binds to its target type resolved +// within the impl's enclosing module path (falling back to ancestor modules). +// +// The node walk uses `syn::visit::Visit` so function-local definitions and +// closures are captured too; edges use an explicit item recursion that tracks +// the enclosing module, which is what carries containment. 
// Escape `s` for embedding inside a JSON string literal.
//
// Backslash and double quote are escaped, and so is every control character
// below U+0020, which JSON forbids unescaped inside a string. Without this, a
// path or name containing a newline or tab would make the payload unparseable.
fn esc(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
            c => out.push(c),
        }
    }
    out
}

// Serialize one node record. `kind` is written verbatim; `file` and `name`
// are escaped.
fn node_json(kind: &str, file: &str, line: usize, end_line: usize, name: &str) -> String {
    format!(
        "{{\"kind\":\"{}\",\"file\":\"{}\",\"line\":{},\"end_line\":{},\"name\":\"{}\"}}",
        kind,
        esc(file),
        line,
        end_line,
        esc(name)
    )
}

// Serialize one containment edge. Parent and child share the same `file`.
fn edge_json(
    rel: &str,
    file: &str,
    pkind: &str,
    pline: usize,
    ckind: &str,
    cline: usize,
) -> String {
    format!(
        "{{\"rel\":\"{}\",\"parent\":{{\"kind\":\"{}\",\"file\":\"{}\",\"line\":{}}},\"child\":{{\"kind\":\"{}\",\"file\":\"{}\",\"line\":{}}}}}",
        rel,
        pkind,
        esc(file),
        pline,
        ckind,
        esc(file),
        cline
    )
}

// Serialize one edge whose target is known only by name.
fn name_edge_json(
    rel: &str,
    file: &str,
    skind: &str,
    sline: usize,
    target_name: &str,
) -> String {
    format!(
        "{{\"rel\":\"{}\",\"source\":{{\"kind\":\"{}\",\"file\":\"{}\",\"line\":{}}},\"target_name\":\"{}\"}}",
        rel,
        skind,
        esc(file),
        sline,
        esc(target_name)
    )
}

// (H) Last path segment of a trait reference (`a::b::Trait` / `Trait` -> Trait).
+fn trait_path_name(path: &syn::Path) -> Option { + path.segments.last().map(|s| s.ident.to_string()) +} + +// ---- node collection (every declaration, including nested/closures) ---- + +struct NodeCollector<'a> { + file: &'a str, + out: &'a mut Vec, +} + +impl<'a> NodeCollector<'a> { + fn emit(&mut self, kind: &str, line: usize, end_line: usize, name: &str) { + self.out.push(node_json(kind, self.file, line, end_line, name)); + } +} + +impl<'ast, 'a> Visit<'ast> for NodeCollector<'a> { + fn visit_item_struct(&mut self, node: &'ast syn::ItemStruct) { + self.emit(KIND_CLASS, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_item_struct(self, node); + } + fn visit_item_enum(&mut self, node: &'ast syn::ItemEnum) { + self.emit(KIND_ENUM, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_item_enum(self, node); + } + fn visit_item_union(&mut self, node: &'ast syn::ItemUnion) { + self.emit(KIND_UNION, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_item_union(self, node); + } + fn visit_item_type(&mut self, node: &'ast syn::ItemType) { + self.emit(KIND_TYPE, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_item_type(self, node); + } + fn visit_impl_item_type(&mut self, node: &'ast syn::ImplItemType) { + self.emit(KIND_TYPE, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_impl_item_type(self, node); + } + fn visit_trait_item_type(&mut self, node: &'ast syn::TraitItemType) { + self.emit(KIND_TYPE, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_trait_item_type(self, node); + } + fn visit_expr_closure(&mut self, node: &'ast syn::ExprClosure) { + self.emit(KIND_FUNCTION, node.span().start().line, node.span().end().line, "closure"); + 
syn::visit::visit_expr_closure(self, node); + } + fn visit_item_trait(&mut self, node: &'ast syn::ItemTrait) { + self.emit(KIND_INTERFACE, node.ident.span().start().line, node.span().end().line, &node.ident.to_string()); + syn::visit::visit_item_trait(self, node); + } + fn visit_item_fn(&mut self, node: &'ast syn::ItemFn) { + self.emit(KIND_FUNCTION, node.sig.ident.span().start().line, node.span().end().line, &node.sig.ident.to_string()); + syn::visit::visit_item_fn(self, node); + } + fn visit_impl_item_fn(&mut self, node: &'ast syn::ImplItemFn) { + self.emit(KIND_METHOD, node.sig.ident.span().start().line, node.span().end().line, &node.sig.ident.to_string()); + syn::visit::visit_impl_item_fn(self, node); + } + fn visit_trait_item_fn(&mut self, node: &'ast syn::TraitItemFn) { + self.emit(KIND_METHOD, node.sig.ident.span().start().line, node.span().end().line, &node.sig.ident.to_string()); + syn::visit::visit_trait_item_fn(self, node); + } +} + +// ---- closure containment ---- +// +// (H) A closure is DEFINEd by the nearest enclosing function-like scope: a free +// (H) fn or another closure (Function), or an impl/trait method (Method); at item +// (H) scope it falls back to the enclosing module. This mirrors cgr, which routes +// (H) every closure through its free-function path and binds it to its lexical +// (H) parent. The walk keeps a stack of enclosing function-likes so nested +// (H) closures bind to the closure that contains them, not the outer method. 
+ +struct ClosureEdges<'a> { + file: &'a str, + edges: &'a mut Vec, + stack: Vec<(&'static str, usize)>, + module_line: usize, +} + +impl<'ast, 'a> Visit<'ast> for ClosureEdges<'a> { + fn visit_item_mod(&mut self, node: &'ast syn::ItemMod) { + if node.content.is_some() { + let saved = self.module_line; + self.module_line = node.ident.span().start().line; + syn::visit::visit_item_mod(self, node); + self.module_line = saved; + } + } + fn visit_item_fn(&mut self, node: &'ast syn::ItemFn) { + self.stack.push((KIND_FUNCTION, node.sig.ident.span().start().line)); + syn::visit::visit_item_fn(self, node); + self.stack.pop(); + } + fn visit_impl_item_fn(&mut self, node: &'ast syn::ImplItemFn) { + self.stack.push((KIND_METHOD, node.sig.ident.span().start().line)); + syn::visit::visit_impl_item_fn(self, node); + self.stack.pop(); + } + fn visit_trait_item_fn(&mut self, node: &'ast syn::TraitItemFn) { + self.stack.push((KIND_METHOD, node.sig.ident.span().start().line)); + syn::visit::visit_trait_item_fn(self, node); + self.stack.pop(); + } + fn visit_expr_closure(&mut self, node: &'ast syn::ExprClosure) { + let cline = node.span().start().line; + let (pkind, pline) = self + .stack + .last() + .copied() + .unwrap_or((KIND_MODULE, self.module_line)); + self.edges.push(edge_json( + REL_DEFINES, self.file, pkind, pline, KIND_FUNCTION, cline, + )); + self.stack.push((KIND_FUNCTION, cline)); + syn::visit::visit_expr_closure(self, node); + self.stack.pop(); + } +} + +// ---- edge collection (containment) ---- + +fn type_table_key(modpath: &str, name: &str) -> String { + format!("{}\u{0}{}", modpath, name) +} + +// collect_types records each module-scoped type so an impl can resolve its +// target to the type's (kind, line). 
+fn collect_types(items: &[syn::Item], modpath: &str, table: &mut HashMap) { + for item in items { + match item { + syn::Item::Struct(s) => { + table.insert( + type_table_key(modpath, &s.ident.to_string()), + (KIND_CLASS.into(), s.ident.span().start().line), + ); + } + syn::Item::Enum(e) => { + table.insert( + type_table_key(modpath, &e.ident.to_string()), + (KIND_ENUM.into(), e.ident.span().start().line), + ); + } + syn::Item::Union(u) => { + table.insert( + type_table_key(modpath, &u.ident.to_string()), + (KIND_UNION.into(), u.ident.span().start().line), + ); + } + syn::Item::Type(t) => { + table.insert( + type_table_key(modpath, &t.ident.to_string()), + (KIND_TYPE.into(), t.ident.span().start().line), + ); + } + syn::Item::Trait(tr) => { + table.insert( + type_table_key(modpath, &tr.ident.to_string()), + (KIND_INTERFACE.into(), tr.ident.span().start().line), + ); + } + syn::Item::Mod(m) => { + if let Some((_, content)) = &m.content { + let child = child_modpath(modpath, &m.ident.to_string()); + collect_types(content, &child, table); + } + } + _ => {} + } + } +} + +fn child_modpath(modpath: &str, name: &str) -> String { + if modpath.is_empty() { + name.to_string() + } else { + format!("{}::{}", modpath, name) + } +} + +// resolve_type finds a type by name starting in modpath and walking outward to +// ancestor modules and the crate root (Rust name resolution is lexical). +fn resolve_type( + modpath: &str, + name: &str, + table: &HashMap, +) -> Option<(String, usize)> { + let mut parts: Vec<&str> = if modpath.is_empty() { + Vec::new() + } else { + modpath.split("::").collect() + }; + loop { + let mp = parts.join("::"); + if let Some(v) = table.get(&type_table_key(&mp, name)) { + return Some(v.clone()); + } + if parts.is_empty() { + break; + } + parts.pop(); + } + None +} + +// impl_target_name pulls the bare type name off an impl's self type. 
+fn impl_target_name(ty: &syn::Type) -> Option { + match ty { + syn::Type::Path(tp) => tp.path.segments.last().map(|s| s.ident.to_string()), + syn::Type::Reference(r) => impl_target_name(&r.elem), + _ => None, + } +} + +fn process_edges( + items: &[syn::Item], + file: &str, + module_line: usize, + modpath: &str, + table: &HashMap, + edges: &mut Vec, + name_edges: &mut Vec, +) { + for item in items { + match item { + syn::Item::Struct(s) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_CLASS, s.ident.span().start().line, + )), + syn::Item::Enum(e) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_ENUM, e.ident.span().start().line, + )), + syn::Item::Union(u) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_UNION, u.ident.span().start().line, + )), + syn::Item::Type(t) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_TYPE, t.ident.span().start().line, + )), + syn::Item::Fn(f) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_FUNCTION, f.sig.ident.span().start().line, + )), + syn::Item::Trait(tr) => { + let tline = tr.ident.span().start().line; + edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_INTERFACE, tline, + )); + // (H) Supertrait bounds (`trait Sub: Super`) -> Sub INHERITS Super. + for bound in &tr.supertraits { + if let syn::TypeParamBound::Trait(tb) = bound { + if let Some(name) = trait_path_name(&tb.path) { + name_edges.push(name_edge_json( + REL_INHERITS, file, KIND_INTERFACE, tline, &name, + )); + } + } + } + for ti in &tr.items { + match ti { + syn::TraitItem::Fn(m) => edges.push(edge_json( + REL_DEFINES_METHOD, file, KIND_INTERFACE, tline, KIND_METHOD, + m.sig.ident.span().start().line, + )), + // (H) An associated type is a module-scoped Type declaration + // (H) in cgr's model (DEFINEd by the enclosing module). 
+ syn::TraitItem::Type(t) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_TYPE, + t.ident.span().start().line, + )), + _ => {} + } + } + } + syn::Item::Impl(im) => { + let owner = impl_target_name(&im.self_ty) + .and_then(|name| resolve_type(modpath, &name, table)); + // (H) `impl Trait for Type` -> Type IMPLEMENTS Trait. + if let (Some((kind, tline)), Some((_, path, _))) = (&owner, &im.trait_) { + if let Some(name) = trait_path_name(path) { + name_edges.push(name_edge_json( + REL_IMPLEMENTS, file, kind, *tline, &name, + )); + } + } + for ii in &im.items { + match ii { + syn::ImplItem::Fn(m) => { + if let Some((kind, tline)) = &owner { + edges.push(edge_json( + REL_DEFINES_METHOD, file, kind, *tline, KIND_METHOD, + m.sig.ident.span().start().line, + )); + } + } + syn::ImplItem::Type(t) => edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_TYPE, + t.ident.span().start().line, + )), + _ => {} + } + } + } + syn::Item::Mod(m) => { + if let Some((_, content)) = &m.content { + let mline = m.ident.span().start().line; + edges.push(edge_json( + REL_DEFINES, file, KIND_MODULE, module_line, KIND_MODULE, mline, + )); + let child = child_modpath(modpath, &m.ident.to_string()); + process_edges(content, file, mline, &child, table, edges, name_edges); + } + } + _ => {} + } + } +} + +fn visit_dir( + dir: &Path, + root: &Path, + nodes: &mut Vec, + edges: &mut Vec, + name_edges: &mut Vec, +) { + let entries = match fs::read_dir(dir) { + Ok(entries) => entries, + Err(_) => return, + }; + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); + if !IGNORED_DIRS.contains(&name) { + visit_dir(&path, root, nodes, edges, name_edges); + } + } else if path.extension().and_then(|e| e.to_str()) == Some("rs") { + if let Ok(src) = fs::read_to_string(&path) { + if let Ok(ast) = syn::parse_file(&src) { + let rel = path + .strip_prefix(root) + 
.unwrap_or(&path) + .to_string_lossy() + .replace('\\', "/"); + let mut collector = NodeCollector { file: &rel, out: nodes }; + collector.visit_file(&ast); + let mut table: HashMap = HashMap::new(); + collect_types(&ast.items, "", &mut table); + process_edges( + &ast.items, &rel, MODULE_LINE, "", &table, edges, name_edges, + ); + let mut closures = ClosureEdges { + file: &rel, + edges, + stack: Vec::new(), + module_line: MODULE_LINE, + }; + closures.visit_file(&ast); + } + } + } + } +} + +fn main() { + let root = env::args().nth(1).unwrap_or_else(|| ".".into()); + let root = Path::new(&root); + let mut nodes = Vec::new(); + let mut edges = Vec::new(); + let mut name_edges = Vec::new(); + visit_dir(root, root, &mut nodes, &mut edges, &mut name_edges); + println!( + "{{\"nodes\":[{}],\"edges\":[{}],\"name_edges\":[{}]}}", + nodes.join(","), + edges.join(","), + name_edges.join(",") + ); +} diff --git a/evals/oracles/rust_oracle.py b/evals/oracles/rust_oracle.py new file mode 100644 index 000000000..605d9ecc1 --- /dev/null +++ b/evals/oracles/rust_oracle.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import json +import shutil +import subprocess +from pathlib import Path + +from .. 
def rust_available() -> bool:
    """Return True when the cargo executable can be located on PATH."""
    cargo_path = shutil.which(ec.CARGO_BIN)
    return cargo_path is not None


def run_rust_oracle(target: Path) -> GraphData:
    """Run the Rust oracle crate on *target* via cargo and return its graph.

    Raises:
        subprocess.CalledProcessError: if cargo exits non-zero.
        json.JSONDecodeError: if the oracle's stdout is not valid JSON.
    """
    command = [
        ec.CARGO_BIN,
        ec.CARGO_RUN,
        ec.CARGO_RELEASE,
        ec.CARGO_QUIET,
        ec.CARGO_MANIFEST,
        str(_MANIFEST),
        ec.CARGO_ARG_SEP,
        str(target),
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
    )
    # Empty stdout is treated as an empty JSON object rather than a parse error.
    raw_output = completed.stdout or "{}"
    payload: OraclePayload = json.loads(raw_output)
    return payload_to_graph(payload)
} +} diff --git a/evals/oracles/ts_oracle/ts_ast.js b/evals/oracles/ts_oracle/ts_ast.js new file mode 100644 index 000000000..f11f8da94 --- /dev/null +++ b/evals/oracles/ts_oracle/ts_ast.js @@ -0,0 +1,214 @@ +// Authoritative TypeScript structure oracle for the cgr eval harness. +// +// Parses every .ts/.tsx file under a directory with the TypeScript compiler API +// and emits one JSON record per declaration, in cgr's NodeLabel vocabulary, so +// records join cgr's graph on (kind, file, line). +// +// Mapping (TS construct -> cgr NodeLabel), matching how cgr models TypeScript: +// +// class -> Class +// interface -> Interface +// enum -> Enum +// type alias -> Type +// namespace / module -> Class (cgr treats it as a class container) +// function (top-level/in-fn) -> Function +// function (in namespace/class) -> Method +// const x = () => ... / fn expr -> Function (or Method inside a namespace) +// method / constructor -> Method +// +// Containment edges (matching how cgr models TypeScript containment): +// +// DEFINES : the file module -> every named type (class/interface/enum/ +// namespace, even when nested) and every Function +// DEFINES_METHOD : the enclosing class/namespace -> Method +// +// cgr keeps type containment flat (all types DEFINEd by the file module, keyed +// at line 0); a Method binds to its enclosing class/namespace; a Function binds +// to its nearest enclosing function, else the module. Output is a {nodes, edges} +// payload joining cgr on (kind, file, line). 
// Queue one declaration node record.
function emit(kind, file, line, name, endLine) {
  const record = { kind, file, line, end_line: endLine, name };
  nodes.push(record);
}

// Queue one containment edge between two (kind, file, line) keys in the same file.
function emitEdge(rel, file, pkind, pline, ckind, cline) {
  const parent = { kind: pkind, file, line: pline };
  const child = { kind: ckind, file, line: cline };
  edges.push({ rel, parent, child });
}

// Queue one edge whose target is known only by its simple name.
function emitNameEdge(rel, file, skind, sline, targetName) {
  const source = { kind: skind, file, line: sline };
  nameEdges.push({ rel, source, target_name: targetName });
}

// (H) Simple name of an extends/implements entry: the base expression's last
// (H) identifier (type arguments live separately, so they're already excluded).
// (H) Falls back to the expression's source text when it has no `text`.
function heritageSimpleName(typeNode) {
  let current = typeNode.expression ? typeNode.expression : typeNode;
  for (;;) {
    const qualified = current && current.name && current.expression;
    if (!qualified) break;
    current = current.name; // (H) a.b.Base -> Base
  }
  if (current && current.text) return current.text;
  return current.getText();
}

// (H) A class's extends -> INHERITS, implements -> IMPLEMENTS; an interface's
// (H) extends -> INHERITS (cgr models superinterfaces as inheritance).
function emitHeritage(node, sf, file, kind, line) {
  const clauses = node.heritageClauses;
  if (!clauses) return;
  for (const clause of clauses) {
    let rel = "IMPLEMENTS";
    if (clause.token === ts.SyntaxKind.ExtendsKeyword) rel = "INHERITS";
    for (const entry of clause.types) {
      emitNameEdge(rel, file, kind, line, heritageSimpleName(entry));
    }
  }
}

// 1-based line on which `node` starts, measured from `node.getStart(sf)`.
function lineOf(sf, node) {
  const startPos = node.getStart(sf);
  const position = sf.getLineAndCharacterOfPosition(startPos);
  return position.line + 1;
}

// (H) Last line of a node's full span (its end position), for span/end_line
// (H) grading against cgr's end_line.
+function endLineOf(sf, node) { + return sf.getLineAndCharacterOfPosition(node.getEnd()).line + 1; +} + +function methodKind(container) { + return container === "namespace" || container === "class" ? "Method" : "Function"; +} + +// ctx carries the file, the enclosing class/namespace ref (for Methods) and the +// enclosing function ref (for nested Functions). +function defineFunction(node, sf, file, container, ctx, kind, line) { + if (kind === "Method") { + if (ctx.typeRef) { + emitEdge("DEFINES_METHOD", file, ctx.typeRef.kind, ctx.typeRef.line, "Method", line); + } + } else { + const parent = ctx.funcRef || { kind: "Module", line: MODULE_LINE }; + emitEdge("DEFINES", file, parent.kind, parent.line, "Function", line); + } +} + +// container: "module" | "class" | "namespace" | "function" +function walk(node, sf, file, container, ctx) { + if (ts.isClassDeclaration(node) && node.name) { + const line = lineOf(sf, node); + emit("Class", file, line, node.name.text, endLineOf(sf, node)); + emitEdge("DEFINES", file, "Module", MODULE_LINE, "Class", line); + emitHeritage(node, sf, file, "Class", line); + const sub = { typeRef: { kind: "Class", line }, funcRef: null }; + node.members.forEach((m) => walk(m, sf, file, "class", sub)); + return; + } + if (ts.isInterfaceDeclaration(node) && node.name) { + const line = lineOf(sf, node); + emit("Interface", file, line, node.name.text, endLineOf(sf, node)); + emitEdge("DEFINES", file, "Module", MODULE_LINE, "Interface", line); + emitHeritage(node, sf, file, "Interface", line); + return; + } + if (ts.isEnumDeclaration(node) && node.name) { + const line = lineOf(sf, node); + emit("Enum", file, line, node.name.text, endLineOf(sf, node)); + emitEdge("DEFINES", file, "Module", MODULE_LINE, "Enum", line); + return; + } + if (ts.isTypeAliasDeclaration(node) && node.name) { + const line = lineOf(sf, node); + emit("Type", file, line, node.name.text, endLineOf(sf, node)); + emitEdge("DEFINES", file, "Module", MODULE_LINE, "Type", line); + 
return; + } + if (ts.isModuleDeclaration(node) && node.name) { + const line = lineOf(sf, node); + emit("Class", file, line, node.name.text || "", endLineOf(sf, node)); + emitEdge("DEFINES", file, "Module", MODULE_LINE, "Class", line); + const sub = { typeRef: { kind: "Class", line }, funcRef: null }; + if (node.body) node.body.forEachChild((c) => walk(c, sf, file, "namespace", sub)); + return; + } + if (ts.isFunctionDeclaration(node) && node.name) { + const kind = methodKind(container); + const line = lineOf(sf, node); + emit(kind, file, line, node.name.text, endLineOf(sf, node)); + defineFunction(node, sf, file, container, ctx, kind, line); + const sub = { typeRef: null, funcRef: { kind, line } }; + if (node.body) node.body.forEachChild((c) => walk(c, sf, file, "function", sub)); + return; + } + if (ts.isMethodDeclaration(node) || ts.isConstructorDeclaration(node)) { + const nm = ts.isConstructorDeclaration(node) + ? "constructor" + : node.name && ts.isIdentifier(node.name) + ? node.name.text + : node.name && node.name.text; + // (H) Class members are Methods; object-literal shorthand methods are modelled + // (H) by cgr as standalone Functions. + const kind = container === "class" ? "Method" : "Function"; + const line = lineOf(sf, node); + if (nm) { + emit(kind, file, line, nm, endLineOf(sf, node)); + defineFunction(node, sf, file, container, ctx, kind, line); + } + const sub = { typeRef: null, funcRef: { kind, line } }; + if (node.body) node.body.forEachChild((c) => walk(c, sf, file, "function", sub)); + return; + } + if (ts.isArrowFunction(node) || ts.isFunctionExpression(node)) { + // (H) cgr captures every arrow/function expression as a Function node (named + // by its variable when assigned, else anonymous), at the expression's own + // line. The name is irrelevant to the (kind, file, line) join. 
// True when `name` ends with one of the suffixes in `exts` and is not a
// TypeScript declaration file (".d.ts").
function hasExt(name, exts) {
  const matchesSuffix = exts.some((suffix) => name.endsWith(suffix));
  if (!matchesSuffix) {
    return false;
  }
  // A ".d.ts" file also ends with ".ts", so it has to be ruled out explicitly.
  return !name.endsWith(".d.ts");
}
# (H) Run the Node oracle script over `target` for the given file suffixes and
# (H) convert its JSON stdout into a GraphData.
# (H) _ensure_deps() always runs first. When the node binary is not on PATH an
# (H) empty GraphData is returned instead of raising. check=True means a
# (H) non-zero exit of the script raises subprocess.CalledProcessError, and an
# (H) empty stdout is parsed as the empty JSON object.
def _run(target: Path, suffixes: tuple[str, ...]) -> GraphData:
    _ensure_deps()
    node_path = shutil.which(ec.NODE_BIN)
    if node_path is not None:
        command = [node_path, str(_SCRIPT), str(target)]
        command.extend(suffixes)
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
        raw_json = completed.stdout if completed.stdout else "{}"
        payload: OraclePayload = json.loads(raw_json)
        return payload_to_graph(payload)
    return GraphData(nodes={}, edges=set(), name_edges=set())
# (H) CLI entry point for the PHP L1 structure eval: bail out with exit code 1
# (H) when the PHP oracle is unavailable, otherwise extract the cgr graph and
# (H) the oracle graph for the resolved target, score them with spans graded,
# (H) write the scores/diff files into out_dir and render the table.
# (H) Documented with comments rather than a docstring because typer surfaces a
# (H) command's docstring as its help text.
# NOTE(review): the default target reuses ec.GO_DEFAULT_TARGET - confirm this
# is intended for the PHP eval.
def main(
    target: Annotated[
        Path, typer.Option(help="Directory of PHP sources to evaluate.")
    ] = Path(ec.GO_DEFAULT_TARGET),
    project_name: Annotated[
        str, typer.Option(help="cgr project name; defaults to target dir name.")
    ] = "",
    out_dir: Annotated[
        Path, typer.Option(help="Directory for php_scores.csv and php_diff.json.")
    ] = Path(ec.DEFAULT_OUT_DIR),
) -> None:
    oracle_ready = php_oracle_available()
    if not oracle_ready:
        logger.error(ls.PHP_ORACLE_MISSING)
        raise typer.Exit(code=1)

    source_root = target.resolve()
    # (H) An empty project name falls back to the target directory's name.
    project = project_name if project_name else source_root.name

    cgr_start_msg = ls.PHP_EXTRACTING_CGR.format(target=source_root, project=project)
    logger.info(cgr_start_msg)
    cgr_graph = extract_cgr_php_graph(source_root, project)
    logger.success(ls.PHP_CGR_DONE.format(count=len(cgr_graph.nodes)))

    oracle_start_msg = ls.PHP_EXTRACTING_ORACLE.format(
        binary=ec.NODE_BIN, target=source_root
    )
    logger.info(oracle_start_msg)
    oracle_graph = run_php_oracle(source_root)
    logger.success(ls.PHP_ORACLE_DONE.format(count=len(oracle_graph.nodes)))

    scored = score_structure(
        cgr_graph,
        oracle_graph,
        ec.PHP_SCORED_NODE_KINDS,
        ec.SCORED_EDGE_TYPES,
        grade_spans=True,
    )
    write_outputs(scored, out_dir, ec.PHP_SCORES_FILENAME, ec.PHP_DIFF_FILENAME)
    render(scored, _TITLE)
[], + "extra": [] + }, + "node:Method": { + "missing": [], + "extra": [] + }, + "edge:DEFINES": { + "missing": [], + "extra": [] + }, + "edge:DEFINES_METHOD": { + "missing": [], + "extra": [] + }, + "name_edge:INHERITS": { + "missing": [], + "extra": [] + }, + "name_edge:IMPORTS": { + "missing": [], + "extra": [] + } +} diff --git a/evals/results/scores.csv b/evals/results/scores.csv new file mode 100644 index 000000000..b5c3f7ff6 --- /dev/null +++ b/evals/results/scores.csv @@ -0,0 +1,11 @@ +category,label,tp,fp,fn,precision,recall,f1 +node,Module,417,0,0,1.0,1.0,1.0 +node,Class,926,0,0,1.0,1.0,1.0 +node,Function,1955,0,0,1.0,1.0,1.0 +node,Method,3919,0,0,1.0,1.0,1.0 +node,ALL,7217,0,0,1.0,1.0,1.0 +edge,DEFINES,2742,0,0,1.0,1.0,1.0 +edge,DEFINES_METHOD,3919,0,0,1.0,1.0,1.0 +edge,ALL,6661,0,0,1.0,1.0,1.0 +edge,INHERITS,153,0,0,1.0,1.0,1.0 +edge,IMPORTS,1274,0,0,1.0,1.0,1.0 diff --git a/evals/rust_l1.py b/evals/rust_l1.py new file mode 100644 index 000000000..bc9b981ff --- /dev/null +++ b/evals/rust_l1.py @@ -0,0 +1,51 @@ +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from . import constants as ec +from . 
# (H) CLI entry point for the Rust L1 structure eval: bail out with exit code 1
# (H) when the Rust oracle is unavailable, otherwise extract the cgr graph and
# (H) the oracle graph for the resolved target, score them with spans graded,
# (H) write the scores/diff files into out_dir and render the table.
# (H) Documented with comments rather than a docstring because typer surfaces a
# (H) command's docstring as its help text.
# NOTE(review): the default target reuses ec.GO_DEFAULT_TARGET - confirm this
# is intended for the Rust eval.
def main(
    target: Annotated[
        Path, typer.Option(help="Directory of Rust sources to evaluate.")
    ] = Path(ec.GO_DEFAULT_TARGET),
    project_name: Annotated[
        str, typer.Option(help="cgr project name; defaults to target dir name.")
    ] = "",
    out_dir: Annotated[
        Path, typer.Option(help="Directory for rs_scores.csv and rs_diff.json.")
    ] = Path(ec.DEFAULT_OUT_DIR),
) -> None:
    oracle_ready = rust_available()
    if not oracle_ready:
        logger.error(ls.RS_ORACLE_MISSING.format(binary=ec.CARGO_BIN))
        raise typer.Exit(code=1)

    source_root = target.resolve()
    # (H) An empty project name falls back to the target directory's name.
    project = project_name if project_name else source_root.name

    cgr_start_msg = ls.RS_EXTRACTING_CGR.format(target=source_root, project=project)
    logger.info(cgr_start_msg)
    cgr_graph = extract_cgr_rust_graph(source_root, project)
    logger.success(ls.RS_CGR_DONE.format(count=len(cgr_graph.nodes)))

    oracle_start_msg = ls.RS_EXTRACTING_ORACLE.format(
        binary=ec.CARGO_BIN, target=source_root
    )
    logger.info(oracle_start_msg)
    oracle_graph = run_rust_oracle(source_root)
    logger.success(ls.RS_ORACLE_DONE.format(count=len(oracle_graph.nodes)))

    scored = score_structure(
        cgr_graph,
        oracle_graph,
        ec.RS_SCORED_NODE_KINDS,
        ec.SCORED_EDGE_TYPES,
        grade_spans=True,
    )
    write_outputs(scored, out_dir, ec.RS_SCORES_FILENAME, ec.RS_DIFF_FILENAME)
    render(scored, _TITLE)
# (H) Score the cgr graph against the Python oracle graph.
# (H) Rows come out in this order: per-kind node rows plus the node aggregate,
# (H) per-type edge rows plus the edge aggregate, one row per scored name-edge
# (H) type (no aggregate), then the span rows. `diff` carries the missing/extra
# (H) listings under the node, edge, name-edge and span prefixes in that same
# (H) order, and `location` is the end_line agreement summary.
def score(cgr: GraphData, oracle: GraphData) -> ScoreResult:
    # (H) Node and edge grading is exactly what score_node_kinds and
    # (H) score_edge_types compute for the default scored sets, so reuse them
    # (H) instead of repeating the per-label loops here.
    node_result = score_node_kinds(cgr, oracle, ec.SCORED_NODE_KINDS)
    edge_result = score_edge_types(cgr, oracle, ec.SCORED_EDGE_TYPES)

    # (H) Name-edges are graded per type only: unlike score_name_edge_types,
    # (H) no aggregate row is produced for them here.
    name_rows: list[ScoreRow] = []
    name_diff: dict[str, DiffBucket] = {}
    for name_edge_type in ec.SCORED_NAME_EDGE_TYPES:
        label = name_edge_type.value
        cgr_set_n = {e for e in cgr.name_edges if e.rel_type == label}
        oracle_set_n = {e for e in oracle.name_edges if e.rel_type == label}
        row = _prf(ec.Category.EDGE.value, label, cgr_set_n, oracle_set_n)
        if row is not None:
            name_rows.append(row)
        name_diff[ec.DIFF_NAME_EDGE_PREFIX + label] = _name_edge_bucket(
            cgr_set_n, oracle_set_n
        )

    # (H) The Python ast oracle records real end_lineno, so spans are graded like
    # (H) the native-oracle languages (Class/Function/Method; Module is excluded).
    span_result = score_span(cgr, oracle, ec.SPANNED_NODE_KINDS_TUPLE)

    return ScoreResult(
        rows=[
            *node_result.rows,
            *edge_result.rows,
            *name_rows,
            *span_result.rows,
        ],
        location=_location_stats(cgr, oracle),
        diff={
            **node_result.diff,
            **edge_result.diff,
            **name_diff,
            **span_result.diff,
        },
    )
def score_span(
    cgr: GraphData, oracle: GraphData, kinds: tuple[cs.NodeLabel, ...]
) -> ScoreResult:
    # (H) Grade node SPANS (end_line) only on nodes both sides identify by
    # (H) (kind, file, start), so an end_line disagreement is not masked by, nor
    # (H) conflated with, a node-identity miss. Restricted to the shared key set,
    # (H) fp and fn each count one end_line mismatch (precision == recall).
    # (H) Returns per-kind SPAN rows plus an aggregate row, the per-kind span
    # (H) diff buckets, and an all-zero LocationStats placeholder.
    common_keys = cgr.nodes.keys() & oracle.nodes.keys()

    def spans_of(graph: GraphData, label: str) -> set[_SpanKey]:
        # (H) (kind, file, start, end) for every shared node of this kind, with
        # (H) the end line taken from `graph`.
        return {
            (key.kind, key.file, key.start_line, graph.nodes[key].end_line)
            for key in common_keys
            if key.kind == label
        }

    rows: list[ScoreRow] = []
    diff: dict[str, DiffBucket] = {}
    cgr_total: set[_SpanKey] = set()
    oracle_total: set[_SpanKey] = set()
    for kind in kinds:
        label = kind.value
        cgr_spans = spans_of(cgr, label)
        oracle_spans = spans_of(oracle, label)
        cgr_total.update(cgr_spans)
        oracle_total.update(oracle_spans)
        kind_row = _prf(ec.Category.SPAN.value, label, cgr_spans, oracle_spans)
        if kind_row is not None:
            rows.append(kind_row)
        diff[ec.DIFF_SPAN_PREFIX + label] = _span_bucket(cgr_spans, oracle_spans)

    total_row = _prf(
        ec.Category.SPAN.value, ec.AGGREGATE_LABEL, cgr_total, oracle_total
    )
    if total_row is not None:
        rows.append(total_row)
    placeholder = LocationStats(0, 0, 0, 0.0, 0)
    return ScoreResult(rows=rows, location=placeholder, diff=diff)
# (H) Precision/recall/F1 row for one label, treating `cgr` as the prediction
# (H) and `oracle` as the truth. Returns None when both sets are empty so the
# (H) caller emits no row; any ratio with a zero denominator is reported as 0.0.
def _prf(category: str, label: str, cgr: set[T], oracle: set[T]) -> ScoreRow | None:
    tp = len(cgr & oracle)
    # (H) For sets, |A - B| == |A| - |A & B|.
    fp = len(cgr) - tp
    fn = len(oracle) - tp
    if not (tp or fp or fn):
        return None

    def ratio(numerator: float, denominator: float) -> float:
        return numerator / denominator if denominator else 0.0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1 = ratio(2 * precision * recall, precision + recall)
    digits = ec.ROUND_DIGITS
    return ScoreRow(
        category=category,
        label=label,
        tp=tp,
        fp=fp,
        fn=fn,
        precision=round(precision, digits),
        recall=round(recall, digits),
        f1=round(f1, digits),
    )
# (H) Summarise end_line agreement over the nodes both graphs share, limited to
# (H) kinds in ec.SPANNED_NODE_KINDS: how many matched, how many end lines agree
# (H) exactly or within one line, and the mean/max absolute difference.
# (H) With no such node every field is zero.
def _location_stats(cgr: GraphData, oracle: GraphData) -> LocationStats:
    deltas: list[int] = []
    exact = 0
    within_one = 0
    for key in cgr.nodes.keys() & oracle.nodes.keys():
        if key.kind not in ec.SPANNED_NODE_KINDS:
            continue
        gap = abs(cgr.nodes[key].end_line - oracle.nodes[key].end_line)
        deltas.append(gap)
        if gap == 0:
            exact += 1
        if gap <= 1:
            within_one += 1

    if not deltas:
        return LocationStats(0, 0, 0, 0.0, 0)
    return LocationStats(
        matched=len(deltas),
        end_exact=exact,
        end_within_one=within_one,
        mean_abs_delta=round(fmean(deltas), ec.ROUND_DIGITS),
        max_abs_delta=max(deltas),
    )
# (H) Print the score rows as a rich table titled `title`. Columns come from
# (H) ec.CSV_FIELDS (left-justified when listed in ec.LEFT_COLUMNS, otherwise
# (H) right-justified); counts are shown as-is, ratios with four decimals.
def render(result: ScoreResult, title: str) -> None:
    table = Table(title=title)
    for column in ec.CSV_FIELDS:
        alignment = "right"
        if column in ec.LEFT_COLUMNS:
            alignment = "left"
        table.add_column(column, justify=alignment)

    count_keys = ("tp", "fp", "fn")
    ratio_keys = ("precision", "recall", "f1")
    for row in result.rows:
        counts = [str(row[key]) for key in count_keys]
        ratios = [f"{row[key]:.4f}" for key in ratio_keys]
        table.add_row(row["category"], row["label"], *counts, *ratios)
    _console.print(table)
# (H) CLI entry point for the TypeScript L1 structure eval: bail out with exit
# (H) code 1 when the TypeScript oracle is unavailable, otherwise extract the cgr
# (H) graph and the oracle graph for the resolved target, score them with spans
# (H) graded, write the scores/diff files into out_dir and render the table.
# (H) Documented with comments rather than a docstring because typer surfaces a
# (H) command's docstring as its help text.
# NOTE(review): the default target reuses ec.GO_DEFAULT_TARGET - confirm this
# is intended for the TypeScript eval.
def main(
    target: Annotated[
        Path, typer.Option(help="Directory of TypeScript sources to evaluate.")
    ] = Path(ec.GO_DEFAULT_TARGET),
    project_name: Annotated[
        str, typer.Option(help="cgr project name; defaults to target dir name.")
    ] = "",
    out_dir: Annotated[
        Path, typer.Option(help="Directory for ts_scores.csv and ts_diff.json.")
    ] = Path(ec.DEFAULT_OUT_DIR),
) -> None:
    oracle_ready = typescript_available()
    if not oracle_ready:
        logger.error(ls.TS_ORACLE_MISSING)
        raise typer.Exit(code=1)

    source_root = target.resolve()
    # (H) An empty project name falls back to the target directory's name.
    project = project_name if project_name else source_root.name

    cgr_start_msg = ls.TS_EXTRACTING_CGR.format(target=source_root, project=project)
    logger.info(cgr_start_msg)
    cgr_graph = extract_cgr_ts_graph(source_root, project)
    logger.success(ls.TS_CGR_DONE.format(count=len(cgr_graph.nodes)))

    oracle_start_msg = ls.TS_EXTRACTING_ORACLE.format(
        binary=ec.NODE_BIN, target=source_root
    )
    logger.info(oracle_start_msg)
    oracle_graph = run_typescript_oracle(source_root)
    logger.success(ls.TS_ORACLE_DONE.format(count=len(oracle_graph.nodes)))

    scored = score_structure(
        cgr_graph,
        oracle_graph,
        ec.TS_SCORED_NODE_KINDS,
        ec.SCORED_EDGE_TYPES,
        grade_spans=True,
    )
    write_outputs(scored, out_dir, ec.TS_SCORES_FILENAME, ec.TS_DIFF_FILENAME)
    render(scored, _TITLE)
# (H) Summary of how closely cgr's end_line matches the oracle's over the nodes
# (H) both graphs share (filled by _location_stats in evals/score.py). The
# (H) all-zero value LocationStats(0, 0, 0, 0.0, 0) is what the scorers return
# (H) when there is nothing to summarise.
class LocationStats(NamedTuple):
    # (H) Number of shared nodes that were compared.
    matched: int
    # (H) Compared nodes whose end lines are identical.
    end_exact: int
    # (H) Compared nodes whose end lines differ by at most one line.
    end_within_one: int
    # (H) Mean absolute end_line difference, rounded by the scorer.
    mean_abs_delta: float
    # (H) Largest absolute end_line difference seen.
    max_abs_delta: int
comprehensive knowledge graphs, and enables natural language querying and editing of codebase structure and relationships. Supports 11 programming languages with a unified graph schema and functions as an MCP server for AI assistant integration.", + "webpageUrl": { + "url": "https://code-graph-rag.com" + }, + "repositoryUrl": { + "url": "https://codeberg.org/vitali87/code-graph-rag" + }, + "licenses": [ + "spdx:MIT" + ], + "tags": [ + "rag", + "knowledge-graph", + "code-analysis", + "tree-sitter", + "mcp-server", + "developer-tools", + "ai", + "graph-database", + "semantic-search", + "python" + ] + } + ], + "funding": { + "channels": [ + { + "guid": "github-sponsors", + "type": "payment-provider", + "address": "https://github.com/sponsors/vitali87", + "description": "GitHub Sponsors" + }, + { + "guid": "buy-me-a-coffee", + "type": "payment-provider", + "address": "https://buymeacoffee.com/vitali87", + "description": "Buy Me a Coffee" + } + ], + "plans": [ + { + "guid": "one-time-any", + "status": "active", + "name": "One-time donation", + "description": "Support Code-Graph-RAG development with a one-time contribution of any amount.", + "amount": 0, + "currency": "USD", + "frequency": "one-time", + "channels": [ + "github-sponsors", + "buy-me-a-coffee" + ] + }, + { + "guid": "monthly-supporter", + "status": "active", + "name": "Monthly supporter", + "description": "Recurring monthly support for ongoing development, security maintenance, and new language support.", + "amount": 0, + "currency": "USD", + "frequency": "monthly", + "channels": [ + "github-sponsors", + "buy-me-a-coffee" + ] + }, + { + "guid": "annual-sponsor", + "status": "active", + "name": "Annual sponsor", + "description": "Yearly sponsorship for sustained development of Code-Graph-RAG as open infrastructure for AI-powered codebase understanding.", + "amount": 25000, + "currency": "USD", + "frequency": "yearly", + "channels": [ + "github-sponsors" + ] + } + ], + "history": [ + { + "year": 2025, + 
"income": 0, + "expenses": 0, + "taxes": 0, + "currency": "USD", + "description": "Project launched in 2025. No external funding received." + } + ] + } +} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..f3fac35d5 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,121 @@ +site_name: Code-Graph-RAG +site_url: https://docs.code-graph-rag.com +site_description: >- + Graph-based RAG system that parses multi-language codebases with Tree-sitter, + builds knowledge graphs, and enables natural language querying, editing, + and optimization. +site_author: Vitali Avagyan + +repo_name: vitali87/code-graph-rag +repo_url: https://codeberg.org/vitali87/code-graph-rag +edit_uri: _edit/branch/main/docs/ + +copyright: Copyright © 2024 Vitali Avagyan + +theme: + name: material + custom_dir: docs/overrides + logo: assets/logo-icon.png + favicon: assets/favicon.png + font: + text: Inter + code: JetBrains Mono + palette: + - scheme: slate + primary: custom + accent: custom + toggle: + icon: material/brightness-4 + name: Switch to light mode + - scheme: default + primary: custom + accent: custom + toggle: + icon: material/brightness-7 + name: Switch to dark mode + features: + - navigation.instant + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.suggest + - search.highlight + - content.code.copy + - content.code.annotate + - content.tabs.link + - toc.follow + icon: + repo: fontawesome/brands/github + +plugins: + - search + - minify: + minify_html: true + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - tables + - attr_list + - 
md_in_html + - toc: + permalink: true + +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Configuration: getting-started/configuration.md + - Quick Start: getting-started/quickstart.md + - User Guide: + - CLI Reference: guide/cli-reference.md + - Interactive Querying: guide/interactive-querying.md + - Code Optimization: guide/code-optimization.md + - Graph Export: guide/graph-export.md + - Real-Time Updates: guide/realtime-updates.md + - MCP Server: guide/mcp-server.md + - Python SDK: + - Overview: sdk/overview.md + - Graph Loader: sdk/graph-loader.md + - Cypher Generator: sdk/cypher-generator.md + - Semantic Search: sdk/semantic-search.md + - Architecture: + - Overview: architecture/overview.md + - Graph Schema: architecture/graph-schema.md + - Language Support: architecture/language-support.md + - Advanced: + - Adding Languages: advanced/adding-languages.md + - Ignore Patterns: advanced/ignore-patterns.md + - Building Binaries: advanced/building-binaries.md + - Troubleshooting: advanced/troubleshooting.md + - Contributing: contributing.md + +# Internal analysis artifacts kept in the repo but not published to the doc site nav. +not_in_nav: | + /reports/ + /TODO.md + +extra_css: + - stylesheets/extra.css + +extra: + social: + - icon: fontawesome/brands/git-alt + link: https://codeberg.org/vitali87/code-graph-rag + - icon: fontawesome/brands/python + link: https://pypi.org/project/code-graph-rag/ + generator: false diff --git a/optimize/memory_profile.py b/optimize/memory_profile.py new file mode 100644 index 000000000..eaf98c2e3 --- /dev/null +++ b/optimize/memory_profile.py @@ -0,0 +1,665 @@ +"""Memory allocation profiler for code-graph-rag. + +Profiles the main data structures and parsing pipeline using tracemalloc. +Does NOT require external services (Memgraph, Qdrant). 
+""" + +import gc +import json +import sys +import tracemalloc +from collections import OrderedDict, defaultdict +from pathlib import Path +from textwrap import dedent + +PROJECT_ROOT = Path(__file__).resolve().parent.parent + +sys.path.insert(0, str(PROJECT_ROOT)) + + +def format_bytes(size: int) -> str: + for unit in ("B", "KiB", "MiB", "GiB"): + if abs(size) < 1024: + return f"{size:.1f} {unit}" + size /= 1024 # type: ignore[assignment] + return f"{size:.1f} TiB" + + +def snapshot_diff(label: str, snap1: tracemalloc.Snapshot, snap2: tracemalloc.Snapshot, top_n: int = 15) -> dict: + stats = snap2.compare_to(snap1, "lineno") + total_diff = sum(s.size_diff for s in stats if s.size_diff > 0) + result = { + "label": label, + "total_new_alloc": total_diff, + "total_new_alloc_human": format_bytes(total_diff), + "top_allocators": [], + } + for stat in stats[:top_n]: + if stat.size_diff > 0: + result["top_allocators"].append({ + "file": str(stat.traceback), + "size_diff": stat.size_diff, + "size_diff_human": format_bytes(stat.size_diff), + "count_diff": stat.count_diff, + }) + return result + + +def measure_object_sizes() -> dict: + """Measure sizes of core Python data structures used in the codebase.""" + results = {} + + # 1. 
FunctionRegistryTrie: dict + trie node overhead + from codebase_rag.graph_updater import FunctionRegistryTrie + + trie = FunctionRegistryTrie() + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + for i in range(10_000): + qn = f"project.module_{i // 100}.class_{i // 10}.func_{i}" + trie.insert(qn, "Function") + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["FunctionRegistryTrie_10k_insert"] = snapshot_diff( + "FunctionRegistryTrie: insert 10k qualified names", snap_before, snap_after + ) + results["FunctionRegistryTrie_10k_insert"]["entries_size"] = sys.getsizeof(trie._entries) + results["FunctionRegistryTrie_10k_insert"]["entry_count"] = len(trie._entries) + + # Measure trie overhead vs flat dict + flat_dict = {} + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(10_000): + qn = f"project.module_{i // 100}.class_{i // 10}.func_{i}" + flat_dict[qn] = "Function" + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["flat_dict_10k_baseline"] = snapshot_diff( + "Flat dict: 10k entries baseline", snap_before, snap_after + ) + + # 2. SimpleNameLookup: defaultdict[str, set[str]] + simple_lookup: defaultdict[str, set[str]] = defaultdict(set) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(10_000): + simple_name = f"func_{i % 500}" + qn = f"project.module_{i // 100}.class_{i // 10}.{simple_name}" + simple_lookup[simple_name].add(qn) + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["SimpleNameLookup_10k"] = snapshot_diff( + "SimpleNameLookup: 10k entries, 500 unique names", snap_before, snap_after + ) + + # 3. 
BoundedASTCache with OrderedDict + from codebase_rag.graph_updater import BoundedASTCache + + cache = BoundedASTCache(max_entries=5000, max_memory_mb=512) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + # Simulate storing mock entries (can't use real AST nodes without tree-sitter parsing) + for i in range(1000): + key = Path(f"/fake/path/module_{i}.py") + # Use a placeholder tuple since we can't create real AST nodes without parsing + cache.cache[key] = (None, "python") # type: ignore + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["BoundedASTCache_1k_entries"] = snapshot_diff( + "BoundedASTCache (OrderedDict): 1k entries", snap_before, snap_after + ) + + # 4. node_buffer in MemgraphIngestor pattern + node_buffer: list[tuple[str, dict[str, str | int | float | bool | list[str] | None]]] = [] + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(5000): + node_buffer.append(( + "Function", + { + "qualified_name": f"project.mod_{i // 50}.cls_{i // 10}.fn_{i}", + "name": f"fn_{i}", + "start_line": i * 10, + "end_line": i * 10 + 15, + "path": f"src/mod_{i // 50}/cls_{i // 10}.py", + }, + )) + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["node_buffer_5k"] = snapshot_diff( + "node_buffer: 5k buffered nodes", snap_before, snap_after + ) + + # 5. 
_rel_groups in MemgraphIngestor pattern + rel_groups: defaultdict[tuple, list[dict]] = defaultdict(list) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(10_000): + pattern = ("Function", "qualified_name", "CALLS", "Function", "qualified_name") + rel_groups[pattern].append({ + "from_val": f"project.mod.fn_{i}", + "to_val": f"project.mod.fn_{i + 1}", + "props": {}, + }) + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["rel_groups_10k"] = snapshot_diff( + "rel_groups: 10k buffered relationships", snap_before, snap_after + ) + + # 6. import_mapping pattern + import_mapping: dict[str, dict[str, str]] = {} + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(2000): + module_qn = f"project.module_{i}" + imports = {} + for j in range(20): + imports[f"import_{j}"] = f"external.package_{j}.symbol_{j}" + import_mapping[module_qn] = imports + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["import_mapping_2k_modules"] = snapshot_diff( + "import_mapping: 2k modules x 20 imports each", snap_before, snap_after + ) + + # 7. 
class_inheritance pattern + class_inheritance: dict[str, list[str]] = {} + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(3000): + class_qn = f"project.module_{i // 30}.Class_{i}" + parents = [f"project.module_{i // 30}.BaseClass_{j}" for j in range(3)] + class_inheritance[class_qn] = parents + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["class_inheritance_3k"] = snapshot_diff( + "class_inheritance: 3k classes x 3 parents", snap_before, snap_after + ) + + return results + + +def measure_tree_sitter_parsing() -> dict: + """Profile memory during tree-sitter parsing of actual Python files.""" + results = {} + + try: + from tree_sitter import Language, Parser + import tree_sitter_python + + py_language = Language(tree_sitter_python.language()) + parser = Parser(py_language) + except Exception as e: + return {"error": f"tree-sitter setup failed: {e}"} + + # Find Python files in the project itself + py_files = sorted(PROJECT_ROOT.glob("codebase_rag/**/*.py")) + if not py_files: + return {"error": "No Python files found"} + + # Profile parsing all project files + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + trees = [] + total_bytes_parsed = 0 + for f in py_files: + try: + source = f.read_bytes() + total_bytes_parsed += len(source) + tree = parser.parse(source) + trees.append((f, tree)) + except Exception: + pass + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["parse_all_project_files"] = snapshot_diff( + f"Parse {len(trees)} Python files ({format_bytes(total_bytes_parsed)} source)", + snap_before, snap_after + ) + results["parse_all_project_files"]["file_count"] = len(trees) + results["parse_all_project_files"]["source_bytes"] = total_bytes_parsed + + # Profile AST node retention + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + root_nodes = [tree.root_node for _, tree in trees] + + 
gc.collect() + snap_after = tracemalloc.take_snapshot() + results["ast_node_retention"] = snapshot_diff( + f"Retaining {len(root_nodes)} AST root nodes", snap_before, snap_after + ) + + # Profile what happens when we walk AST nodes (simulating function extraction) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + all_function_nodes = [] + for root in root_nodes: + stack = [root] + while stack: + node = stack.pop() + if node.type in ("function_definition", "class_definition"): + all_function_nodes.append(node) + stack.extend(node.children) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["ast_walk_function_extraction"] = snapshot_diff( + f"Walking ASTs, collected {len(all_function_nodes)} function/class nodes", + snap_before, snap_after, + ) + results["ast_walk_function_extraction"]["function_class_count"] = len(all_function_nodes) + + # Cleanup + del trees, root_nodes, all_function_nodes + + return results + + +def measure_graph_loader_json() -> dict: + """Profile GraphLoader JSON loading and indexing with synthetic data.""" + results = {} + + # Create synthetic graph JSON + nodes = [] + relationships = [] + for i in range(5000): + nodes.append({ + "node_id": i, + "labels": ["Function"], + "properties": { + "qualified_name": f"project.module_{i // 50}.class_{i // 10}.func_{i}", + "name": f"func_{i}", + "start_line": i * 10, + "end_line": i * 10 + 15, + "path": f"src/module_{i // 50}/class_{i // 10}.py", + }, + }) + for i in range(8000): + relationships.append({ + "from_id": i % 5000, + "to_id": (i + 1) % 5000, + "type": "CALLS", + "properties": {}, + }) + + graph_data = { + "nodes": nodes, + "relationships": relationships, + "metadata": { + "total_nodes": len(nodes), + "total_relationships": len(relationships), + "exported_at": "2024-01-01T00:00:00Z", + }, + } + + # Write temp file + tmp_path = PROJECT_ROOT / "optimize" / "_tmp_graph.json" + with open(tmp_path, "w") as f: + json.dump(graph_data, f) + + 
try: + from codebase_rag.graph_loader import GraphLoader + + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + loader = GraphLoader(str(tmp_path)) + loader.load() + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["graph_loader_5k_nodes_8k_rels"] = snapshot_diff( + "GraphLoader: load 5k nodes + 8k relationships from JSON", + snap_before, snap_after, + ) + + # Measure index building + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + loader._build_property_index("qualified_name") + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["graph_loader_property_index"] = snapshot_diff( + "GraphLoader: build property index on qualified_name", + snap_before, snap_after, + ) + + except Exception as e: + results["error"] = str(e) + finally: + tmp_path.unlink(missing_ok=True) + + return results + + +def measure_embedding_cache() -> dict: + """Profile EmbeddingCache with simulated embeddings.""" + results = {} + + try: + from codebase_rag.embedder import EmbeddingCache + + cache = EmbeddingCache() + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + # Simulate 2k embeddings, each 768-dim float vector + for i in range(2000): + content = f"def function_{i}(x, y): return x + y + {i}" + embedding = [float(j) / 768.0 for j in range(768)] + cache.put(content, embedding) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["embedding_cache_2k_768dim"] = snapshot_diff( + "EmbeddingCache: 2k entries x 768-dim embeddings", + snap_before, snap_after, + ) + results["embedding_cache_2k_768dim"]["cache_dict_size"] = sys.getsizeof(cache._cache) + results["embedding_cache_2k_768dim"]["entry_count"] = len(cache) + + except Exception as e: + results["error"] = str(e) + + return results + + +def measure_gc_pressure() -> dict: + """Measure GC pressure by tracking collections during workload simulation.""" + results = {} + + 
gc.collect() + gc_stats_before = gc.get_stats() + gc.disable() + + # Simulate a typical file processing workload creating many temporary objects + temp_objects_created = 0 + for i in range(1000): + # Simulate tree-sitter query results (lists of tuples, dicts) + captures = {"function": [f"node_{j}" for j in range(20)]} + for func_name in captures["function"]: + # Simulate qualified name construction (many string concatenations) + parts = ["project", f"module_{i}", f"class_{i // 10}", func_name] + qn = ".".join(parts) + # Simulate property dict construction + props = { + "qualified_name": qn, + "name": func_name, + "start_line": i * 10, + "end_line": i * 10 + 15, + } + temp_objects_created += 1 + del props + + gc.enable() + gc.collect() + gc_stats_after = gc.get_stats() + + results["gc_pressure_simulation"] = { + "label": "GC pressure during simulated file processing (1k files x 20 funcs)", + "temp_objects_created": temp_objects_created, + "gc_gen0_before": gc_stats_before[0], + "gc_gen0_after": gc_stats_after[0], + "gc_gen1_before": gc_stats_before[1], + "gc_gen1_after": gc_stats_after[1], + "gc_gen2_before": gc_stats_before[2], + "gc_gen2_after": gc_stats_after[2], + } + + return results + + +def measure_string_duplication() -> dict: + """Estimate memory wasted on duplicated strings in typical data structures.""" + results = {} + + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + # Simulate how property dicts repeat the same key strings thousands of times + all_dicts: list[dict] = [] + for i in range(5000): + d = { + "qualified_name": f"project.mod_{i // 50}.cls_{i // 10}.fn_{i}", + "name": f"fn_{i}", + "start_line": i * 10, + "end_line": i * 10 + 15, + "path": f"src/mod_{i // 50}/cls_{i // 10}.py", + } + all_dicts.append(d) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["property_dict_duplication_5k"] = snapshot_diff( + "5k property dicts with repeated key strings", snap_before, snap_after + ) + + # 
Compare: same data using tuples (no key duplication) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + all_tuples: list[tuple] = [] + for i in range(5000): + t = ( + f"project.mod_{i // 50}.cls_{i // 10}.fn_{i}", + f"fn_{i}", + i * 10, + i * 10 + 15, + f"src/mod_{i // 50}/cls_{i // 10}.py", + ) + all_tuples.append(t) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["property_tuple_alternative_5k"] = snapshot_diff( + "5k tuples (no key duplication) as alternative", snap_before, snap_after + ) + + return results + + +def measure_peak_usage_full_pipeline() -> dict: + """Simulate the full pipeline memory envelope. + + This exercises the complete data structure lifecycle: + 1. Build FunctionRegistryTrie + 2. Build import mappings + 3. Build class inheritance + 4. Buffer nodes and relationships + 5. Measure peak + """ + results = {} + + gc.collect() + tracemalloc.clear_traces() + snap_baseline = tracemalloc.take_snapshot() + + # Phase 1: Build FunctionRegistryTrie + from codebase_rag.graph_updater import FunctionRegistryTrie + + simple_name_lookup: defaultdict[str, set[str]] = defaultdict(set) + trie = FunctionRegistryTrie(simple_name_lookup=simple_name_lookup) + + for i in range(15_000): + simple_name = f"func_{i % 1000}" + qn = f"project.module_{i // 150}.class_{i // 15}.{simple_name}" + trie.insert(qn, "Function") + simple_name_lookup[simple_name].add(qn) + + gc.collect() + snap_phase1 = tracemalloc.take_snapshot() + results["phase1_trie_15k"] = snapshot_diff( + "Phase 1: FunctionRegistryTrie + SimpleNameLookup (15k entries)", + snap_baseline, snap_phase1, + ) + + # Phase 2: Import mappings + import_mapping: dict[str, dict[str, str]] = {} + for i in range(1500): + module_qn = f"project.module_{i}" + imports = {f"sym_{j}": f"ext.pkg_{j}.sym_{j}" for j in range(25)} + import_mapping[module_qn] = imports + + gc.collect() + snap_phase2 = tracemalloc.take_snapshot() + results["phase2_imports_1500_modules"] = 
snapshot_diff( + "Phase 2: import_mapping (1500 modules x 25 imports)", + snap_phase1, snap_phase2, + ) + + # Phase 3: Class inheritance + class_inheritance: dict[str, list[str]] = {} + for i in range(5000): + class_qn = f"project.module_{i // 50}.Class_{i}" + parents = [f"project.module_{i // 50}.Base_{j}" for j in range(2)] + class_inheritance[class_qn] = parents + + gc.collect() + snap_phase3 = tracemalloc.take_snapshot() + results["phase3_inheritance_5k"] = snapshot_diff( + "Phase 3: class_inheritance (5k classes x 2 parents)", + snap_phase2, snap_phase3, + ) + + # Phase 4: Node + relationship buffers + node_buffer: list[tuple[str, dict]] = [] + for i in range(10_000): + node_buffer.append(( + "Function", + { + "qualified_name": f"project.mod_{i // 100}.cls_{i // 10}.fn_{i}", + "name": f"fn_{i}", + "start_line": i * 5, + "end_line": i * 5 + 10, + }, + )) + + rel_groups: defaultdict[tuple, list[dict]] = defaultdict(list) + for i in range(20_000): + pattern = ("Function", "qualified_name", "CALLS", "Function", "qualified_name") + rel_groups[pattern].append({ + "from_val": f"project.mod.fn_{i}", + "to_val": f"project.mod.fn_{i + 1}", + "props": {}, + }) + + gc.collect() + snap_phase4 = tracemalloc.take_snapshot() + results["phase4_buffers_10k_nodes_20k_rels"] = snapshot_diff( + "Phase 4: node_buffer (10k) + rel_groups (20k)", + snap_phase3, snap_phase4, + ) + + # Total from baseline + results["total_pipeline_memory"] = snapshot_diff( + "TOTAL: Full pipeline memory (all phases combined)", + snap_baseline, snap_phase4, + ) + + # Peak usage + current, peak = tracemalloc.get_traced_memory() + results["peak_traced_memory"] = { + "current": current, + "current_human": format_bytes(current), + "peak": peak, + "peak_human": format_bytes(peak), + } + + return results + + +def main() -> None: + tracemalloc.start(25) # 25 frames for stack traces + + all_results: dict[str, dict] = {} + + print("=" * 70) + print("MEMORY ALLOCATION PROFILING REPORT") + print("=" * 70) + + 
print("\n[1/7] Measuring core data structure sizes...") + all_results["data_structures"] = measure_object_sizes() + + print("[2/7] Profiling tree-sitter parsing...") + all_results["tree_sitter"] = measure_tree_sitter_parsing() + + print("[3/7] Profiling GraphLoader JSON loading...") + all_results["graph_loader"] = measure_graph_loader_json() + + print("[4/7] Profiling EmbeddingCache...") + all_results["embedding_cache"] = measure_embedding_cache() + + print("[5/7] Measuring GC pressure...") + all_results["gc_pressure"] = measure_gc_pressure() + + print("[6/7] Measuring string duplication overhead...") + all_results["string_duplication"] = measure_string_duplication() + + print("[7/7] Measuring peak usage in full pipeline simulation...") + all_results["full_pipeline"] = measure_peak_usage_full_pipeline() + + tracemalloc.stop() + + # Print summary report + print("\n" + "=" * 70) + print("RESULTS SUMMARY") + print("=" * 70) + + for section_name, section_data in all_results.items(): + print(f"\n--- {section_name.upper()} ---") + for key, value in section_data.items(): + if isinstance(value, dict) and "label" in value: + total = value.get("total_new_alloc_human", value.get("peak_human", "N/A")) + print(f" {value['label']}") + print(f" Total new allocation: {total}") + if "top_allocators" in value: + for i, alloc in enumerate(value["top_allocators"][:5]): + print(f" [{i+1}] {alloc['size_diff_human']} ({alloc['count_diff']} objects) - {alloc['file'][:80]}") + elif isinstance(value, dict) and "current_human" in value: + print(f" Current traced: {value['current_human']}") + print(f" Peak traced: {value['peak_human']}") + elif isinstance(value, dict) and "temp_objects_created" in value: + print(f" {value['label']}") + print(f" Temp objects created: {value['temp_objects_created']}") + for gen in range(3): + before = value[f"gc_gen{gen}_before"] + after = value[f"gc_gen{gen}_after"] + print(f" Gen{gen}: collections {before['collections']} -> {after['collections']}, collected 
{before['collected']} -> {after['collected']}") + + # Save detailed JSON + output_path = PROJECT_ROOT / "optimize" / "memory_profile_results.json" + with open(output_path, "w") as f: + json.dump(all_results, f, indent=2, default=str) + print(f"\nDetailed results saved to: {output_path}") + + +if __name__ == "__main__": + main() diff --git a/optimize/memory_profile_results.json b/optimize/memory_profile_results.json new file mode 100644 index 000000000..f8cb642db --- /dev/null +++ b/optimize/memory_profile_results.json @@ -0,0 +1,1482 @@ +{ + "data_structures": { + "FunctionRegistryTrie_10k_insert": { + "label": "FunctionRegistryTrie: insert 10k qualified names", + "total_new_alloc": 3681520, + "total_new_alloc_human": "3.5 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:56", + "size_diff": 1079880, + "size_diff_human": "1.0 MiB", + "count_diff": 8999 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:51", + "size_diff": 1062648, + "size_diff_human": "1.0 MiB", + "count_diff": 13203 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:61", + "size_diff": 776790, + "size_diff_human": "758.6 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:46", + "size_diff": 553818, + "size_diff_human": "540.8 KiB", + "count_diff": 11101 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:44", + "size_diff": 207672, + "size_diff_human": "202.8 KiB", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 312, + "size_diff_human": "312.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 
312, + "size_diff_human": "312.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:60", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ], + "entries_size": 207616, + "entry_count": 10000 + }, + "flat_dict_10k_baseline": { + "label": "Flat dict: 10k entries baseline", + "total_new_alloc": 985022, + "total_new_alloc_human": "961.9 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:78", + "size_diff": 776790, + "size_diff_human": "758.6 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:79", + "size_diff": 207552, + "size_diff_human": "202.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 296, + "size_diff_human": "296.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 296, + "size_diff_human": "296.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:77", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "SimpleNameLookup_10k": { + "label": "SimpleNameLookup: 10k entries, 500 unique names", + "total_new_alloc": 1935779, + "total_new_alloc_human": "1.8 MiB", + "top_allocators": [ + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:94", + "size_diff": 1144992, + "size_diff_human": "1.1 MiB", + "count_diff": 1001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:93", + "size_diff": 765700, + "size_diff_human": "747.8 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:92", + "size_diff": 24439, + "size_diff_human": "23.9 KiB", + "count_diff": 501 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 280, + "size_diff_human": "280.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 280, + "size_diff_human": "280.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:91", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "BoundedASTCache_1k_entries": { + "label": "BoundedASTCache (OrderedDict): 1k entries", + "total_new_alloc": 585087, + "total_new_alloc_human": "571.4 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:404", + "size_diff": 141935, + "size_diff_human": "138.6 KiB", + "count_diff": 3001 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:1167", + "size_diff": 104000, + "size_diff_human": "101.6 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:113", + 
"size_diff": 85272, + "size_diff_human": "83.3 KiB", + "count_diff": 1002 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:111", + "size_diff": 64890, + "size_diff_human": "63.4 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:432", + "size_diff": 64890, + "size_diff_human": "63.4 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:359", + "size_diff": 55944, + "size_diff_human": "54.6 KiB", + "count_diff": 999 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:528", + "size_diff": 35540, + "size_diff_human": "34.7 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:377", + "size_diff": 32000, + "size_diff_human": "31.2 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 264, + "size_diff_human": "264.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 264, + "size_diff_human": "264.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:110", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "node_buffer_5k": { + "label": "node_buffer: 5k buffered nodes", + "total_new_alloc": 2460116, + "total_new_alloc_human": "2.3 MiB", + 
"top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:128", + "size_diff": 920000, + "size_diff_human": "898.4 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:129", + "size_diff": 352290, + "size_diff_human": "344.0 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:126", + "size_diff": 321600, + "size_diff_human": "314.1 KiB", + "count_diff": 4997 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:133", + "size_diff": 308400, + "size_diff_human": "301.2 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:130", + "size_diff": 238890, + "size_diff_human": "233.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:132", + "size_diff": 159200, + "size_diff_human": "155.5 KiB", + "count_diff": 4975 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:131", + "size_diff": 159168, + "size_diff_human": "155.4 KiB", + "count_diff": 4974 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 240, + "size_diff_human": "240.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 240, + "size_diff_human": "240.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:125", + "size_diff": 32, + 
"size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "rel_groups_10k": { + "label": "rel_groups: 10k buffered relationships", + "total_new_alloc": 3763656, + "total_new_alloc_human": "3.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:149", + "size_diff": 1925336, + "size_diff_human": "1.8 MiB", + "count_diff": 20003 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:152", + "size_diff": 640000, + "size_diff_human": "625.0 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:151", + "size_diff": 598894, + "size_diff_human": "584.9 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:150", + "size_diff": 598890, + "size_diff_human": "584.9 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 224, + "size_diff_human": "224.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 224, + "size_diff_human": "224.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:147", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "import_mapping_2k_modules": { + "label": "import_mapping: 2k modules x 20 imports each", + "total_new_alloc": 5839298, + "total_new_alloc_human": "5.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:169", + 
"size_diff": 5540000, + "size_diff_human": "5.3 MiB", + "count_diff": 82000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:167", + "size_diff": 128000, + "size_diff_human": "125.0 KiB", + "count_diff": 2000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:166", + "size_diff": 118890, + "size_diff_human": "116.1 KiB", + "count_diff": 2000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:170", + "size_diff": 51904, + "size_diff_human": "50.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 208, + "size_diff_human": "208.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 208, + "size_diff_human": "208.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:165", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "class_inheritance_3k": { + "label": "class_inheritance: 3k classes x 3 parents", + "total_new_alloc": 1202898, + "total_new_alloc_human": "1.1 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:184", + "size_diff": 893044, + "size_diff_human": "872.1 KiB", + "count_diff": 14999 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:183", + "size_diff": 205590, + "size_diff_human": "200.8 KiB", + "count_diff": 3000 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:185", + "size_diff": 103792, + "size_diff_human": "101.4 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:182", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + } + }, + "tree_sitter": { + "parse_all_project_files": { + "label": "Parse 343 Python files (5.4 MiB source)", + "total_new_alloc": 88243514, + "total_new_alloc_human": "84.2 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:224", + "size_diff": 82541776, + "size_diff_human": "78.7 MiB", + "count_diff": 903039 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:1020", + "size_diff": 5679234, + "size_diff_human": "5.4 MiB", + "count_diff": 337 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:225", + "size_diff": 22024, + "size_diff_human": "21.5 KiB", + "count_diff": 344 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 168, + "size_diff_human": "168.0 B", + "count_diff": 2 + }, + { + "file": 
"/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 168, + "size_diff_human": "168.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:218", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:223", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ], + "file_count": 343, + "source_bytes": 5668113 + }, + "ast_node_retention": { + "label": "Retaining 343 AST root nodes", + "total_new_alloc": 25128, + "total_new_alloc_human": "24.5 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:243", + "size_diff": 24768, + "size_diff_human": "24.2 KiB", + "count_diff": 344 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 152, + "size_diff_human": "152.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 152, + "size_diff_human": "152.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ] + }, + "ast_walk_function_extraction": { + "label": "Walking ASTs, collected 5578 function/class nodes", + "total_new_alloc": 91566344, + "total_new_alloc_human": "87.3 MiB", + "top_allocators": [ + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:263", + "size_diff": 91518856, + "size_diff_human": "87.3 MiB", + "count_diff": 1673834 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:262", + "size_diff": 47104, + "size_diff_human": "46.0 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 136, + "size_diff_human": "136.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 136, + "size_diff_human": "136.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:258", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ], + "function_class_count": 5578 + } + }, + "graph_loader": { + "graph_loader_5k_nodes_8k_rels": { + "label": "GraphLoader: load 5k nodes + 8k relationships from JSON", + "total_new_alloc": 9476802, + "total_new_alloc_human": "9.0 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/json/decoder.py:353", + "size_diff": 6787632, + "size_diff_human": "6.5 MiB", + "count_diff": 111693 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:74", + "size_diff": 770760, + "size_diff_human": "752.7 KiB", + "count_diff": 16000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:83", + "size_diff": 587480, + "size_diff_human": "573.7 KiB", + "count_diff": 10001 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:82", + "size_diff": 587480, + "size_diff_human": "573.7 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:61", + "size_diff": 443080, + "size_diff_human": "432.7 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:68", + "size_diff": 147480, + "size_diff_human": "144.0 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:80", + "size_diff": 67168, + "size_diff_human": "65.6 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:70", + "size_diff": 41880, + "size_diff_human": "40.9 KiB", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:66", + "size_diff": 41824, + "size_diff_human": "40.8 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/.venv/lib/python3.12/site-packages/loguru/_logger.py:2003", + "size_diff": 200, + "size_diff_human": "200.0 B", + "count_diff": 4 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:404", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:52", + "size_diff": 120, + "size_diff_human": "120.0 B", + 
"count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/.venv/lib/python3.12/site-packages/loguru/_handler.py:120", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 1 + } + ] + }, + "graph_loader_property_index": { + "label": "GraphLoader: build property index on qualified_name", + "total_new_alloc": 544224, + "total_new_alloc_human": "531.5 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:99", + "size_diff": 440120, + "size_diff_human": "429.8 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:100", + "size_diff": 103856, + "size_diff_human": "101.4 KiB", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 96, + "size_diff_human": "96.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 96, + "size_diff_human": "96.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ] + } + }, + "embedding_cache": { + "embedding_cache_2k_768dim": { + "label": "EmbeddingCache: 2k entries x 768-dim embeddings", + "total_new_alloc": 50998237, + "total_new_alloc_human": "48.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:375", + "size_diff": 50736000, + "size_diff_human": "48.4 MiB", + "count_diff": 1540000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/embedder.py:26", + "size_diff": 210000, + "size_diff_human": "205.1 KiB", + "count_diff": 2000 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/embedder.py:32", + "size_diff": 51904, + "size_diff_human": "50.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:374", + "size_diff": 85, + "size_diff_human": "85.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:373", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ], + "cache_dict_size": 51968, + "entry_count": 2000 + } + }, + "gc_pressure": { + "gc_pressure_simulation": { + "label": "GC pressure during simulated file processing (1k files x 20 funcs)", + "temp_objects_created": 20000, + "gc_gen0_before": { + "collections": 1785, + "collected": 8016, + "uncollectable": 0 + }, + "gc_gen0_after": { + "collections": 1785, + "collected": 8016, + "uncollectable": 0 + }, + "gc_gen1_before": { + "collections": 155, + "collected": 1262, + "uncollectable": 0 + }, + "gc_gen1_after": { + "collections": 155, + "collected": 1262, + "uncollectable": 0 + }, + "gc_gen2_before": { + "collections": 40, + "collected": 279, + "uncollectable": 0 + }, + "gc_gen2_after": { + "collections": 41, + "collected": 279, + "uncollectable": 0 + } + } + }, + "string_duplication": { + "property_dict_duplication_5k": { + "label": "5k property dicts with repeated key strings", + 
"total_new_alloc": 2180068, + "total_new_alloc_human": "2.1 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:449", + "size_diff": 920000, + "size_diff_human": "898.4 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:450", + "size_diff": 352290, + "size_diff_human": "344.0 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:454", + "size_diff": 308400, + "size_diff_human": "301.2 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:451", + "size_diff": 238890, + "size_diff_human": "233.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:453", + "size_diff": 159200, + "size_diff_human": "155.5 KiB", + "count_diff": 4975 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:452", + "size_diff": 159168, + "size_diff_human": "155.4 KiB", + "count_diff": 4974 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:456", + "size_diff": 41824, + "size_diff_human": "40.8 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:447", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:448", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "property_tuple_alternative_5k": { + "label": "5k tuples (no key duplication) as alternative", + "total_new_alloc": 1660012, + "total_new_alloc_human": "1.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:471", + "size_diff": 400000, + "size_diff_human": "390.6 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:472", + "size_diff": 352290, + "size_diff_human": "344.0 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:476", + "size_diff": 308400, + "size_diff_human": "301.2 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:473", + "size_diff": 238890, + "size_diff_human": "233.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:475", + "size_diff": 159200, + "size_diff_human": "155.5 KiB", + "count_diff": 4975 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:474", + "size_diff": 159168, + "size_diff_human": "155.4 KiB", + "count_diff": 4974 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:478", + "size_diff": 41824, + "size_diff_human": "40.8 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": 
"/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:470", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + } + }, + "full_pipeline": { + "phase1_trie_15k": { + "label": "Phase 1: FunctionRegistryTrie + SimpleNameLookup (15k entries)", + "total_new_alloc": 6411617, + "total_new_alloc_human": "6.1 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:56", + "size_diff": 1679760, + "size_diff_human": "1.6 MiB", + "count_diff": 13998 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:51", + "size_diff": 1574648, + "size_diff_human": "1.5 MiB", + "count_diff": 18203 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:513", + "size_diff": 1150200, + "size_diff_human": "1.1 MiB", + "count_diff": 15000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:46", + "size_diff": 788278, + "size_diff_human": "769.8 KiB", + "count_diff": 16101 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:515", + "size_diff": 754088, + "size_diff_human": "736.4 KiB", + "count_diff": 2002 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:44", + "size_diff": 415088, + "size_diff_human": "405.4 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:512", + "size_diff": 48939, + "size_diff_human": "47.8 KiB", + 
"count_diff": 1001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:509", + "size_diff": 176, + "size_diff_human": "176.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:508", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:40", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:39", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:511", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "phase2_imports_1500_modules": { + "label": "Phase 2: import_mapping (1500 modules x 25 imports)", + "total_new_alloc": 5287898, + "total_new_alloc_human": "5.0 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:528", + "size_diff": 5140500, + "size_diff_human": "4.9 MiB", + "count_diff": 78000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:527", + "size_diff": 88890, + "size_diff_human": "86.8 KiB", + 
"count_diff": 1500 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:529", + "size_diff": 51904, + "size_diff_human": "50.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:39", + "size_diff": 2888, + "size_diff_human": "2.8 KiB", + "count_diff": 31 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:173", + "size_diff": 1872, + "size_diff_human": "1.8 KiB", + "count_diff": 15 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:23", + "size_diff": 768, + "size_diff_human": "768.0 B", + "count_diff": 16 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:503", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 6 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:502", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 6 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:31", + "size_diff": 184, + "size_diff_human": "184.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:519", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:525", + "size_diff": 
64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:35", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ] + }, + "phase3_inheritance_5k": { + "label": "Phase 3: class_inheritance (5k classes x 2 parents)", + "total_new_alloc": 1542592, + "total_new_alloc_human": "1.5 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:542", + "size_diff": 1089000, + "size_diff_human": "1.0 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:541", + "size_diff": 343390, + "size_diff_human": "335.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:543", + "size_diff": 103792, + "size_diff_human": "101.4 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:39", + "size_diff": 2888, + "size_diff_human": "2.8 KiB", + "count_diff": 31 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:173", + "size_diff": 1961, + "size_diff_human": "1.9 KiB", + "count_diff": 15 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:23", + "size_diff": 765, + "size_diff_human": "765.0 B", + "count_diff": 16 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:31", + "size_diff": 184, + "size_diff_human": "184.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:502", + 
"size_diff": 160, + "size_diff_human": "160.0 B", + "count_diff": 5 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:503", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:539", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:35", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:540", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "phase4_buffers_10k_nodes_20k_rels": { + "label": "Phase 4: node_buffer (10k) + rel_groups (20k)", + "total_new_alloc": 11864970, + "total_new_alloc_human": "11.3 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:568", + "size_diff": 3853176, + "size_diff_human": "3.7 MiB", + "count_diff": 40003 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:557", + "size_diff": 1840000, + "size_diff_human": "1.8 MiB", + "count_diff": 20000 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:571", + "size_diff": 1280000, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:570", + "size_diff": 1208894, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:569", + "size_diff": 1208890, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:558", + "size_diff": 706790, + "size_diff_human": "690.2 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:555", + "size_diff": 645120, + "size_diff_human": "630.0 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:559", + "size_diff": 478890, + "size_diff_human": "467.7 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:561", + "size_diff": 318400, + "size_diff_human": "310.9 KiB", + "count_diff": 9950 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:560", + "size_diff": 318336, + "size_diff_human": "310.9 KiB", + "count_diff": 9948 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:39", + "size_diff": 2888, + "size_diff_human": "2.8 KiB", + "count_diff": 31 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:173", + "size_diff": 1961, + "size_diff_human": "1.9 KiB", + "count_diff": 15 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:23", + "size_diff": 765, + "size_diff_human": "765.0 B", + "count_diff": 16 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:31", + "size_diff": 184, + "size_diff_human": "184.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:126", + "size_diff": 96, + "size_diff_human": "96.0 B", + "count_diff": 3 + } + ] + }, + "total_pipeline_memory": { + "label": "TOTAL: Full pipeline memory (all phases combined)", + "total_new_alloc": 25106981, + "total_new_alloc_human": "23.9 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:528", + "size_diff": 5140500, + "size_diff_human": "4.9 MiB", + "count_diff": 78000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:568", + "size_diff": 3853176, + "size_diff_human": "3.7 MiB", + "count_diff": 40003 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:557", + "size_diff": 1840000, + "size_diff_human": "1.8 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:56", + "size_diff": 1679760, + "size_diff_human": "1.6 MiB", + "count_diff": 13998 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:51", + "size_diff": 1574648, + "size_diff_human": "1.5 MiB", + "count_diff": 18203 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:571", + "size_diff": 1280000, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:570", + "size_diff": 1208894, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:569", + "size_diff": 1208890, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:513", + "size_diff": 1150200, + "size_diff_human": "1.1 MiB", + "count_diff": 15000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:542", + "size_diff": 1089000, + "size_diff_human": "1.0 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:46", + "size_diff": 788278, + "size_diff_human": "769.8 KiB", + "count_diff": 16101 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:515", + "size_diff": 754088, + "size_diff_human": "736.4 KiB", + "count_diff": 2002 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:558", + "size_diff": 706790, + "size_diff_human": "690.2 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:555", + "size_diff": 645120, + "size_diff_human": "630.0 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:559", + "size_diff": 478890, + "size_diff_human": "467.7 KiB", + "count_diff": 10000 + } + ] + }, + "peak_traced_memory": { + "current": 25128953, + "current_human": "24.0 MiB", + "peak": 25135561, + "peak_human": "24.0 MiB" + } + } +} diff --git a/optimize/profile_io.py b/optimize/profile_io.py new file mode 100644 index 000000000..c71d98ecd --- /dev/null +++ b/optimize/profile_io.py @@ -0,0 +1,431 @@ +import hashlib +import json +import statistics +import sys +import time +from collections import defaultdict +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import _hash_file, _load_hash_cache, _save_hash_cache +from codebase_rag.parser_loader import load_parsers +from codebase_rag.parsers.utils import safe_decode_with_fallback 
+from codebase_rag.services.protobuf_service import ProtobufFileIngestor +from codebase_rag.utils.path_utils import should_skip_path + + +REPO_PATH = Path(__file__).resolve().parent.parent +RUNS = 5 + + +def benchmark(func, *args, runs=RUNS, label=""): + times = [] + result = None + for _ in range(runs): + start = time.perf_counter() + result = func(*args) + elapsed = time.perf_counter() - start + times.append(elapsed) + avg = statistics.mean(times) + std = statistics.stdev(times) if len(times) > 1 else 0.0 + med = statistics.median(times) + return { + "label": label, + "avg_ms": avg * 1000, + "median_ms": med * 1000, + "std_ms": std * 1000, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "runs": runs, + "result": result, + } + + +def collect_py_files(): + files = [] + for f in REPO_PATH.rglob("*.py"): + if not should_skip_path(f, REPO_PATH): + files.append(f) + return files + + +def profile_file_hashing(files): + print("\n=== FILE HASHING (SHA-256) ===") + results = [] + total_bytes = 0 + for f in files: + total_bytes += f.stat().st_size + + def hash_all(): + for f in files: + _hash_file(f) + + r = benchmark(hash_all, label=f"hash {len(files)} files ({total_bytes/1024:.0f} KB)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, median={r['median_ms']:.2f}ms, std={r['std_ms']:.2f}ms") + + per_file_ms = r['avg_ms'] / len(files) if files else 0 + print(f" Per file average: {per_file_ms:.3f}ms") + print(f" Throughput: {total_bytes / (r['avg_ms']/1000) / 1024 / 1024:.1f} MB/s") + + single_sizes = [(f, f.stat().st_size) for f in files] + single_sizes.sort(key=lambda x: x[1], reverse=True) + for f, sz in single_sizes[:5]: + r2 = benchmark(_hash_file, f, runs=10, label=f"hash {f.relative_to(REPO_PATH)} ({sz}B)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.3f}ms") + + return results + + +def profile_file_reading(files): + print("\n=== FILE READING (read_bytes + parse) ===") + results = [] + + def read_all_bytes(): + 
for f in files: + f.read_bytes() + + total_bytes = sum(f.stat().st_size for f in files) + r = benchmark(read_all_bytes, label=f"read_bytes {len(files)} files ({total_bytes/1024:.0f} KB)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, median={r['median_ms']:.2f}ms") + print(f" Throughput: {total_bytes / (r['avg_ms']/1000) / 1024 / 1024:.1f} MB/s") + + def read_all_text(): + for f in files: + f.read_text(encoding="utf-8") + + r2 = benchmark(read_all_text, label=f"read_text {len(files)} files") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms, median={r2['median_ms']:.2f}ms") + + return results + + +def profile_tree_sitter_parsing(files): + print("\n=== TREE-SITTER PARSING ===") + results = [] + parsers, queries = load_parsers() + py_parser = parsers.get(cs.SupportedLanguage.PYTHON) + if not py_parser: + print(" Python parser not available, skipping") + return results + + py_files = [f for f in files if f.suffix == ".py"] + file_bytes = [(f, f.read_bytes()) for f in py_files] + + def parse_all(): + for f, src in file_bytes: + py_parser.parse(src) + + r = benchmark(parse_all, label=f"parse {len(py_files)} Python files") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, median={r['median_ms']:.2f}ms") + per_file_ms = r['avg_ms'] / len(py_files) if py_files else 0 + print(f" Per file average: {per_file_ms:.3f}ms") + + file_bytes_sorted = sorted(file_bytes, key=lambda x: len(x[1]), reverse=True) + for f, src in file_bytes_sorted[:5]: + r2 = benchmark(py_parser.parse, src, runs=10, + label=f"parse {f.relative_to(REPO_PATH)} ({len(src)}B)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.3f}ms") + + return results + + +def profile_json_serialization(): + print("\n=== JSON SERIALIZATION ===") + results = [] + + small = {"key": "value", "num": 42, "arr": [1, 2, 3]} + r = benchmark(json.dumps, small, runs=1000, label="json.dumps small dict") + results.append(r) + print(f" {r['label']}: 
avg={r['avg_ms']:.4f}ms") + + medium_nodes = [ + {"node_id": i, "labels": ["Function"], "properties": {"name": f"func_{i}", "path": f"src/mod_{i//10}.py", "start_line": i*10, "end_line": i*10+5}} + for i in range(1000) + ] + medium_rels = [ + {"from_id": i, "to_id": (i+1) % 1000, "type": "CALLS", "properties": {}} + for i in range(2000) + ] + medium = {"nodes": medium_nodes, "relationships": medium_rels, "metadata": {"total_nodes": 1000, "total_relationships": 2000}} + + r2 = benchmark(json.dumps, medium, runs=5, label=f"json.dumps graph (1K nodes, 2K rels, {len(json.dumps(medium))/1024:.0f}KB)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + + json_str = json.dumps(medium) + r3 = benchmark(json.loads, json_str, runs=5, label=f"json.loads graph ({len(json_str)/1024:.0f}KB)") + results.append(r3) + print(f" {r3['label']}: avg={r3['avg_ms']:.2f}ms") + + large_nodes = medium_nodes * 10 + large_rels = medium_rels * 10 + large = {"nodes": large_nodes, "relationships": large_rels, "metadata": {"total_nodes": 10000, "total_relationships": 20000}} + large_json = json.dumps(large) + r4 = benchmark(json.dumps, large, runs=3, label=f"json.dumps large graph (10K nodes, 20K rels, {len(large_json)/1024:.0f}KB)") + results.append(r4) + print(f" {r4['label']}: avg={r4['avg_ms']:.2f}ms") + + r5 = benchmark(json.loads, large_json, runs=3, label=f"json.loads large graph ({len(large_json)/1024:.0f}KB)") + results.append(r5) + print(f" {r5['label']}: avg={r5['avg_ms']:.2f}ms") + + with_indent = lambda d: json.dumps(d, indent=2, ensure_ascii=False) + r6 = benchmark(with_indent, large, runs=3, label=f"json.dumps large graph (indent=2)") + results.append(r6) + print(f" {r6['label']}: avg={r6['avg_ms']:.2f}ms") + + return results + + +def profile_protobuf_serialization(): + print("\n=== PROTOBUF SERIALIZATION ===") + results = [] + try: + import codec.schema_pb2 as pb + except ImportError: + print(" protobuf schema not available, skipping") + return results + + 
import tempfile, shutil + tmp_dir = Path(tempfile.mkdtemp()) + try: + ingestor = ProtobufFileIngestor(output_path=str(tmp_dir)) + + for i in range(100): + ingestor.ensure_node_batch("Function", { + "qualified_name": f"project.mod.func_{i}", + "name": f"func_{i}", + "path": f"src/mod.py", + "start_line": i * 10, + "end_line": i * 10 + 5, + }) + for i in range(200): + ingestor.ensure_relationship_batch( + ("Function", "qualified_name", f"project.mod.func_{i % 100}"), + "CALLS", + ("Function", "qualified_name", f"project.mod.func_{(i+1) % 100}"), + ) + + def flush_protobuf(): + ingestor.flush_all() + + r = benchmark(flush_protobuf, runs=5, label="protobuf flush (100 nodes, 200 rels)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms") + + index_file = tmp_dir / "graph_code_index.pb" + if index_file.exists(): + size = index_file.stat().st_size + print(f" Output size: {size} bytes") + + def read_protobuf(): + idx = pb.GraphCodeIndex() + idx.ParseFromString(index_file.read_bytes()) + return idx + + r2 = benchmark(read_protobuf, runs=10, label=f"protobuf parse ({size}B)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.3f}ms") + + for node_path in tmp_dir.iterdir(): + if node_path.suffix == ".pb": + sz = node_path.stat().st_size + print(f" Protobuf file: {node_path.name} ({sz} bytes)") + + finally: + shutil.rmtree(tmp_dir) + + return results + + +def profile_hash_cache_io(): + print("\n=== HASH CACHE I/O ===") + results = [] + + import tempfile + tmp = Path(tempfile.mkdtemp()) + try: + cache_data = {f"path/to/file_{i}.py": hashlib.sha256(f"content_{i}".encode()).hexdigest() for i in range(1000)} + cache_path = tmp / ".file_hashes.json" + + r = benchmark(_save_hash_cache, cache_path, cache_data, runs=5, label=f"save hash cache ({len(cache_data)} entries)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, size={cache_path.stat().st_size/1024:.1f}KB") + + r2 = benchmark(_load_hash_cache, cache_path, runs=5, 
label=f"load hash cache ({len(cache_data)} entries)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + finally: + import shutil + shutil.rmtree(tmp) + + return results + + +def profile_file_traversal(): + print("\n=== FILESYSTEM TRAVERSAL ===") + results = [] + + def rglob_all(): + return list(REPO_PATH.rglob("*")) + + r = benchmark(rglob_all, runs=5, label="rglob('*') entire repo") + results.append(r) + all_paths = r['result'] + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, found {len(all_paths)} paths") + + def rglob_with_filter(): + eligible = [] + for f in REPO_PATH.rglob("*"): + if f.is_file() and not should_skip_path(f, REPO_PATH): + eligible.append(f) + return eligible + + r2 = benchmark(rglob_with_filter, runs=5, label="rglob + should_skip_path filter") + results.append(r2) + eligible = r2['result'] + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms, eligible {len(eligible)} files") + + overhead_ms = r2['avg_ms'] - r['avg_ms'] + print(f" Filter overhead: {overhead_ms:.2f}ms") + + return results + + +def profile_source_extraction(): + print("\n=== SOURCE EXTRACTION ===") + results = [] + from codebase_rag.utils.source_extraction import extract_source_lines + + py_files = [f for f in REPO_PATH.rglob("*.py") + if not should_skip_path(f, REPO_PATH) and f.stat().st_size > 100] + if not py_files: + print(" No Python files found") + return results + + target = py_files[0] + line_count = len(target.read_text().splitlines()) + + def extract_50_lines(): + return extract_source_lines(target, 1, min(50, line_count)) + + r = benchmark(extract_50_lines, runs=20, label=f"extract 50 lines from {target.relative_to(REPO_PATH)}") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.3f}ms") + + def extract_all_files_10_lines(): + for f in py_files[:50]: + extract_source_lines(f, 1, 10) + + r2 = benchmark(extract_all_files_10_lines, runs=5, label=f"extract 10 lines from {min(50, len(py_files))} files") + results.append(r2) + print(f" 
{r2['label']}: avg={r2['avg_ms']:.2f}ms") + + return results + + +def profile_embedding_cache_io(): + print("\n=== EMBEDDING CACHE I/O ===") + results = [] + import tempfile + + from codebase_rag.embedder import EmbeddingCache + + tmp = Path(tempfile.mkdtemp()) + try: + cache = EmbeddingCache(path=tmp / "embedding_cache.json") + for i in range(500): + cache.put(f"def func_{i}(): pass", [float(j) / 768 for j in range(768)]) + + def save_cache(): + cache.save() + + r = benchmark(save_cache, runs=5, label=f"save embedding cache ({len(cache)} entries, 768-dim)") + results.append(r) + size = (tmp / "embedding_cache.json").stat().st_size + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, size={size/1024/1024:.2f}MB") + + def load_cache(): + new_cache = EmbeddingCache(path=tmp / "embedding_cache.json") + new_cache.load() + return new_cache + + r2 = benchmark(load_cache, runs=5, label=f"load embedding cache ({size/1024/1024:.2f}MB)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + print(f" Throughput: {size / (r2['avg_ms']/1000) / 1024 / 1024:.1f} MB/s") + finally: + import shutil + shutil.rmtree(tmp) + + return results + + +def profile_directory_structure(): + print("\n=== DIRECTORY STRUCTURE IDENTIFICATION ===") + results = [] + from codebase_rag.language_spec import LANGUAGE_SPECS + + package_indicators = set() + for spec in LANGUAGE_SPECS.values(): + package_indicators.update(spec.package_indicators) + + def identify_packages(): + dirs = set() + for p in REPO_PATH.rglob("*"): + if p.is_dir() and not should_skip_path(p, REPO_PATH): + dirs.add(p) + packages = 0 + for d in dirs: + for indicator in package_indicators: + if (d / indicator).exists(): + packages += 1 + break + return packages + + r = benchmark(identify_packages, runs=5, label="identify package structure") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, packages={r['result']}") + + return results + + +def main(): + print("=" * 70) + print("I/O AND SERIALIZATION 
LATENCY PROFILE") + print(f"Repo: {REPO_PATH}") + print("=" * 70) + + all_results = [] + files = collect_py_files() + print(f"\nPython files for profiling: {len(files)}") + + all_results.extend(profile_file_traversal()) + all_results.extend(profile_file_reading(files)) + all_results.extend(profile_file_hashing(files)) + all_results.extend(profile_tree_sitter_parsing(files)) + all_results.extend(profile_source_extraction()) + all_results.extend(profile_json_serialization()) + all_results.extend(profile_protobuf_serialization()) + all_results.extend(profile_hash_cache_io()) + all_results.extend(profile_embedding_cache_io()) + all_results.extend(profile_directory_structure()) + + print("\n" + "=" * 70) + print("RANKED SUMMARY (by avg wall-clock time)") + print("=" * 70) + ranked = sorted(all_results, key=lambda x: x['avg_ms'], reverse=True) + for i, r in enumerate(ranked, 1): + print(f" {i:2d}. [{r['avg_ms']:10.2f}ms] {r['label']}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 12160521b..fa8464872 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,28 +1,65 @@ [project] name = "code-graph-rag" -version = "0.0.60" +version = "0.0.187" description = "The ultimate RAG for your monorepo. 
Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs" -readme = "README.md" +readme = "PYPI_README.md" requires-python = ">=3.12" +license = "MIT" +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Code Generators", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +keywords = [ + "rag", + "retrieval-augmented-generation", + "knowledge-graph", + "code-analysis", + "tree-sitter", + "mcp", + "mcp-server", + "llm", + "graph-database", + "semantic-search", + "codebase", + "memgraph", + "developer-tools", + "monorepo", +] dependencies = [ "loguru>=0.7.3", - "mcp>=1.21.1", - "pydantic-ai>=1.27.0", - "pydantic-settings>=2.0.0", - "pymgclient>=1.4.0", - "python-dotenv>=1.1.0", + "mcp>=1.25.0", + "pydantic-ai>=1.102.0", + "pydantic-settings>=2.12.0", + "pymgclient>=1.5.1", + "python-dotenv>=1.2.1", + "tiktoken>=0.12.0", "toml>=0.10.2", - "tree-sitter-python>=0.23.6", - "tree-sitter==0.25.0", + "tree-sitter-python>=0.25.0", + "tree-sitter==0.25.2", "watchdog>=6.0.0", - "typer>=0.12.5", - "rich>=13.7.1", - "prompt-toolkit>=3.0.0", + "typer>=0.21.1", + "rich>=14.2.0", + "prompt-toolkit>=3.0.52", "diff-match-patch>=20241021", - "click>=8.0.0", - "protobuf>=5.27.0", + "click>=8.3.1", + "protobuf>=6.33.5", "defusedxml>=0.7.1", - "huggingface-hub[hf-xet]>=0.36.0", + "huggingface-hub[hf-xet]>=1.7.2", + # TODO: remove once pydantic-ai is upgraded to a release whose code and + # metadata agree on the griffe package. 
pydantic-ai-slim 1.102.0 imports + # `griffe` at runtime but only declares the renamed `griffelib`, so a clean + # `uv sync` omits griffe and leaves codebase_rag unimportable; declare it + # explicitly to keep the environment reproducible. + "griffe>=1.0,<2", ] [project.scripts] @@ -32,8 +69,12 @@ cgr = "codebase_rag.cli:app" [tool.uv] package = true -[tool.setuptools] -packages = ["codebase_rag", "codec"] +[tool.setuptools.packages.find] +include = ["codebase_rag*", "codec*", "cgr*"] +exclude = ["*.tests", "*.tests.*"] + +[tool.setuptools.package-data] +codebase_rag = ["docker-compose.yaml"] [project.optional-dependencies] test = [ @@ -42,6 +83,7 @@ test = [ "pytest-cov>=4.0.0", "pytest-xdist>=3.8.0", "testcontainers>=4.9.0", + "libclang>=18.1.1", ] treesitter-full = [ @@ -52,8 +94,10 @@ treesitter-full = [ "tree-sitter-go>=0.23.4", "tree-sitter-scala>=0.24.0", "tree-sitter-java>=0.23.5", + "tree-sitter-c>=0.24.1", "tree-sitter-cpp>=0.23.0", "tree-sitter-lua>=0.0.19", + "tree-sitter-php>=0.24.1", ] semantic = [ @@ -65,7 +109,7 @@ semantic = [ [tool.ruff] line-length = 88 target-version = "py312" -exclude = ["codec/"] +exclude = ["codec/", "benchmarks/", "optimize/"] [tool.ruff.lint] select = ["E", "F", "W", "I", "UP", "PL", "T201"] @@ -83,6 +127,7 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "**/tests/**" = ["T201"] +"benchmarks/**" = ["T201"] [tool.ruff.format] quote-style = "double" @@ -91,7 +136,7 @@ quote-style = "double" python-version = "3.12" [tool.ty.src] -exclude = ["codebase_rag/tests/test_cypher_queries.py", "codebase_rag/tests/test_code_retrieval.py", "codebase_rag/tests/test_call_resolver.py"] +exclude = ["codebase_rag/tests/test_cypher_queries.py", "codebase_rag/tests/test_code_retrieval.py", "codebase_rag/tests/test_call_resolver.py", "benchmarks/", "optimize/"] [tool.pytest.ini_options] asyncio_mode = "auto" @@ -113,6 +158,7 @@ dev = [ "pre-commit>=4.2.0", "pyinstaller>=6.14.1", "pylint>=4.0.4", + "pytest>=9.0.2", "radon>=6.0.1", "ruff>=0.5.5", 
"semgrep>=1.79.0", @@ -121,7 +167,15 @@ dev = [ "types-toml>=0.10.8.20240310", "vulture>=2.14", ] +docs = [ + "mkdocs>=1.6.1,<2", + "mkdocs-material>=9.7.3", + "mkdocs-minify-plugin>=0.8.0", +] +fuzz = [ + "atheris>=2.3.0", +] [tool.bandit] -exclude_dirs = ["codebase_rag/tests", "scripts"] +exclude_dirs = ["codebase_rag/tests", "scripts", "benchmarks", "optimize"] skips = ["B101"] diff --git a/realtime_updater.py b/realtime_updater.py index 4fd95d5bc..f3bc21f65 100644 --- a/realtime_updater.py +++ b/realtime_updater.py @@ -1,4 +1,5 @@ import sys +import threading import time from pathlib import Path from typing import Annotated @@ -14,7 +15,10 @@ from codebase_rag.config import settings from codebase_rag.constants import ( CYPHER_DELETE_CALLS, + CYPHER_DELETE_FILE, CYPHER_DELETE_MODULE, + DEFAULT_DEBOUNCE_SECONDS, + DEFAULT_MAX_WAIT_SECONDS, IGNORE_PATTERNS, IGNORE_SUFFIXES, KEY_PATH, @@ -32,11 +36,47 @@ class CodeChangeEventHandler(FileSystemEventHandler): - def __init__(self, updater: GraphUpdater): + """ + Handles file system events with debouncing to prevent redundant graph updates. + + The handler implements a hybrid debounce strategy: + - Debounce: Waits for a quiet period after the last change before processing + - Max wait: Ensures updates happen within a maximum time window, even during + continuous editing + + This prevents the graph update process from running repeatedly when a file + is saved multiple times in quick succession (common during active development). 
+ """ + + def __init__( + self, + updater: GraphUpdater, + debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS, + max_wait_seconds: float = DEFAULT_MAX_WAIT_SECONDS, + ): self.updater = updater self.ignore_patterns = IGNORE_PATTERNS self.ignore_suffixes = IGNORE_SUFFIXES - logger.info(logs.WATCHER_ACTIVE) + + # (H) Debounce configuration + self.debounce_seconds = debounce_seconds + self.max_wait_seconds = max_wait_seconds + self.debounce_enabled = debounce_seconds > 0 + + # (H) Thread-safe state for tracking pending changes + self.timers: dict[str, threading.Timer] = {} + self.first_event_time: dict[str, float] = {} + self.pending_events: dict[str, FileSystemEvent] = {} + self.lock = threading.Lock() + + if self.debounce_enabled: + logger.info( + logs.WATCHER_DEBOUNCE_ACTIVE.format( + debounce=debounce_seconds, max_wait=max_wait_seconds + ) + ) + else: + logger.info(logs.WATCHER_ACTIVE) def _is_relevant(self, path_str: str) -> bool: path = Path(path_str) @@ -65,6 +105,108 @@ def dispatch(self, event: FileSystemEvent) -> None: if event.is_directory or not self._is_relevant(src_path): return + # (H) Ignore events that do not change file content (e.g. "opened", "closed") + # (H) before debouncing, so they cannot replace a pending change or reset its timer + if event.event_type not in ( + EventType.MODIFIED, + EventType.CREATED, + EventType.DELETED, + ): + return + + if not self.debounce_enabled: + # (H) No debouncing - process immediately (legacy behavior) + self._process_change(event) + return + + # (H) Debounced processing with hybrid approach + path = Path(src_path) + relative_path_str = str(path.relative_to(self.updater.repo_path)) + current_time = time.time() + + with self.lock: + # (H) Track the first event time for max-wait calculation + if relative_path_str not in self.first_event_time: + self.first_event_time[relative_path_str] = current_time + logger.info( + logs.CHANGE_DEBOUNCING.format( + event_type=event.event_type, + name=path.name, + debounce=self.debounce_seconds, + ) + ) + + # (H) Always store the latest event for this file + self.pending_events[relative_path_str] = event + + # (H) Cancel any existing timer for this file + if relative_path_str in self.timers: + self.timers[relative_path_str].cancel() + 
logger.debug(logs.DEBOUNCE_RESET.format(path=relative_path_str)) + + # (H) Check if max wait time has been exceeded + time_since_first = current_time - self.first_event_time[relative_path_str] + + if time_since_first >= self.max_wait_seconds: + # (H) Max wait exceeded - process immediately + logger.info( + logs.DEBOUNCE_MAX_WAIT.format( + max_wait=self.max_wait_seconds, path=relative_path_str + ) + ) + self._schedule_immediate_processing(relative_path_str) + else: + # (H) Schedule debounced processing + remaining_wait = self.max_wait_seconds - time_since_first + effective_delay = min(self.debounce_seconds, remaining_wait) + timer = threading.Timer( + effective_delay, + self._process_debounced_change, + args=[relative_path_str], + ) + timer.daemon = True + self.timers[relative_path_str] = timer + timer.start() + + logger.debug( + logs.DEBOUNCE_SCHEDULED.format( + path=relative_path_str, + debounce=self.debounce_seconds, + remaining=f"{remaining_wait:.1f}", + ) + ) + + def _schedule_immediate_processing(self, relative_path_str: str) -> None: + """Process a file change immediately (called when max wait is exceeded).""" + # (H) Use a zero-delay timer to process in the timer thread + timer = threading.Timer( + 0, self._process_debounced_change, args=[relative_path_str] + ) + timer.daemon = True + self.timers[relative_path_str] = timer + timer.start() + + def _process_debounced_change(self, relative_path_str: str) -> None: + """Process a debounced file change after the timer fires.""" + with self.lock: + # (H) Retrieve and clear pending state for this file + event = self.pending_events.pop(relative_path_str, None) + self.first_event_time.pop(relative_path_str, None) + self.timers.pop(relative_path_str, None) + + if event is None: + logger.warning(logs.DEBOUNCE_NO_EVENT.format(path=relative_path_str)) + return + + logger.info(logs.DEBOUNCE_PROCESSING.format(path=relative_path_str)) + self._process_change(event) + + def _process_change(self, event: FileSystemEvent) -> 
None: + """Execute the actual graph update for a file change.""" + src_path = event.src_path + if isinstance(src_path, bytes): + src_path = src_path.decode() + ingestor = self.updater.ingestor if not isinstance(ingestor, QueryProtocol): logger.warning(logs.WATCHER_SKIP_NO_QUERY) @@ -73,18 +206,31 @@ def dispatch(self, event: FileSystemEvent) -> None: path = Path(src_path) relative_path_str = str(path.relative_to(self.updater.repo_path)) + # (H) Only process events that actually change file content + # (H) Skip read-only events like "opened", "closed_no_write" that don't modify the file + relevant_events = { + EventType.MODIFIED, + EventType.CREATED, + EventType.DELETED, # (H) watchdog deletion event + } + if event.event_type not in relevant_events: + return + logger.warning( logs.CHANGE_DETECTED.format(event_type=event.event_type, path=path) ) - # (H) Step 1 + # (H) Step 1: Delete existing nodes for this file path + # (H) Delete Module node and its children (for code files) ingestor.execute_write(CYPHER_DELETE_MODULE, {KEY_PATH: relative_path_str}) + # (H) Delete File node (for all files including non-code like .md, .json) + ingestor.execute_write(CYPHER_DELETE_FILE, {KEY_PATH: relative_path_str}) logger.debug(logs.DELETION_QUERY.format(path=relative_path_str)) - # (H) Step 2 + # (H) Step 2: Clear in-memory state self.updater.remove_file_from_state(path) - # (H) Step 3 + # (H) Step 3: Re-parse code files and create File nodes for ALL files if event.event_type in (EventType.MODIFIED, EventType.CREATED): lang_config = get_language_spec(path.suffix) if ( @@ -101,18 +247,28 @@ def dispatch(self, event: FileSystemEvent) -> None: root_node, language = result self.updater.ast_cache[path] = (root_node, language) + # (H) Create File node for ALL files (code and non-code like .md, .json, etc.) 
+ self.updater.factory.structure_processor.process_generic_file( + path, path.name + ) + # (H) Step 4 logger.info(logs.RECALC_CALLS) ingestor.execute_write(CYPHER_DELETE_CALLS) self.updater._process_function_calls() - # (H) Step 5 + # (H) Step 5: Flush changes to database self.updater.ingestor.flush_all() logger.success(logs.GRAPH_UPDATED.format(name=path.name)) def start_watcher( - repo_path: str, host: str, port: int, batch_size: int | None = None + repo_path: str, + host: str, + port: int, + batch_size: int | None = None, + debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS, + max_wait_seconds: float = DEFAULT_MAX_WAIT_SECONDS, ) -> None: repo_path_obj = Path(repo_path).resolve() parsers, queries = load_parsers() @@ -123,11 +279,27 @@ def start_watcher( host=host, port=port, batch_size=effective_batch_size, + username=settings.MEMGRAPH_USERNAME, + password=settings.MEMGRAPH_PASSWORD, ) as ingestor: - _run_watcher_loop(ingestor, repo_path_obj, parsers, queries) + _run_watcher_loop( + ingestor, + repo_path_obj, + parsers, + queries, + debounce_seconds, + max_wait_seconds, + ) -def _run_watcher_loop(ingestor, repo_path_obj, parsers, queries): +def _run_watcher_loop( + ingestor, + repo_path_obj, + parsers, + queries, + debounce_seconds: float, + max_wait_seconds: float, +): updater = GraphUpdater(ingestor, repo_path_obj, parsers, queries) # (H) Initial full scan builds the complete context for real-time updates @@ -135,7 +307,11 @@ def _run_watcher_loop(ingestor, repo_path_obj, parsers, queries): updater.run() logger.success(logs.INITIAL_SCAN_DONE) - event_handler = CodeChangeEventHandler(updater) + event_handler = CodeChangeEventHandler( + updater, + debounce_seconds=debounce_seconds, + max_wait_seconds=max_wait_seconds, + ) observer = Observer() observer.schedule(event_handler, str(repo_path_obj), recursive=True) observer.start() @@ -157,6 +333,12 @@ def _validate_positive_int(value: int | None) -> int | None: return value +def _validate_non_negative_float(value: 
float) -> float: + if value < 0: + raise typer.BadParameter(te.INVALID_NON_NEGATIVE_FLOAT.format(value=value)) + return value + + def main( repo_path: Annotated[str, typer.Argument(help=ch.HELP_REPO_PATH_WATCH)], host: Annotated[ @@ -172,11 +354,62 @@ def main( callback=_validate_positive_int, ), ] = None, + debounce: Annotated[ + float, + typer.Option( + "--debounce", + "-d", + help=ch.HELP_DEBOUNCE, + callback=_validate_non_negative_float, + ), + ] = DEFAULT_DEBOUNCE_SECONDS, + max_wait: Annotated[ + float, + typer.Option( + "--max-wait", + "-m", + help=ch.HELP_MAX_WAIT, + callback=_validate_non_negative_float, + ), + ] = DEFAULT_MAX_WAIT_SECONDS, ) -> None: + """ + Watch a repository for file changes and update the knowledge graph in real-time. + + The watcher uses a hybrid debouncing strategy to efficiently handle rapid file saves: + + - DEBOUNCE: After a file change, waits for a quiet period before processing. + This batches rapid saves into a single update. + + - MAX_WAIT: Ensures updates happen within a maximum time window, even during + continuous editing. Prevents indefinite delays. 
+ + Examples: + + # Default settings (5s debounce, 30s max wait) + python realtime_updater.py /path/to/repo + + # More aggressive batching for background monitoring + python realtime_updater.py /path/to/repo --debounce 10 --max-wait 60 + + # Quick feedback for demos + python realtime_updater.py /path/to/repo --debounce 2 --max-wait 10 + + # Disable debouncing (legacy behavior) + python realtime_updater.py /path/to/repo --debounce 0 + """ logger.remove() logger.add(sys.stdout, format=REALTIME_LOGGER_FORMAT, level=LOG_LEVEL_INFO) logger.info(logs.LOGGER_CONFIGURED) - start_watcher(repo_path, host, port, batch_size) + + # (H) Ensure max_wait is not smaller than debounce when both are enabled (raise it to debounce) + if debounce > 0 and max_wait > 0 and max_wait < debounce: + logger.warning( + logs.DEBOUNCE_MAX_WAIT_ADJUSTED.format(max_wait=max_wait, debounce=debounce) + ) + max_wait = debounce + + start_watcher(repo_path, host, port, batch_size, debounce, max_wait) if __name__ == "__main__": diff --git a/scripts/hooks/generate_readme.py b/scripts/hooks/generate_readme.py index 88394ff55..51d6bbeec 100644 --- a/scripts/hooks/generate_readme.py +++ b/scripts/hooks/generate_readme.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import hashlib import subprocess import sys from pathlib import Path @@ -6,6 +7,8 @@ repo_root = Path(__file__).parent.parent.parent readme_path = repo_root / "README.md" +before = hashlib.sha256(readme_path.read_bytes()).hexdigest() + result = subprocess.run( ["uv", "run", "python", "scripts/generate_readme.py"], check=False, @@ -18,5 +21,9 @@ sys.stderr.write(result.stderr) sys.exit(result.returncode) -subprocess.run(["git", "add", "README.md"], cwd=repo_root, check=True) +after = hashlib.sha256(readme_path.read_bytes()).hexdigest() + +if before != after: + subprocess.run(["git", "add", "README.md"], cwd=repo_root, check=True) + sys.exit(1) sys.exit(0) diff --git a/scripts/release.sh b/scripts/release.sh new file mode 100755 index 000000000..eea3f351a --- /dev/null +++ 
b/scripts/release.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Local release: sync server.json to the pyproject version, then build, verify, +# and publish that version to PyPI and create the matching git tag and GitHub +# Release. Use this when the GitHub Actions publish workflow is unavailable +# (e.g. billing disabled). +# +# Credentials: twine prompts for a PyPI token (username __token__). To avoid the +# prompt, export TWINE_USERNAME=__token__ and TWINE_PASSWORD=pypi-... or set up +# ~/.pypirc beforehand. + +VERSION=$(grep -E '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/') +TAG="v${VERSION}" + +echo "==> Releasing ${TAG}" + +if [ -n "$(git status --porcelain)" ]; then + echo "Error: working tree is not clean. Commit or stash changes first." >&2 + exit 1 +fi + +if git rev-parse "${TAG}" >/dev/null 2>&1; then + echo "Error: tag ${TAG} already exists. Bump the version in pyproject.toml first." >&2 + exit 1 +fi + +echo "==> Syncing server.json to ${VERSION}" +perl -i -pe 's/"version": "[^"]*"/"version": "'"${VERSION}"'"/g' server.json +if [ -n "$(git status --porcelain server.json)" ]; then + git commit -m "chore: sync server.json version to ${VERSION}" server.json +fi + +echo "==> Building distributions" +rm -rf dist/ +uv build + +echo "==> Checking distributions" +uvx twine check dist/* + +echo "==> Uploading to PyPI" +uvx twine upload dist/* + +echo "==> Tagging and creating GitHub Release" +git tag "${TAG}" +git push origin "${TAG}" +# Note: this fires the publish.yml workflow, which will fail harmlessly while +# Actions billing is unavailable. PyPI is already published by the step above. 
+gh release create "${TAG}" --generate-notes --target main + +echo "==> Released ${TAG} at https://pypi.org/project/code-graph-rag/${VERSION}/" diff --git a/server.json b/server.json new file mode 100644 index 000000000..4827da69b --- /dev/null +++ b/server.json @@ -0,0 +1,78 @@ +{ + "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json", + "name": "io.github.vitali87/code-graph-rag", + "title": "Code-Graph-RAG", + "description": "Graph-based RAG system for multi-language codebases. Parse, index, query, and edit code using knowledge graphs and natural language.", + "websiteUrl": "https://code-graph-rag.com", + "repository": { + "url": "https://codeberg.org/vitali87/code-graph-rag", + "source": "github" + }, + "version": "0.0.187", + "packages": [ + { + "registryType": "pypi", + "registryBaseUrl": "https://pypi.org", + "identifier": "code-graph-rag", + "version": "0.0.187", + "runtimeHint": "uvx", + "transport": { + "type": "stdio" + }, + "packageArguments": [ + { + "type": "positional", + "value": "mcp-server" + } + ], + "environmentVariables": [ + { + "name": "ORCHESTRATOR_PROVIDER", + "description": "LLM provider for the orchestrator agent (openai, anthropic, google, azure, cohere, ollama)", + "default": "anthropic" + }, + { + "name": "ORCHESTRATOR_MODEL", + "description": "Model name for the orchestrator agent", + "default": "claude-sonnet-4-20250514" + }, + { + "name": "ORCHESTRATOR_API_KEY", + "description": "API key for the orchestrator LLM provider", + "isRequired": true, + "isSecret": true + }, + { + "name": "CYPHER_PROVIDER", + "description": "LLM provider for Cypher query generation (openai, anthropic, google, azure, cohere, ollama)", + "default": "anthropic" + }, + { + "name": "CYPHER_MODEL", + "description": "Model name for Cypher query generation", + "default": "claude-sonnet-4-20250514" + }, + { + "name": "CYPHER_API_KEY", + "description": "API key for the Cypher LLM provider", + "isRequired": true, + "isSecret": 
true + }, + { + "name": "MEMGRAPH_HOST", + "description": "Hostname of the Memgraph database", + "default": "localhost" + }, + { + "name": "MEMGRAPH_PORT", + "description": "Port of the Memgraph database", + "default": "7687" + }, + { + "name": "TARGET_REPO_PATH", + "description": "Path to the repository to analyze (auto-detected from working directory if not set)" + } + ] + } + ] +} diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 000000000..796dc31c5 --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1,13 @@ +sonar.projectKey=vitali87_code-graph-rag +sonar.organization=vitali87 +sonar.projectName=code-graph-rag + +sonar.sources=codebase_rag +sonar.tests=codebase_rag/tests +sonar.exclusions=**/__pycache__/**,**/*.pyc,codebase_rag/tests/** +sonar.security.exclusions=codebase_rag/tests/** + +sonar.python.version=3.12 +sonar.python.coverage.reportPaths=coverage.xml + +sonar.sourceEncoding=UTF-8 diff --git a/uv.lock b/uv.lock index aa1977b86..5b81e297c 100644 --- a/uv.lock +++ b/uv.lock @@ -19,6 +19,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/78/eb55fabaab41abc53f52c0918a9a8c0f747807e5306273f51120fd695957/ag_ui_protocol-0.1.10-py3-none-any.whl", hash = "sha256:c81e6981f30aabdf97a7ee312bfd4df0cd38e718d9fc10019c7d438128b93ab5", size = 7889, upload-time = "2025-11-06T15:17:15.325Z" }, ] +[[package]] +name = "aiofile" +version = "3.11.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "caio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/48/41/2fea7e193e061ce54eacc3b7bc0e6a99e4fcff43c78cf0a76dd781ed8334/aiofile-3.11.1.tar.gz", hash = "sha256:1f91912c6643d2a4e49ca4ae3514f0bf3867ce948a36d99a6411b8f4755f4cf9", size = 19342, upload-time = "2026-05-16T08:18:33.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/cd/0d76dfc5de72bde52f55f53e925c7d152d9c7906634ec1e0cbc7e8d4ad93/aiofile-3.11.1-py3-none-any.whl", hash = 
"sha256:ce77d14ac07f77bc2b757834a5c129321f3f705c474593deed5ab209079a52c9", size = 20446, upload-time = "2026-05-16T08:18:32.051Z" }, +] + [[package]] name = "aiohappyeyeballs" version = "2.6.1" @@ -30,7 +42,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.13.3" +version = "3.14.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -39,78 +51,93 @@ dependencies = [ { name = "frozenlist" }, { name = "multidict" }, { name = "propcache" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556, upload-time = "2026-01-03T17:33:05.204Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, - { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, - { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, - { url = 
"https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, - { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, - { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020, upload-time = "2026-01-03T17:30:26Z" }, - { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, - { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, - { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, - { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, - { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, - { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = "2026-01-03T17:30:36.864Z" }, - { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, - { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = 
"2026-01-03T17:30:41.081Z" }, - { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, - { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, - { url = "https://files.pythonhosted.org/packages/97/8a/12ca489246ca1faaf5432844adbfce7ff2cc4997733e0af120869345643a/aiohttp-3.13.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5dff64413671b0d3e7d5918ea490bdccb97a4ad29b3f311ed423200b2203e01c", size = 734190, upload-time = "2026-01-03T17:30:45.832Z" }, - { url = "https://files.pythonhosted.org/packages/32/08/de43984c74ed1fca5c014808963cc83cb00d7bb06af228f132d33862ca76/aiohttp-3.13.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:87b9aab6d6ed88235aa2970294f496ff1a1f9adcd724d800e9b952395a80ffd9", size = 491783, upload-time = "2026-01-03T17:30:47.466Z" }, - { url = "https://files.pythonhosted.org/packages/17/f8/8dd2cf6112a5a76f81f81a5130c57ca829d101ad583ce57f889179accdda/aiohttp-3.13.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:425c126c0dc43861e22cb1c14ba4c8e45d09516d0a3ae0a3f7494b79f5f233a3", size = 490704, upload-time = "2026-01-03T17:30:49.373Z" }, - { url = "https://files.pythonhosted.org/packages/6d/40/a46b03ca03936f832bc7eaa47cfbb1ad012ba1be4790122ee4f4f8cba074/aiohttp-3.13.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f9120f7093c2a32d9647abcaf21e6ad275b4fbec5b55969f978b1a97c7c86bf", size = 1720652, upload-time = "2026-01-03T17:30:50.974Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/7e/917fe18e3607af92657e4285498f500dca797ff8c918bd7d90b05abf6c2a/aiohttp-3.13.3-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:697753042d57f4bf7122cab985bf15d0cef23c770864580f5af4f52023a56bd6", size = 1692014, upload-time = "2026-01-03T17:30:52.729Z" }, - { url = "https://files.pythonhosted.org/packages/71/b6/cefa4cbc00d315d68973b671cf105b21a609c12b82d52e5d0c9ae61d2a09/aiohttp-3.13.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6de499a1a44e7de70735d0b39f67c8f25eb3d91eb3103be99ca0fa882cdd987d", size = 1759777, upload-time = "2026-01-03T17:30:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/fb/e3/e06ee07b45e59e6d81498b591fc589629be1553abb2a82ce33efe2a7b068/aiohttp-3.13.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:37239e9f9a7ea9ac5bf6b92b0260b01f8a22281996da609206a84df860bc1261", size = 1861276, upload-time = "2026-01-03T17:30:56.512Z" }, - { url = "https://files.pythonhosted.org/packages/7c/24/75d274228acf35ceeb2850b8ce04de9dd7355ff7a0b49d607ee60c29c518/aiohttp-3.13.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f76c1e3fe7d7c8afad7ed193f89a292e1999608170dcc9751a7462a87dfd5bc0", size = 1743131, upload-time = "2026-01-03T17:30:58.256Z" }, - { url = "https://files.pythonhosted.org/packages/04/98/3d21dde21889b17ca2eea54fdcff21b27b93f45b7bb94ca029c31ab59dc3/aiohttp-3.13.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fc290605db2a917f6e81b0e1e0796469871f5af381ce15c604a3c5c7e51cb730", size = 1556863, upload-time = "2026-01-03T17:31:00.445Z" }, - { url = "https://files.pythonhosted.org/packages/9e/84/da0c3ab1192eaf64782b03971ab4055b475d0db07b17eff925e8c93b3aa5/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:4021b51936308aeea0367b8f006dc999ca02bc118a0cc78c303f50a2ff6afb91", size = 1682793, upload-time = "2026-01-03T17:31:03.024Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0f/5802ada182f575afa02cbd0ec5180d7e13a402afb7c2c03a9aa5e5d49060/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:49a03727c1bba9a97d3e93c9f93ca03a57300f484b6e935463099841261195d3", size = 1716676, upload-time = "2026-01-03T17:31:04.842Z" }, - { url = "https://files.pythonhosted.org/packages/3f/8c/714d53bd8b5a4560667f7bbbb06b20c2382f9c7847d198370ec6526af39c/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3d9908a48eb7416dc1f4524e69f1d32e5d90e3981e4e37eb0aa1cd18f9cfa2a4", size = 1733217, upload-time = "2026-01-03T17:31:06.868Z" }, - { url = "https://files.pythonhosted.org/packages/7d/79/e2176f46d2e963facea939f5be2d26368ce543622be6f00a12844d3c991f/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2712039939ec963c237286113c68dbad80a82a4281543f3abf766d9d73228998", size = 1552303, upload-time = "2026-01-03T17:31:08.958Z" }, - { url = "https://files.pythonhosted.org/packages/ab/6a/28ed4dea1759916090587d1fe57087b03e6c784a642b85ef48217b0277ae/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7bfdc049127717581866fa4708791220970ce291c23e28ccf3922c700740fdc0", size = 1763673, upload-time = "2026-01-03T17:31:10.676Z" }, - { url = "https://files.pythonhosted.org/packages/e8/35/4a3daeb8b9fab49240d21c04d50732313295e4bd813a465d840236dd0ce1/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8057c98e0c8472d8846b9c79f56766bcc57e3e8ac7bfd510482332366c56c591", size = 1721120, upload-time = "2026-01-03T17:31:12.575Z" }, - { url = "https://files.pythonhosted.org/packages/bc/9f/d643bb3c5fb99547323e635e251c609fbbc660d983144cfebec529e09264/aiohttp-3.13.3-cp313-cp313-win32.whl", hash = "sha256:1449ceddcdbcf2e0446957863af03ebaaa03f94c090f945411b61269e2cb5daf", size = 427383, upload-time = 
"2026-01-03T17:31:14.382Z" }, - { url = "https://files.pythonhosted.org/packages/4e/f1/ab0395f8a79933577cdd996dd2f9aa6014af9535f65dddcf88204682fe62/aiohttp-3.13.3-cp313-cp313-win_amd64.whl", hash = "sha256:693781c45a4033d31d4187d2436f5ac701e7bbfe5df40d917736108c1cc7436e", size = 453899, upload-time = "2026-01-03T17:31:15.958Z" }, - { url = "https://files.pythonhosted.org/packages/99/36/5b6514a9f5d66f4e2597e40dea2e3db271e023eb7a5d22defe96ba560996/aiohttp-3.13.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:ea37047c6b367fd4bd632bff8077449b8fa034b69e812a18e0132a00fae6e808", size = 737238, upload-time = "2026-01-03T17:31:17.909Z" }, - { url = "https://files.pythonhosted.org/packages/f7/49/459327f0d5bcd8c6c9ca69e60fdeebc3622861e696490d8674a6d0cb90a6/aiohttp-3.13.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6fc0e2337d1a4c3e6acafda6a78a39d4c14caea625124817420abceed36e2415", size = 492292, upload-time = "2026-01-03T17:31:19.919Z" }, - { url = "https://files.pythonhosted.org/packages/e8/0b/b97660c5fd05d3495b4eb27f2d0ef18dc1dc4eff7511a9bf371397ff0264/aiohttp-3.13.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c685f2d80bb67ca8c3837823ad76196b3694b0159d232206d1e461d3d434666f", size = 493021, upload-time = "2026-01-03T17:31:21.636Z" }, - { url = "https://files.pythonhosted.org/packages/54/d4/438efabdf74e30aeceb890c3290bbaa449780583b1270b00661126b8aae4/aiohttp-3.13.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e377758516d262bde50c2584fc6c578af272559c409eecbdd2bae1601184d6", size = 1717263, upload-time = "2026-01-03T17:31:23.296Z" }, - { url = "https://files.pythonhosted.org/packages/71/f2/7bddc7fd612367d1459c5bcf598a9e8f7092d6580d98de0e057eb42697ad/aiohttp-3.13.3-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:34749271508078b261c4abb1767d42b8d0c0cc9449c73a4df494777dc55f0687", size = 1669107, upload-time = "2026-01-03T17:31:25.334Z" }, - { url = 
"https://files.pythonhosted.org/packages/00/5a/1aeaecca40e22560f97610a329e0e5efef5e0b5afdf9f857f0d93839ab2e/aiohttp-3.13.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:82611aeec80eb144416956ec85b6ca45a64d76429c1ed46ae1b5f86c6e0c9a26", size = 1760196, upload-time = "2026-01-03T17:31:27.394Z" }, - { url = "https://files.pythonhosted.org/packages/f8/f8/0ff6992bea7bd560fc510ea1c815f87eedd745fe035589c71ce05612a19a/aiohttp-3.13.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2fff83cfc93f18f215896e3a190e8e5cb413ce01553901aca925176e7568963a", size = 1843591, upload-time = "2026-01-03T17:31:29.238Z" }, - { url = "https://files.pythonhosted.org/packages/e3/d1/e30e537a15f53485b61f5be525f2157da719819e8377298502aebac45536/aiohttp-3.13.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bbe7d4cecacb439e2e2a8a1a7b935c25b812af7a5fd26503a66dadf428e79ec1", size = 1720277, upload-time = "2026-01-03T17:31:31.053Z" }, - { url = "https://files.pythonhosted.org/packages/84/45/23f4c451d8192f553d38d838831ebbc156907ea6e05557f39563101b7717/aiohttp-3.13.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b928f30fe49574253644b1ca44b1b8adbd903aa0da4b9054a6c20fc7f4092a25", size = 1548575, upload-time = "2026-01-03T17:31:32.87Z" }, - { url = "https://files.pythonhosted.org/packages/6a/ed/0a42b127a43712eda7807e7892c083eadfaf8429ca8fb619662a530a3aab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7b5e8fe4de30df199155baaf64f2fcd604f4c678ed20910db8e2c66dc4b11603", size = 1679455, upload-time = "2026-01-03T17:31:34.76Z" }, - { url = "https://files.pythonhosted.org/packages/2e/b5/c05f0c2b4b4fe2c9d55e73b6d3ed4fd6c9dc2684b1d81cbdf77e7fad9adb/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:8542f41a62bcc58fc7f11cf7c90e0ec324ce44950003feb70640fc2a9092c32a", size = 1687417, upload-time = 
"2026-01-03T17:31:36.699Z" }, - { url = "https://files.pythonhosted.org/packages/c9/6b/915bc5dad66aef602b9e459b5a973529304d4e89ca86999d9d75d80cbd0b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5e1d8c8b8f1d91cd08d8f4a3c2b067bfca6ec043d3ff36de0f3a715feeedf926", size = 1729968, upload-time = "2026-01-03T17:31:38.622Z" }, - { url = "https://files.pythonhosted.org/packages/11/3b/e84581290a9520024a08640b63d07673057aec5ca548177a82026187ba73/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:90455115e5da1c3c51ab619ac57f877da8fd6d73c05aacd125c5ae9819582aba", size = 1545690, upload-time = "2026-01-03T17:31:40.57Z" }, - { url = "https://files.pythonhosted.org/packages/f5/04/0c3655a566c43fd647c81b895dfe361b9f9ad6d58c19309d45cff52d6c3b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:042e9e0bcb5fba81886c8b4fbb9a09d6b8a00245fd8d88e4d989c1f96c74164c", size = 1746390, upload-time = "2026-01-03T17:31:42.857Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/71165b26978f719c3419381514c9690bd5980e764a09440a10bb816ea4ab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2eb752b102b12a76ca02dff751a801f028b4ffbbc478840b473597fc91a9ed43", size = 1702188, upload-time = "2026-01-03T17:31:44.984Z" }, - { url = "https://files.pythonhosted.org/packages/29/a7/cbe6c9e8e136314fa1980da388a59d2f35f35395948a08b6747baebb6aa6/aiohttp-3.13.3-cp314-cp314-win32.whl", hash = "sha256:b556c85915d8efaed322bf1bdae9486aa0f3f764195a0fb6ee962e5c71ef5ce1", size = 433126, upload-time = "2026-01-03T17:31:47.463Z" }, - { url = "https://files.pythonhosted.org/packages/de/56/982704adea7d3b16614fc5936014e9af85c0e34b58f9046655817f04306e/aiohttp-3.13.3-cp314-cp314-win_amd64.whl", hash = "sha256:9bf9f7a65e7aa20dd764151fb3d616c81088f91f8df39c3893a536e279b4b984", size = 459128, upload-time = "2026-01-03T17:31:49.2Z" }, - { url = 
"https://files.pythonhosted.org/packages/6c/2a/3c79b638a9c3d4658d345339d22070241ea341ed4e07b5ac60fb0f418003/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:05861afbbec40650d8a07ea324367cb93e9e8cc7762e04dd4405df99fa65159c", size = 769512, upload-time = "2026-01-03T17:31:51.134Z" }, - { url = "https://files.pythonhosted.org/packages/29/b9/3e5014d46c0ab0db8707e0ac2711ed28c4da0218c358a4e7c17bae0d8722/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2fc82186fadc4a8316768d61f3722c230e2c1dcab4200d52d2ebdf2482e47592", size = 506444, upload-time = "2026-01-03T17:31:52.85Z" }, - { url = "https://files.pythonhosted.org/packages/90/03/c1d4ef9a054e151cd7839cdc497f2638f00b93cbe8043983986630d7a80c/aiohttp-3.13.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0add0900ff220d1d5c5ebbf99ed88b0c1bbf87aa7e4262300ed1376a6b13414f", size = 510798, upload-time = "2026-01-03T17:31:54.91Z" }, - { url = "https://files.pythonhosted.org/packages/ea/76/8c1e5abbfe8e127c893fe7ead569148a4d5a799f7cf958d8c09f3eedf097/aiohttp-3.13.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:568f416a4072fbfae453dcf9a99194bbb8bdeab718e08ee13dfa2ba0e4bebf29", size = 1868835, upload-time = "2026-01-03T17:31:56.733Z" }, - { url = "https://files.pythonhosted.org/packages/8e/ac/984c5a6f74c363b01ff97adc96a3976d9c98940b8969a1881575b279ac5d/aiohttp-3.13.3-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:add1da70de90a2569c5e15249ff76a631ccacfe198375eead4aadf3b8dc849dc", size = 1720486, upload-time = "2026-01-03T17:31:58.65Z" }, - { url = "https://files.pythonhosted.org/packages/b2/9a/b7039c5f099c4eb632138728828b33428585031a1e658d693d41d07d89d1/aiohttp-3.13.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b47b7ba335d2e9b1239fa571131a87e2d8ec96b333e68b2a305e7a98b0bae2", size = 1847951, upload-time = 
"2026-01-03T17:32:00.989Z" }, - { url = "https://files.pythonhosted.org/packages/3c/02/3bec2b9a1ba3c19ff89a43a19324202b8eb187ca1e928d8bdac9bbdddebd/aiohttp-3.13.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3dd4dce1c718e38081c8f35f323209d4c1df7d4db4bab1b5c88a6b4d12b74587", size = 1941001, upload-time = "2026-01-03T17:32:03.122Z" }, - { url = "https://files.pythonhosted.org/packages/37/df/d879401cedeef27ac4717f6426c8c36c3091c6e9f08a9178cc87549c537f/aiohttp-3.13.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34bac00a67a812570d4a460447e1e9e06fae622946955f939051e7cc895cfab8", size = 1797246, upload-time = "2026-01-03T17:32:05.255Z" }, - { url = "https://files.pythonhosted.org/packages/8d/15/be122de1f67e6953add23335c8ece6d314ab67c8bebb3f181063010795a7/aiohttp-3.13.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a19884d2ee70b06d9204b2727a7b9f983d0c684c650254679e716b0b77920632", size = 1627131, upload-time = "2026-01-03T17:32:07.607Z" }, - { url = "https://files.pythonhosted.org/packages/12/12/70eedcac9134cfa3219ab7af31ea56bc877395b1ac30d65b1bc4b27d0438/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5f8ca7f2bb6ba8348a3614c7918cc4bb73268c5ac2a207576b7afea19d3d9f64", size = 1795196, upload-time = "2026-01-03T17:32:09.59Z" }, - { url = "https://files.pythonhosted.org/packages/32/11/b30e1b1cd1f3054af86ebe60df96989c6a414dd87e27ad16950eee420bea/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:b0d95340658b9d2f11d9697f59b3814a9d3bb4b7a7c20b131df4bcef464037c0", size = 1782841, upload-time = "2026-01-03T17:32:11.445Z" }, - { url = "https://files.pythonhosted.org/packages/88/0d/d98a9367b38912384a17e287850f5695c528cff0f14f791ce8ee2e4f7796/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:a1e53262fd202e4b40b70c3aff944a8155059beedc8a89bba9dc1f9ef06a1b56", size = 1795193, upload-time 
= "2026-01-03T17:32:13.705Z" }, - { url = "https://files.pythonhosted.org/packages/43/a5/a2dfd1f5ff5581632c7f6a30e1744deda03808974f94f6534241ef60c751/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d60ac9663f44168038586cab2157e122e46bdef09e9368b37f2d82d354c23f72", size = 1621979, upload-time = "2026-01-03T17:32:15.965Z" }, - { url = "https://files.pythonhosted.org/packages/fa/f0/12973c382ae7c1cccbc4417e129c5bf54c374dfb85af70893646e1f0e749/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:90751b8eed69435bac9ff4e3d2f6b3af1f57e37ecb0fbeee59c0174c9e2d41df", size = 1822193, upload-time = "2026-01-03T17:32:18.219Z" }, - { url = "https://files.pythonhosted.org/packages/3c/5f/24155e30ba7f8c96918af1350eb0663e2430aad9e001c0489d89cd708ab1/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fc353029f176fd2b3ec6cfc71be166aba1936fe5d73dd1992ce289ca6647a9aa", size = 1769801, upload-time = "2026-01-03T17:32:20.25Z" }, - { url = "https://files.pythonhosted.org/packages/eb/f8/7314031ff5c10e6ece114da79b338ec17eeff3a079e53151f7e9f43c4723/aiohttp-3.13.3-cp314-cp314t-win32.whl", hash = "sha256:2e41b18a58da1e474a057b3d35248d8320029f61d70a37629535b16a0c8f3767", size = 466523, upload-time = "2026-01-03T17:32:22.215Z" }, - { url = "https://files.pythonhosted.org/packages/b4/63/278a98c715ae467624eafe375542d8ba9b4383a016df8fdefe0ae28382a7/aiohttp-3.13.3-cp314-cp314t-win_amd64.whl", hash = "sha256:44531a36aa2264a1860089ffd4dce7baf875ee5a6079d5fb42e261c704ef7344", size = 499694, upload-time = "2026-01-03T17:32:24.546Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/82/78/8ea7308cac6934de8c74a14f3d5f65d1c89287426688be79538d0e5c013d/aiohttp-3.14.1.tar.gz", hash = "sha256:307f2cff90a764d329e77040603fa032db89c5c24fdad50c4c15334cba744035", size = 7955794, upload-time = "2026-06-07T21:09:35.529Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/1d/21/151624b51cd92553d95424daf4bf19f19ce9be9002d19253e7e7ce67197b/aiohttp-3.14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d35143e27778b4bb0fb189562d7f275bff79c62ab8e98459717c0ea617ff2480", size = 757402, upload-time = "2026-06-07T21:06:40.311Z" }, + { url = "https://files.pythonhosted.org/packages/c2/82/280619e0bd7bf2454987e19282616e84762255dd9c8468f62382e8c191f1/aiohttp-3.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bcfb80a2cc36fba2534e5e5b5264dc7ae6fcd9bf15256da3e53d2f499e6fa29d", size = 512310, upload-time = "2026-06-07T21:06:42.207Z" }, + { url = "https://files.pythonhosted.org/packages/55/b2/2aac325583aaa1353045f96dffa586d8a34e8322e14a7ba49cffeb103ab4/aiohttp-3.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27fd7c91e51729b4f7e1577865fa6d34c9adccbc39aabe9000285b48af9f0ec2", size = 512448, upload-time = "2026-06-07T21:06:43.813Z" }, + { url = "https://files.pythonhosted.org/packages/8a/72/a60607cb849faa8af8a356c9329ea2eb6f395d49e82cc82ccba1fd8deb8f/aiohttp-3.14.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:64c567bf9eaf664280116a8688f63016e6b32db2505908e2bdaca1b6438142f2", size = 1766854, upload-time = "2026-06-07T21:06:45.391Z" }, + { url = "https://files.pythonhosted.org/packages/b5/d3/d9fe1c9ec7557ab4d0d82bebaa728c6418f0b93295ec2f4ab015f7710cc7/aiohttp-3.14.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f5e6ff2bdbb8f4cd3fbe41f99e25bbcd58e3bf9f13d3dd31a11e7917251cc77a", size = 1740884, upload-time = "2026-06-07T21:06:47.413Z" }, + { url = "https://files.pythonhosted.org/packages/c1/dc/f2cecfaf9337ba3e63f181500814ff502aa3d00d9c7ec93a9d23d10a27b2/aiohttp-3.14.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f73e01dc37122325caf079982621262f96d74823c179038a82fddfc50359264", size = 1810034, upload-time = 
"2026-06-07T21:06:50.165Z" }, + { url = "https://files.pythonhosted.org/packages/66/d7/2ff65c5e65c0d7476daf7e15c032e0805e36811185b9623e3238ad6c763e/aiohttp-3.14.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:bb2c0c80d431c0d03f2c7dbf125150fedd4f0de17366a7ca33f7ccb822391842", size = 1904054, upload-time = "2026-06-07T21:06:52.035Z" }, + { url = "https://files.pythonhosted.org/packages/20/9c/d445818389df371f56d141d881153ba23183c4735a03f7356ffb43f7757d/aiohttp-3.14.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e6fc1a85fa7194a1a7d19f44e8609180f4a8eb5fa4c7ed8b4355f080fad235c", size = 1790278, upload-time = "2026-06-07T21:06:54.049Z" }, + { url = "https://files.pythonhosted.org/packages/4d/aa/bf04cb4d865fc6101c2229a294ad744973b72e513fdc5a6b791e6983d72a/aiohttp-3.14.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:686b6c0d3911ec387b444ddf5dc62fb7f7c0a7d5186a7861626496a5ab4aff95", size = 1591795, upload-time = "2026-06-07T21:06:55.911Z" }, + { url = "https://files.pythonhosted.org/packages/dc/b4/4dac0038960427ba832f6609dfb4ea5437d7fd80c72001b9e48f834f428b/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c6fa4dc7ad6f8109c70bb1499e589f76b0b792baf39f9b017eb92c8a81d0a199", size = 1728397, upload-time = "2026-06-07T21:06:57.777Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f9/7cd4e8ad7aa3b75f17d56bb5498dd604a93d4e6eece822ba0568c413fff0/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:87a5eea1b2a5e21e1ebdbb33ad4165359189327e63fc4e4894693e7f821ac817", size = 1766504, upload-time = "2026-06-07T21:07:00.009Z" }, + { url = "https://files.pythonhosted.org/packages/f9/df/fc01d9fcad0f73fed3f3d361f1f94f975947b50dff82919f6dc2bf4316cc/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c1421eb01d4fd608d88cc8290211d177a58532b55ad94076fb349c5bf467f0a", size = 1777806, upload-time = 
"2026-06-07T21:07:02.064Z" }, + { url = "https://files.pythonhosted.org/packages/41/09/47e2d090bddcc8fb4ccb4c314aadc32d7c5d9bb55f50f6ad1c92fc15d501/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:34b257ec41345c1e8f2df68fa908a7952f5de932723871eb633ecbbff396c9a4", size = 1580707, upload-time = "2026-06-07T21:07:03.942Z" }, + { url = "https://files.pythonhosted.org/packages/3d/36/f1a4ce904ae0b6930cfe9afc96d0896f7ec1a620c400405d63783bb95a9c/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:de538791a80e5d862addbc183f70f0158ac9b9bb872bb147f1fd2a683691e087", size = 1798121, upload-time = "2026-06-07T21:07:05.987Z" }, + { url = "https://files.pythonhosted.org/packages/70/0a/e0075ce9ca0279ee1d4f0c0b85f54fea02ebc83c3007651a72bece658fec/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f71173be42d3241d428f760122febb748de0623f44308a6f120d0dd9ec572e3", size = 1767580, upload-time = "2026-06-07T21:07:07.873Z" }, + { url = "https://files.pythonhosted.org/packages/3e/61/a0c0a8f327a9c52095cdd8e312391b00d3ed64ab6c72bb5c33d8ec251cf7/aiohttp-3.14.1-cp312-cp312-win32.whl", hash = "sha256:ec8dc383ee57ea3e883477dcca3f11b65d58199f1080acaf4cd6ad9a99698be4", size = 452771, upload-time = "2026-06-07T21:07:09.669Z" }, + { url = "https://files.pythonhosted.org/packages/df/d9/ea367c75f16ac9c6cdc8febb25e8318fa21a2b1bc8d6514d4b2d890bface/aiohttp-3.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:2aa92c87868cd13674989f9ee83e5f9f7ea4237589b728048e1f0c8f6caa3271", size = 479873, upload-time = "2026-06-07T21:07:11.538Z" }, + { url = "https://files.pythonhosted.org/packages/03/64/8d96784a7851156db8a4c6c3f6f91042fdf39fb15a4cc38c8b3c14833c45/aiohttp-3.14.1-cp312-cp312-win_arm64.whl", hash = "sha256:2c840c90759922cb5e6dda94596e079a30fb5a5ba548e7e0dc00574703940847", size = 448073, upload-time = "2026-06-07T21:07:13.637Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/97/bd137012dd97e1649162b099135a80e1fd59aaa807b2430fc448d1029aff/aiohttp-3.14.1-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:b3a03285a7f9c7b016324574a6d92a1c895da6b978cb8f1deee3ac72bc6da178", size = 506882, upload-time = "2026-06-07T21:07:15.501Z" }, + { url = "https://files.pythonhosted.org/packages/ef/79/e5cc690e9d922a66887ceeaca53a8ffd5a7b0be3816142b7abc433742d89/aiohttp-3.14.1-cp313-cp313-android_21_x86_64.whl", hash = "sha256:2a73f487ab8ef5abbb24b7aa9b73e98eaba9e9e031804ff2416f02eca315ccaf", size = 515270, upload-time = "2026-06-07T21:07:17.53Z" }, + { url = "https://files.pythonhosted.org/packages/fe/22/a73ccbf9dbd6e26dda0b24d5fd5db7da92ee3383a79f47677ffb834c5c5b/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:915fbb7b41b115192259f8c9ae58f3ddc444d2b5579917270211858e606a4afd", size = 485841, upload-time = "2026-06-07T21:07:19.555Z" }, + { url = "https://files.pythonhosted.org/packages/3b/b9/57ed8eaf596321c2ad747bd480fb1700dbd7177c60dfc9e4c187f629662e/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:7fb4bdf95b0561a79f259f9d28fbc109728c5ee7f27aff6391f0ca703a329abe", size = 492088, upload-time = "2026-06-07T21:07:21.581Z" }, + { url = "https://files.pythonhosted.org/packages/78/c0/5ebe5270a7c140d7c6f79dcb018640225f14d406c149e4eec04a7d82fe71/aiohttp-3.14.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1b9748363260121d2927704f5d4fc498150669ca3ae93625986ee89c8f80dcd4", size = 501564, upload-time = "2026-06-07T21:07:23.388Z" }, + { url = "https://files.pythonhosted.org/packages/75/7f/8cdaa24fc7983865e0915153b96a9ac5bcdd3548d64c5a27d17cecccad2d/aiohttp-3.14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:86a6dab78b0e43e2897a3bbe15745aa60dc5423ca437b7b0b164c069bf91b876", size = 751998, upload-time = "2026-06-07T21:07:25.046Z" }, + { url = 
"https://files.pythonhosted.org/packages/b2/f4/c4227aacfacc5cb0cc2d119b65301d177912a6842cd64e120c47af76064f/aiohttp-3.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4dfd6e47d3c44c2279907607f73a4240b88c69eb8b90da7e2441a8045dfd21da", size = 510918, upload-time = "2026-06-07T21:07:27.28Z" }, + { url = "https://files.pythonhosted.org/packages/ab/01/a2d5f96cd4e74424864d30bc0a7e44d0a12dacdcfa91b5b2d1bd3dca6bf3/aiohttp-3.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:317acd9f8602858dc7d59679812c376c7f0b97bcbbf16e0d6237f54141d8a8a6", size = 508657, upload-time = "2026-06-07T21:07:29.252Z" }, + { url = "https://files.pythonhosted.org/packages/e8/ed/3c0fb5c500fdd8e7ebc10d1889c04384fffa1a9163eac1356088ca9da1b1/aiohttp-3.14.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd869c427324e5cb15195793de951295710db28be7d818247f3097b4ab5d4b96", size = 1757907, upload-time = "2026-06-07T21:07:31.03Z" }, + { url = "https://files.pythonhosted.org/packages/0b/ab/d4c924d9bd5be3050c226612413ce68cb54c70d2c31b661bfc8d9a5b6a70/aiohttp-3.14.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:93b032b5ec3255473c143627d21a69ac74ae12f7f33974cb587c564d11b1066f", size = 1737565, upload-time = "2026-06-07T21:07:33.031Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/37326821ff779084020cdc33224d20b19f42f4183a500ff92022a739eda7/aiohttp-3.14.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f234b4deb12f3ad59127e037bc57c40c21e45b45282df7d3a55a0f409f595296", size = 1799018, upload-time = "2026-06-07T21:07:35.003Z" }, + { url = "https://files.pythonhosted.org/packages/b3/4f/6e947ba73e4ce09070761c05ed3a8ceb7c21f5e46798671d8b2aac0e4626/aiohttp-3.14.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:9af6779bfb46abf124068327abcdf9ce95c9ef8287a3e8da76ccf2d0f16c28fa", size = 
1894416, upload-time = "2026-06-07T21:07:36.956Z" }, + { url = "https://files.pythonhosted.org/packages/9d/6e/dbf1d0625dc711fb2851f4f3c3055c39ed58bae92082d8c627dbe6013736/aiohttp-3.14.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:faccab372e66bc76d5731525e7f1143c922271725b9d38c9f97edcc66266b451", size = 1783881, upload-time = "2026-06-07T21:07:39.063Z" }, + { url = "https://files.pythonhosted.org/packages/44/c2/5e25098a67268ed369483ae7d1a58bd0a13d03aab860d2a0e4a6eb25b046/aiohttp-3.14.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f380468b09d2a81633ee863b0ec5648d364bd17bb8ecfb8c2f387f7ac1faf42c", size = 1587572, upload-time = "2026-06-07T21:07:41.058Z" }, + { url = "https://files.pythonhosted.org/packages/2a/bd/cf9cee17e140f942a3de73e658a543aa8fbf35a5fc67a9d2538d52d77f0b/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:97e704dcd26271f5bda3fa07c3ce0fb76d6d3f8659f4baa1a24442cc9ba177ca", size = 1722137, upload-time = "2026-06-07T21:07:43.014Z" }, + { url = "https://files.pythonhosted.org/packages/89/6d/5684f8c59045c96f81a18cefbc1fbbd79d25b88f1c622f2a5c5c08fcb632/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:269b76ac5394092b95bc4a098f4fc6c191c083c3bd12775d1e30e663132f6a09", size = 1755953, upload-time = "2026-06-07T21:07:45.933Z" }, + { url = "https://files.pythonhosted.org/packages/a8/40/35caf3170f8359760740a7d9aa0fff2e344bef98e1d1186f5a0f6dec17e6/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c0b3e614340c889d575451696374c9d17affd54cd607ca0babed8f8c37b9397", size = 1766479, upload-time = "2026-06-07T21:07:48.047Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a1/b0c61e7a137f0d81de49a82023a6df73c3c16d6fefb0f8e4a93d21639002/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:5663ee9257cfa1add7253a7da3035a02f31b6600ec48261585e1800a81533080", size = 1580077, upload-time = 
"2026-06-07T21:07:50.069Z" }, + { url = "https://files.pythonhosted.org/packages/0b/41/194ea4623693009fcefebef7aef63c141754f153e9cd0d39d3b9e36c175c/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:603a2c834142172ffddc054067f5ec0ca65d57a0aa98a71bc81952573208e345", size = 1791688, upload-time = "2026-06-07T21:07:52.106Z" }, + { url = "https://files.pythonhosted.org/packages/ba/45/4de841f005cfe1fd63e2a2fe011262c515e2a62aa6994b15947e7d717ac9/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cb21957bb8aca671c1765e32f58164cf0c50e6bf41c0bbbd16da20732ecaf588", size = 1761094, upload-time = "2026-06-07T21:07:54.113Z" }, + { url = "https://files.pythonhosted.org/packages/e4/ae/dbce10533d3896d544d5053939ed75b7dc31a1b0973d959b1b5ae21028d6/aiohttp-3.14.1-cp313-cp313-win32.whl", hash = "sha256:e509a55f681e6158c20f70f102f9cf61fb20fbc382272bc6d94b7343f2582780", size = 452662, upload-time = "2026-06-07T21:07:56.06Z" }, + { url = "https://files.pythonhosted.org/packages/7b/d9/0bf1a19362c32f06229da5e7ddfcec91f93474d6307f7a2d3135e9c674dc/aiohttp-3.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:1ac8531b638959718e18c2207fbfe297819875da46a740b29dfa29beba64355a", size = 479748, upload-time = "2026-06-07T21:07:58.319Z" }, + { url = "https://files.pythonhosted.org/packages/22/0a/62e7232dc9484fbec112ceb32efb6a624cc7994ec6e2b019286f17c4e8f2/aiohttp-3.14.1-cp313-cp313-win_arm64.whl", hash = "sha256:250d14af67f6b6a1a4a811049b1afa69d61d617fca6bf33149b3ab1a6dbcf7b8", size = 447723, upload-time = "2026-06-07T21:08:00.154Z" }, + { url = "https://files.pythonhosted.org/packages/c4/a1/5fafa04e1ca91ddb47608699d60649c1c6db3cf41c99e78fc4056f9513db/aiohttp-3.14.1-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:7c106c26852ca1c2047c6b80384f17100b4e439af276f21ef3d4e2f450ae7e15", size = 508531, upload-time = "2026-06-07T21:08:02.093Z" }, + { url = 
"https://files.pythonhosted.org/packages/fa/2e/bfa02f699d87ffc86d5959270b28f1cb410add3ccaced8ed2e0b8a5238fc/aiohttp-3.14.1-cp314-cp314-android_24_x86_64.whl", hash = "sha256:20205f7f5ade7aaec9f4b500549bbc071b046453aed72f9c06dcab87896a83e8", size = 514718, upload-time = "2026-06-07T21:08:04.476Z" }, + { url = "https://files.pythonhosted.org/packages/85/a5/9594ad6289eebbc97d167c44213d557807f90e59115caad24de21ad2c3b1/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:62a759436b29e677181a9e76bab8b8f689a29cb9c535f45f7c48c9c830d3f8c3", size = 487918, upload-time = "2026-06-07T21:08:06.377Z" }, + { url = "https://files.pythonhosted.org/packages/b4/61/16a32c36c3c49edec122a3dc811f2057df2f94d3b14aa107c8017d981618/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2964cbf553df4d7a57348da44d961d871895fc1ee4e8c322b2a95612c7b17fba", size = 494014, upload-time = "2026-06-07T21:08:08.263Z" }, + { url = "https://files.pythonhosted.org/packages/9b/89/3ebcf96ed99c05bec9c434aaac6963fd3cbab4a786ae739908a144d9ce44/aiohttp-3.14.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:237651caadc3a59badd39319c54642b5299e9cc98a3a194310e55d5bb9f5e397", size = 502398, upload-time = "2026-06-07T21:08:10.244Z" }, + { url = "https://files.pythonhosted.org/packages/fd/3d/b74870a0c2d40c355928cd5b96c7a11fa821b8a40fc41365e64479b151fb/aiohttp-3.14.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:896e12dfdbbab9d8f7e16d2b28c6769a60126fa92095d1ebf9473d02593a2448", size = 758018, upload-time = "2026-06-07T21:08:12.447Z" }, + { url = "https://files.pythonhosted.org/packages/d3/66/f42f5c984d99e49c6cff5f26f590750f2e2f7ef1fcfb99966ab5be1b632e/aiohttp-3.14.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d03f281ed22579314ba00821ce20115a7c0ac430660b4cc05704a3f818b3e004", size = 512462, upload-time = "2026-06-07T21:08:14.624Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/a7/248e1aebe0c7810b0271e021a0f2a5eb6e78a051885b3c9df49f42a5802d/aiohttp-3.14.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07eabb979d236335fed927e137a928c9adfb7df3b9ec7aa31726f133a62be983", size = 512824, upload-time = "2026-06-07T21:08:16.572Z" }, + { url = "https://files.pythonhosted.org/packages/26/97/2aa0e5ba0727dc3bd5aaebb7ccbc510f7dfb7fb961ec87497cd496635ab1/aiohttp-3.14.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4fe1f1087cbadb280b5e1bb054a4f00d1423c74d6626c5e48400d871d34ecefe", size = 1749898, upload-time = "2026-06-07T21:08:18.635Z" }, + { url = "https://files.pythonhosted.org/packages/00/8d/e97f6c96c891d457c8479d92a514ba194d0412f981d72c70341ee18488ed/aiohttp-3.14.1-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:367a9314fdc79dab0fac96e216cb41dd73c85bdca85306ce8999118ba7e0f333", size = 1710114, upload-time = "2026-06-07T21:08:20.892Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e6/aa8d7e863048c8fceb5cd6ce74017311cec3ead07847387e12265fb4444e/aiohttp-3.14.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a24f677ebe83749039e7bdf862ff0bbb16818ae4193d4ef96505e269375bcce0", size = 1802541, upload-time = "2026-06-07T21:08:23.044Z" }, + { url = "https://files.pythonhosted.org/packages/83/a8/72193137de57fda4ebfae4563182d082c8856e3b6e9871d0b46f028fb369/aiohttp-3.14.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c83afe0ba876be7e943d2e0ba645809ad441575d2840c895c21ee5de93b9377a", size = 1875776, upload-time = "2026-06-07T21:08:25.288Z" }, + { url = "https://files.pythonhosted.org/packages/a0/18/938441025db6769a3464596b2410af3afde0b21eb2f204c6f766f68af4bd/aiohttp-3.14.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:634e385930fb6d2d479cf3aa66515955863b77a5e3c2b5894ca259a25b308602", size = 1760329, upload-time = "2026-06-07T21:08:27.363Z" }, + { url = "https://files.pythonhosted.org/packages/60/29/bf2496b4065e76e09fe48015aaffe5ce161d8f089b06ac6982070f653076/aiohttp-3.14.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:eeea07c4397bbc57719c4eed8f9c284874d4f175f9b6d57f7a1546b976d455ca", size = 1587293, upload-time = "2026-06-07T21:08:29.805Z" }, + { url = "https://files.pythonhosted.org/packages/49/a2/2136674d52123b1354bd05dd5753c318db47dc0c927cc70b27bab3755456/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:335c0cc3e3545ce98dcb9cfcb836f40c3411f43fa03dab757597d80c89af8a35", size = 1714756, upload-time = "2026-06-07T21:08:32.094Z" }, + { url = "https://files.pythonhosted.org/packages/a7/b9/e5fd2e6f915503081c0f9b1e8540947037929c70c191da2e4d54b31a21a1/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:ae6be797afdef264e8a84864a85b196ca06045586481b3df8a967322fd2fa844", size = 1721052, upload-time = "2026-06-07T21:08:34.167Z" }, + { url = "https://files.pythonhosted.org/packages/63/5a/2833e324a2263e104e31e2e91bc5bbee81bc499afd32203faee048a883f0/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:8560b4d712474335d08907db7973f71912d3a9a8f1dee992ec06b5d2fe359496", size = 1766888, upload-time = "2026-06-07T21:08:36.95Z" }, + { url = "https://files.pythonhosted.org/packages/57/fa/dea6511870913162f3b2e8c42a7614eb203a4540b8c2da43e0bfb0548f3c/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7edd08e0a5deb1e8564a2fcd8f4561014a3f05252334671bbf55ddd47db0e5", size = 1581679, upload-time = "2026-06-07T21:08:39.292Z" }, + { url = "https://files.pythonhosted.org/packages/14/bd/3cf0d55e71784b33534e9710a67d382d900598b4787fbce6cc7317f8c42a/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:b6ff7fcee63287ae57b5df3e4f5957ce032122802509246dec1a5bcc55904c95", size = 
1782021, upload-time = "2026-06-07T21:08:41.407Z" }, + { url = "https://files.pythonhosted.org/packages/c1/af/14bb5843eccbe234f4dfb78ab73e549d99727247e62ae5d62cbd22eaf5b0/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6ffbb2f4ec1ceaff7e07d43922954da26b223d188bf30658e561b98e23089444", size = 1742574, upload-time = "2026-06-07T21:08:43.795Z" }, + { url = "https://files.pythonhosted.org/packages/f2/1e/fbeb7af9210a67ac0f9c9bec0f8f4568497924e33137a3d5b48e1cf85f3f/aiohttp-3.14.1-cp314-cp314-win32.whl", hash = "sha256:a9875b46d910cff3ea2f5962f9d266b465459fe634e22556ab9bd6fc1192eea0", size = 457773, upload-time = "2026-06-07T21:08:46.168Z" }, + { url = "https://files.pythonhosted.org/packages/f0/2b/13e8d741a9ec5db7d900c060554cf8352ab85e44e2a4469ebb9d377bda17/aiohttp-3.14.1-cp314-cp314-win_amd64.whl", hash = "sha256:af8b4b81a960eeaf1234971ac3cd0ba5901f3cd42eae42a46b4d089a8b492719", size = 485001, upload-time = "2026-06-07T21:08:48.401Z" }, + { url = "https://files.pythonhosted.org/packages/df/30/491acfa2c4d6c3ff59c49a14fc1b50be3241e25bbb0c84c09e2da4d11395/aiohttp-3.14.1-cp314-cp314-win_arm64.whl", hash = "sha256:cf4491381b1b57425c315a56a439251b1bdac07b2275f19a8c44bc57744532ec", size = 453809, upload-time = "2026-06-07T21:08:50.7Z" }, + { url = "https://files.pythonhosted.org/packages/34/e3/19dbe1a1f4cc6230eb9e314de7fe68053b0992f9302b27d12141a0b5db53/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:819c054312f1af92947e6a55883d1b66feefab11531a7fc45e0fb9b63880b5c2", size = 793320, upload-time = "2026-06-07T21:08:52.775Z" }, + { url = "https://files.pythonhosted.org/packages/7f/20/1b7182219ba1b108430d6e4dc53d25ae02dcfcf5a045b33af4e8c5167527/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10ee9c1753a8f706345b22496c79fbddb5be0599e0823f3738b1534058e25340", size = 529077, upload-time = "2026-06-07T21:08:55Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/c8/14ce60ec31a2e5f5274bb17d383a6f7a3aabca31ac04eee05585bbadab16/aiohttp-3.14.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1601cc37baf5750ccacae618ec2daf020769581695550e3b654a911f859c563d", size = 532476, upload-time = "2026-06-07T21:08:57.176Z" }, + { url = "https://files.pythonhosted.org/packages/7e/02/9ac85e081e53da2e061b02fa7758fe0a12d17b8ce2d1f5e6c7cb76730328/aiohttp-3.14.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4d6e0ac9da31c9c04c84e1c0182ad8d6df35965a85cae29cd71d089621b3ae94", size = 1922347, upload-time = "2026-06-07T21:08:59.563Z" }, + { url = "https://files.pythonhosted.org/packages/c0/3e/d3ba07a0ab38b5389e10bec4362d21e10a4f667cba2d79ba30837b3a5059/aiohttp-3.14.1-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e8f2d660c350b3d0e259c7a7e3d9b7fc8b41210cbcc3d4a7076ff0a5e5c2fdc", size = 1786465, upload-time = "2026-06-07T21:09:01.909Z" }, + { url = "https://files.pythonhosted.org/packages/0b/cb/e2ee978a00cfb2df829704a69528b18154eba5939f45bc1efa8f33aee4c5/aiohttp-3.14.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4691802dda97be727f79d86818acaad7eb8e9252626a1d6b519fedbb92d5e251", size = 1909423, upload-time = "2026-06-07T21:09:04.357Z" }, + { url = "https://files.pythonhosted.org/packages/73/5d/1430334858b1022b58ae50399a918f0bd6fe8fa7fa183598d657ff61e040/aiohttp-3.14.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c389c482a7e9b9dc3ee2701ac46c4125297a3818875b9c305ddb603c04828fd1", size = 2001906, upload-time = "2026-06-07T21:09:06.722Z" }, + { url = "https://files.pythonhosted.org/packages/66/4e/560c7472d3d198a23aa5c8b19a5115bf6a9b77b7d3e4bb363da320430ad2/aiohttp-3.14.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:fc0cacab7ba4e56f0f81c82a98c09bed2f39c940107b03a34b168bdf7597edd3", size = 1877095, upload-time = "2026-06-07T21:09:09.011Z" }, + { url = "https://files.pythonhosted.org/packages/0d/f1/4745806578d447db4a784a8591e2dae3afdfc2bcb96f8f81271b13df6543/aiohttp-3.14.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:979ed4717f59b8bb12e3963378fa285d93d367e15bcd66c721311826d3c44a6c", size = 1676222, upload-time = "2026-06-07T21:09:11.461Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c9/48255813cca749a229ef0ab476004ec623728ad79a9c0840616f6c076325/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:38e1e7daaea81df51c952e18483f323d878499a1e2bfe564790e0f9701d6f203", size = 1842922, upload-time = "2026-06-07T21:09:14.118Z" }, + { url = "https://files.pythonhosted.org/packages/3d/c0/bbd054e2bee909f529523a5af3891052606af5143c09f5f183ec3b234676/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:4132e72c608fe9fecb8f409113567605915b83e9bdd3ea56538d2f9cd35002f1", size = 1825035, upload-time = "2026-06-07T21:09:16.447Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ae/90395d4376deceb74e09ec26b6adf7d2015a6f8802d6d84446af860fef04/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:eefd9cc9b6d4a2db5f00a26bc3e4f9acf71926a6ec557cd56c9c6f27c290b665", size = 1849512, upload-time = "2026-06-07T21:09:18.742Z" }, + { url = "https://files.pythonhosted.org/packages/93/bd/fb25f3049957553d4ce0ba6ae480aa2f592a6985497fca590837d16c1be0/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:b165790117eea512d7f3fb22f1f6dad3d55a7189571993eb015591c1401276d1", size = 1668571, upload-time = "2026-06-07T21:09:21.458Z" }, + { url = "https://files.pythonhosted.org/packages/3f/22/7f73303d64dd567ff3addca90b556690ed1233a47b8f55d242fb90af3681/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:ed09c7eb1c391271c2ed0314a51903e72a3acb653d5ccfc264cdf3ef11f8269d", size = 
1881159, upload-time = "2026-06-07T21:09:23.813Z" }, + { url = "https://files.pythonhosted.org/packages/44/be/0474c5a8b5640e1e4aa1923430a91f4151be82e511373fe764189b89aef5/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:99abd37084b82f5830c635fddd0b4993b9742a66eb746dacf433c8590e8f9e3c", size = 1841409, upload-time = "2026-06-07T21:09:26.207Z" }, + { url = "https://files.pythonhosted.org/packages/7b/3c/bb4a7cba26956cb3da4553cc2056cf67be5b5ff6e6d8fa4fbdff73bfb7ae/aiohttp-3.14.1-cp314-cp314t-win32.whl", hash = "sha256:47ddf841cdecc810749921d25606dee45857d12d2ad5ddb7b5bd7eab12e4b365", size = 494166, upload-time = "2026-06-07T21:09:28.505Z" }, + { url = "https://files.pythonhosted.org/packages/8a/84/ec80c2c1f66a952555a9f86df6b33af65108a6febfa0471b69013a12f807/aiohttp-3.14.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5e78b522b7a6e27e0b25d19b247b75039ac4c94f99823e3c9e53ae1603a9f7e9", size = 530255, upload-time = "2026-06-07T21:09:30.843Z" }, + { url = "https://files.pythonhosted.org/packages/2a/71/6e22be134a4061ada85a92951b842f2657f17d926b727f3f94c56ae963d6/aiohttp-3.14.1-cp314-cp314t-win_arm64.whl", hash = "sha256:90d53f1609c29ccc2193945ef732428382a28f78d0456ae4d3daf0d48b74f0f6", size = 469640, upload-time = "2026-06-07T21:09:33.028Z" }, ] [[package]] @@ -146,7 +173,7 @@ wheels = [ [[package]] name = "anthropic" -version = "0.76.0" +version = "0.104.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -158,9 +185,9 @@ dependencies = [ { name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6e/be/d11abafaa15d6304826438170f7574d750218f49a106c54424a40cef4494/anthropic-0.76.0.tar.gz", hash = "sha256:e0cae6a368986d5cf6df743dfbb1b9519e6a9eee9c6c942ad8121c0b34416ffe", size = 495483, upload-time = "2026-01-13T18:41:14.908Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/22/c7/7a655b948916f777354648ce979f68b94d5b8dbdb5f61fed1f37fad9378c/anthropic-0.104.1.tar.gz", hash = "sha256:17362b6c45f527afcc9b0fdf62011ffd359726ab2ebcb1978ea0cc41bd8d8d40", size = 850081, upload-time = "2026-05-22T15:36:57.432Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/70/7b0fd9c1a738f59d3babe2b4212031c34ab7d0fda4ffef15b58a55c5bcea/anthropic-0.76.0-py3-none-any.whl", hash = "sha256:81efa3113901192af2f0fe977d3ec73fdadb1e691586306c4256cd6d5ccc331c", size = 390309, upload-time = "2026-01-13T18:41:13.483Z" }, + { url = "https://files.pythonhosted.org/packages/b8/12/d9ab42790494d7c428391a46cd28492395566a6a8ccb138d681978594455/anthropic-0.104.1-py3-none-any.whl", hash = "sha256:35c8cb456f5a4405aafe1f10f03f6fcc54fa51fa8ec01d655cc4b437d120e9b7", size = 832996, upload-time = "2026-05-22T15:36:59.519Z" }, ] [[package]] @@ -194,6 +221,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/66/686ac4fc6ef48f5bacde625adac698f41d5316a9753c2b20bb0931c9d4e2/astroid-4.0.3-py3-none-any.whl", hash = "sha256:864a0a34af1bd70e1049ba1e61cee843a7252c826d97825fcee9b2fcbd9e1b14", size = 276443, upload-time = "2026-01-03T22:14:24.412Z" }, ] +[[package]] +name = "atheris" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/58/5965955898e16bee17c8379eae12194993bf641c4629016991248b862069/atheris-3.0.0.tar.gz", hash = "sha256:1f0929c7bc3040f3fe4102e557718734190cf2d7718bbb8e3ce6d3eb56ef5bb3", size = 373239, upload-time = "2025-11-24T23:54:02.15Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/8c/e9960b996e70e5f6a523670431166b2b238de52fef094955515dcf854da1/atheris-3.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:510e502c57b6dc615fb174066407af620d4c7f73cf08a782c86e7761bf12c4eb", size = 34907016, upload-time = "2025-11-24T23:53:56.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/db/48/df670f75f458cc7c1752a01a394fd59c830b08172dd59cf29d73f31050f9/atheris-3.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a402cdca8a650d1371050b1f9552eb4cdc488d2db64950d603c4560318365eac", size = 34858525, upload-time = "2025-11-24T23:53:59.925Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -205,14 +242,38 @@ wheels = [ [[package]] name = "authlib" -version = "1.6.6" +version = "1.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cryptography" }, + { name = "joserfc" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/98/7d93f30d029643c0275dbc0bd6d5a6f670661ee6c9a94d93af7ab4887600/authlib-1.7.2.tar.gz", hash = "sha256:2cea25fefcd4e7173bdf1372c0afc265c8034b23a8cd5dcb6a9164b826c64231", size = 176511, upload-time = "2026-05-06T08:10:23.116Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/95/adcb68e20c34162e9135f370d6e31737719c2b6f94bc953fe7ed1f10fe21/authlib-1.7.2-py2.py3-none-any.whl", hash = "sha256:3e1faedc9d87e7d56a164eca3ccb6ace0d61b94abe83e92242f8dc8bba9b4a9f", size = 259548, upload-time = "2026-05-06T08:10:21.436Z" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bb/9b/b1661026ff24bc641b76b78c5222d614776b0c085bcfdac9bd15a1cb4b35/authlib-1.6.6.tar.gz", hash = "sha256:45770e8e056d0f283451d9996fbb59b70d45722b45d854d58f32878d0a40c38e", size = 164894, upload-time = "2025-12-12T08:01:41.464Z" } + +[[package]] +name = "babel" +version = "2.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" }, +] + +[[package]] +name = "backrefs" +version = "6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/a6/e325ec73b638d3ede4421b5445d4a0b8b219481826cc079d510100af356c/backrefs-6.2.tar.gz", hash = "sha256:f44ff4d48808b243b6c0cdc6231e22195c32f77046018141556c66f8bab72a49", size = 7012303, upload-time = "2026-02-16T19:10:15.828Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/54/51/321e821856452f7386c4e9df866f196720b1ad0c5ea1623ea7399969ae3b/authlib-1.6.6-py2.py3-none-any.whl", hash = "sha256:7d9e9bc535c13974313a87f53e8430eb6ea3d1cf6ae4f6efcd793f2e949143fd", size = 244005, upload-time = "2025-12-12T08:01:40.209Z" }, + { url = "https://files.pythonhosted.org/packages/1b/39/3765df263e08a4df37f4f43cb5aa3c6c17a4bdd42ecfe841e04c26037171/backrefs-6.2-py310-none-any.whl", hash = "sha256:0fdc7b012420b6b144410342caeb8adc54c6866cf12064abc9bb211302e496f8", size = 381075, upload-time = "2026-02-16T19:10:04.322Z" }, + { url = "https://files.pythonhosted.org/packages/0f/f0/35240571e1b67ffb19dafb29ab34150b6f59f93f717b041082cdb1bfceb1/backrefs-6.2-py311-none-any.whl", hash = "sha256:08aa7fae530c6b2361d7bdcbda1a7c454e330cc9dbcd03f5c23205e430e5c3be", size = 392874, upload-time = "2026-02-16T19:10:06.314Z" }, + { url = "https://files.pythonhosted.org/packages/e3/63/77e8c9745b4d227cce9f5e0a6f68041278c5f9b18588b35905f5f19c1beb/backrefs-6.2-py312-none-any.whl", hash = "sha256:c3f4b9cb2af8cda0d87ab4f57800b57b95428488477be164dd2b47be54db0c90", size = 398787, upload-time = "2026-02-16T19:10:08.274Z" }, + { url = 
"https://files.pythonhosted.org/packages/c5/71/c754b1737ad99102e03fa3235acb6cb6d3ac9d6f596cbc3e5f236705abd8/backrefs-6.2-py313-none-any.whl", hash = "sha256:12df81596ab511f783b7d87c043ce26bc5b0288cf3bb03610fe76b8189282b2b", size = 400747, upload-time = "2026-02-16T19:10:09.791Z" }, + { url = "https://files.pythonhosted.org/packages/af/75/be12ba31a6eb20dccef2320cd8ccb3f7d9013b68ba4c70156259fee9e409/backrefs-6.2-py314-none-any.whl", hash = "sha256:e5f805ae09819caa1aa0623b4a83790e7028604aa2b8c73ba602c4454e665de7", size = 412602, upload-time = "2026-02-16T19:10:12.317Z" }, + { url = "https://files.pythonhosted.org/packages/21/f8/d02f650c47d05034dcd6f9c8cf94f39598b7a89c00ecda0ecb2911bc27e9/backrefs-6.2-py39-none-any.whl", hash = "sha256:664e33cd88c6840b7625b826ecf2555f32d491800900f5a541f772c485f7cda7", size = 381077, upload-time = "2026-02-16T19:10:13.74Z" }, ] [[package]] @@ -250,30 +311,30 @@ wheels = [ [[package]] name = "boto3" -version = "1.42.33" +version = "1.43.14" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "botocore" }, { name = "jmespath" }, { name = "s3transfer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d4/c7/695a39a862140dd40637a3dc0020f4f645bb78c47f0d9195db76ed7e1da2/boto3-1.42.33.tar.gz", hash = "sha256:5da0d35dd82451d4520af63f8fcc722537597d7c790035e8b3a8fc53f032be3a", size = 112844, upload-time = "2026-01-22T20:29:15.817Z" } +sdist = { url = "https://files.pythonhosted.org/packages/79/4b/616367e871ce3f1cb3e8545a97736b6331b9fb081497f2d44c5b2aa6959d/boto3-1.43.14.tar.gz", hash = "sha256:5c0a994b3182061ee101812e721100717a4d664f9f4ceaf4a86b6d032ce9fc2d", size = 113142, upload-time = "2026-05-22T19:28:47.861Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/be/93/80aa0c9c5931e72252cbf46162f5b438f040f618bb941aa85bb591c62bc9/boto3-1.42.33-py3-none-any.whl", hash = "sha256:81db4a1ef08b3a69b2c5a879e7bd26ee43ca3fd5202cd320a2aaa4f5dd11182c", size = 140574, upload-time = "2026-01-22T20:29:13.531Z" 
}, + { url = "https://files.pythonhosted.org/packages/cb/00/59cb9329c18e2d3aa23062ceaa87d065f2e81e7d2931df24d64e9a7815aa/boto3-1.43.14-py3-none-any.whl", hash = "sha256:574335744656cfed0b362a0a0467aaf2eb2bf15526edcd02d31d3c661f4b09e4", size = 140536, upload-time = "2026-05-22T19:28:46.49Z" }, ] [[package]] name = "botocore" -version = "1.42.33" +version = "1.43.14" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8f/ea/7bfe0902a228b4aa73106e704188189ab0e16e0a0e9598fa2b126ebfe759/botocore-1.42.33.tar.gz", hash = "sha256:ecf48db73605a592b6c7f8f29e517d9eb6cf0c7e004a1fdbd9c192afc7b42b03", size = 14903415, upload-time = "2026-01-22T20:29:04.293Z" } +sdist = { url = "https://files.pythonhosted.org/packages/78/3c/798d2f7deb118241930c7c6bcfb0b970d3f0245bf580700663199aeed2c3/botocore-1.43.14.tar.gz", hash = "sha256:b9e500737e43d2f147c9d4e23b54360335e77d4c0ba90a318f51b65e06cb8516", size = 15382604, upload-time = "2026-05-22T19:28:36.363Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2f/58/da9a094c8c2499a19c57f4aedca2d5fb2c88bfb9e2931d87af41309c4521/botocore-1.42.33-py3-none-any.whl", hash = "sha256:156a1ead55c38709730c543eb8085c36098b7baf272fedc67cc4a543ae4b4cf6", size = 14575729, upload-time = "2026-01-22T20:29:00.759Z" }, + { url = "https://files.pythonhosted.org/packages/27/7e/6e64821077cd2efc4aa51b7d638fb6d48e1c7c450201c529fbaf1de8bfd3/botocore-1.43.14-py3-none-any.whl", hash = "sha256:1f4a2a95ea78c10398e78431e98c1fe47adb54a7b10a32975144c1f541186658", size = 15061424, upload-time = "2026-05-22T19:28:32.682Z" }, ] [[package]] @@ -294,6 +355,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/fc/1d7b80d0eb7b714984ce40efc78859c022cd930e402f599d8ca9e39c78a4/cachetools-6.2.4-py3-none-any.whl", hash = "sha256:69a7a52634fed8b8bf6e24a050fb60bff1c9bd8f6d24572b99c32d4e71e62a51", size = 11551, 
upload-time = "2025-12-15T18:24:52.332Z" }, ] +[[package]] +name = "caio" +version = "0.9.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/92/88/b8527e1b00c1811db339a1df8bd1ae49d146fcea9d6a5c40e3a80aaeb38d/caio-0.9.25.tar.gz", hash = "sha256:16498e7f81d1d0f5a4c0ad3f2540e65fe25691376e0a5bd367f558067113ed10", size = 26781, upload-time = "2025-12-26T15:21:36.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/25/79c98ebe12df31548ba4eaf44db11b7cad6b3e7b4203718335620939083c/caio-0.9.25-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fb7ff95af4c31ad3f03179149aab61097a71fd85e05f89b4786de0359dffd044", size = 36983, upload-time = "2025-12-26T15:21:36.075Z" }, + { url = "https://files.pythonhosted.org/packages/a3/2b/21288691f16d479945968a0a4f2856818c1c5be56881d51d4dac9b255d26/caio-0.9.25-cp312-cp312-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:97084e4e30dfa598449d874c4d8e0c8d5ea17d2f752ef5e48e150ff9d240cd64", size = 82012, upload-time = "2025-12-26T15:22:20.983Z" }, + { url = "https://files.pythonhosted.org/packages/03/c4/8a1b580875303500a9c12b9e0af58cb82e47f5bcf888c2457742a138273c/caio-0.9.25-cp312-cp312-manylinux_2_34_aarch64.whl", hash = "sha256:4fa69eba47e0f041b9d4f336e2ad40740681c43e686b18b191b6c5f4c5544bfb", size = 81502, upload-time = "2026-03-04T22:08:22.381Z" }, + { url = "https://files.pythonhosted.org/packages/d1/1c/0fe770b8ffc8362c48134d1592d653a81a3d8748d764bec33864db36319d/caio-0.9.25-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:6bebf6f079f1341d19f7386db9b8b1f07e8cc15ae13bfdaff573371ba0575d69", size = 80200, upload-time = "2026-03-04T22:08:23.382Z" }, + { url = "https://files.pythonhosted.org/packages/31/57/5e6ff127e6f62c9f15d989560435c642144aa4210882f9494204bc892305/caio-0.9.25-cp313-cp313-macosx_10_13_universal2.whl", hash = 
"sha256:d6c2a3411af97762a2b03840c3cec2f7f728921ff8adda53d7ea2315a8563451", size = 36979, upload-time = "2025-12-26T15:21:35.484Z" }, + { url = "https://files.pythonhosted.org/packages/a3/9f/f21af50e72117eb528c422d4276cbac11fb941b1b812b182e0a9c70d19c5/caio-0.9.25-cp313-cp313-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0998210a4d5cd5cb565b32ccfe4e53d67303f868a76f212e002a8554692870e6", size = 81900, upload-time = "2025-12-26T15:22:21.919Z" }, + { url = "https://files.pythonhosted.org/packages/9c/12/c39ae2a4037cb10ad5eb3578eb4d5f8c1a2575c62bba675f3406b7ef0824/caio-0.9.25-cp313-cp313-manylinux_2_34_aarch64.whl", hash = "sha256:1a177d4777141b96f175fe2c37a3d96dec7911ed9ad5f02bac38aaa1c936611f", size = 81523, upload-time = "2026-03-04T22:08:25.187Z" }, + { url = "https://files.pythonhosted.org/packages/22/59/f8f2e950eb4f1a5a3883e198dca514b9d475415cb6cd7b78b9213a0dd45a/caio-0.9.25-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:9ed3cfb28c0e99fec5e208c934e5c157d0866aa9c32aa4dc5e9b6034af6286b7", size = 80243, upload-time = "2026-03-04T22:08:26.449Z" }, + { url = "https://files.pythonhosted.org/packages/69/ca/a08fdc7efdcc24e6a6131a93c85be1f204d41c58f474c42b0670af8c016b/caio-0.9.25-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fab6078b9348e883c80a5e14b382e6ad6aabbc4429ca034e76e730cf464269db", size = 36978, upload-time = "2025-12-26T15:21:41.055Z" }, + { url = "https://files.pythonhosted.org/packages/5e/6c/d4d24f65e690213c097174d26eda6831f45f4734d9d036d81790a27e7b78/caio-0.9.25-cp314-cp314-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:44a6b58e52d488c75cfaa5ecaa404b2b41cc965e6c417e03251e868ecd5b6d77", size = 81832, upload-time = "2025-12-26T15:22:22.757Z" }, + { url = "https://files.pythonhosted.org/packages/87/a4/e534cf7d2d0e8d880e25dd61e8d921ffcfe15bd696734589826f5a2df727/caio-0.9.25-cp314-cp314-manylinux_2_34_aarch64.whl", hash = 
"sha256:628a630eb7fb22381dd8e3c8ab7f59e854b9c806639811fc3f4310c6bd711d79", size = 81565, upload-time = "2026-03-04T22:08:27.483Z" }, + { url = "https://files.pythonhosted.org/packages/3f/ed/bf81aeac1d290017e5e5ac3e880fd56ee15e50a6d0353986799d1bc5cfd5/caio-0.9.25-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:0ba16aa605ccb174665357fc729cf500679c2d94d5f1458a6f0d5ca48f2060a7", size = 80071, upload-time = "2026-03-04T22:08:28.751Z" }, + { url = "https://files.pythonhosted.org/packages/86/93/1f76c8d1bafe3b0614e06b2195784a3765bbf7b0a067661af9e2dd47fc33/caio-0.9.25-py3-none-any.whl", hash = "sha256:06c0bb02d6b929119b1cfbe1ca403c768b2013a369e2db46bfa2a5761cf82e40", size = 19087, upload-time = "2025-12-26T15:22:00.221Z" }, +] + [[package]] name = "certifi" version = "2026.1.4" @@ -450,23 +532,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/75/45/54bb2d8d4138964a94bef6e9afe48b0be4705ba66ac442ae7d8a8dc4ffef/click_option_group-0.5.9-py3-none-any.whl", hash = "sha256:ad2599248bd373e2e19bec5407967c3eec1d0d4fc4a5e77b08a0481e75991080", size = 11553, upload-time = "2025-10-09T09:38:00.066Z" }, ] -[[package]] -name = "cloudpickle" -version = "3.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, -] - [[package]] name = "code-graph-rag" -version = "0.0.58" +version = "0.0.187" source = { editable = "." 
} dependencies = [ { name = "click" }, { name = "defusedxml" }, { name = "diff-match-patch" }, + { name = "griffe" }, { name = "huggingface-hub", extra = ["hf-xet"] }, { name = "loguru" }, { name = "mcp" }, @@ -477,6 +551,7 @@ dependencies = [ { name = "pymgclient" }, { name = "python-dotenv" }, { name = "rich" }, + { name = "tiktoken" }, { name = "toml" }, { name = "tree-sitter" }, { name = "tree-sitter-python" }, @@ -491,6 +566,7 @@ semantic = [ { name = "transformers" }, ] test = [ + { name = "libclang" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, @@ -498,11 +574,13 @@ test = [ { name = "testcontainers" }, ] treesitter-full = [ + { name = "tree-sitter-c" }, { name = "tree-sitter-cpp" }, { name = "tree-sitter-go" }, { name = "tree-sitter-java" }, { name = "tree-sitter-javascript" }, { name = "tree-sitter-lua" }, + { name = "tree-sitter-php" }, { name = "tree-sitter-python" }, { name = "tree-sitter-rust" }, { name = "tree-sitter-scala" }, @@ -516,6 +594,7 @@ dev = [ { name = "pre-commit" }, { name = "pyinstaller" }, { name = "pylint" }, + { name = "pytest" }, { name = "radon" }, { name = "ruff" }, { name = "semgrep" }, @@ -524,43 +603,56 @@ dev = [ { name = "types-toml" }, { name = "vulture" }, ] +docs = [ + { name = "mkdocs" }, + { name = "mkdocs-material" }, + { name = "mkdocs-minify-plugin" }, +] +fuzz = [ + { name = "atheris" }, +] [package.metadata] requires-dist = [ - { name = "click", specifier = ">=8.0.0" }, + { name = "click", specifier = ">=8.3.1" }, { name = "defusedxml", specifier = ">=0.7.1" }, { name = "diff-match-patch", specifier = ">=20241021" }, - { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.36.0" }, + { name = "griffe", specifier = ">=1.0,<2" }, + { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=1.7.2" }, + { name = "libclang", marker = "extra == 'test'", specifier = ">=18.1.1" }, { name = "loguru", specifier = ">=0.7.3" }, - { name = "mcp", specifier = ">=1.21.1" }, - { name = 
"prompt-toolkit", specifier = ">=3.0.0" }, - { name = "protobuf", specifier = ">=5.27.0" }, - { name = "pydantic-ai", specifier = ">=1.27.0" }, - { name = "pydantic-settings", specifier = ">=2.0.0" }, - { name = "pymgclient", specifier = ">=1.4.0" }, + { name = "mcp", specifier = ">=1.25.0" }, + { name = "prompt-toolkit", specifier = ">=3.0.52" }, + { name = "protobuf", specifier = ">=6.33.5" }, + { name = "pydantic-ai", specifier = ">=1.102.0" }, + { name = "pydantic-settings", specifier = ">=2.12.0" }, + { name = "pymgclient", specifier = ">=1.5.1" }, { name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.1" }, { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.0.0" }, { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" }, { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=3.8.0" }, - { name = "python-dotenv", specifier = ">=1.1.0" }, + { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "qdrant-client", marker = "extra == 'semantic'", specifier = ">=1.9.0" }, - { name = "rich", specifier = ">=13.7.1" }, + { name = "rich", specifier = ">=14.2.0" }, { name = "testcontainers", marker = "extra == 'test'", specifier = ">=4.9.0" }, + { name = "tiktoken", specifier = ">=0.12.0" }, { name = "toml", specifier = ">=0.10.2" }, { name = "torch", marker = "extra == 'semantic'", specifier = ">=2.6.0" }, { name = "transformers", marker = "extra == 'semantic'", specifier = ">=4.0.0" }, - { name = "tree-sitter", specifier = "==0.25.0" }, + { name = "tree-sitter", specifier = "==0.25.2" }, + { name = "tree-sitter-c", marker = "extra == 'treesitter-full'", specifier = ">=0.24.1" }, { name = "tree-sitter-cpp", marker = "extra == 'treesitter-full'", specifier = ">=0.23.0" }, { name = "tree-sitter-go", marker = "extra == 'treesitter-full'", specifier = ">=0.23.4" }, { name = "tree-sitter-java", marker = "extra == 'treesitter-full'", specifier = ">=0.23.5" }, { name = "tree-sitter-javascript", marker = 
"extra == 'treesitter-full'", specifier = ">=0.23.1" }, { name = "tree-sitter-lua", marker = "extra == 'treesitter-full'", specifier = ">=0.0.19" }, - { name = "tree-sitter-python", specifier = ">=0.23.6" }, + { name = "tree-sitter-php", marker = "extra == 'treesitter-full'", specifier = ">=0.24.1" }, + { name = "tree-sitter-python", specifier = ">=0.25.0" }, { name = "tree-sitter-python", marker = "extra == 'treesitter-full'", specifier = ">=0.23.6" }, { name = "tree-sitter-rust", marker = "extra == 'treesitter-full'", specifier = ">=0.24.0" }, { name = "tree-sitter-scala", marker = "extra == 'treesitter-full'", specifier = ">=0.24.0" }, { name = "tree-sitter-typescript", marker = "extra == 'treesitter-full'", specifier = ">=0.23.2" }, - { name = "typer", specifier = ">=0.12.5" }, + { name = "typer", specifier = ">=0.21.1" }, { name = "watchdog", specifier = ">=6.0.0" }, ] provides-extras = ["test", "treesitter-full", "semantic"] @@ -572,6 +664,7 @@ dev = [ { name = "pre-commit", specifier = ">=4.2.0" }, { name = "pyinstaller", specifier = ">=6.14.1" }, { name = "pylint", specifier = ">=4.0.4" }, + { name = "pytest", specifier = ">=9.0.2" }, { name = "radon", specifier = ">=6.0.1" }, { name = "ruff", specifier = ">=0.5.5" }, { name = "semgrep", specifier = ">=1.79.0" }, @@ -580,10 +673,16 @@ dev = [ { name = "types-toml", specifier = ">=0.10.8.20240310" }, { name = "vulture", specifier = ">=2.14" }, ] +docs = [ + { name = "mkdocs", specifier = ">=1.6.1,<2" }, + { name = "mkdocs-material", specifier = ">=9.7.3" }, + { name = "mkdocs-minify-plugin", specifier = ">=0.8.0" }, +] +fuzz = [{ name = "atheris", specifier = ">=2.3.0" }] [[package]] name = "cohere" -version = "5.20.1" +version = "5.20.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "fastavro" }, @@ -595,9 +694,9 @@ dependencies = [ { name = "types-requests" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/4b/ed/bb02083654bdc089ae4ef1cd7691fd2233f1fd9f32bcbfacc80ff57d9775/cohere-5.20.1.tar.gz", hash = "sha256:50973f63d2c6138ff52ce37d8d6f78ccc539af4e8c43865e960d68e0bf835b6f", size = 180820, upload-time = "2025-12-18T16:39:50.975Z" } +sdist = { url = "https://files.pythonhosted.org/packages/44/0b/96e2b55a0114ed9d69b3154565f54b764e7530735426290b000f467f4c0f/cohere-5.20.7.tar.gz", hash = "sha256:997ed85fabb3a1e4a4c036fdb520382e7bfa670db48eb59a026803b6f7061dbb", size = 184986, upload-time = "2026-02-25T01:22:18.673Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/e3/94eb11ac3ebaaa3a6afb5d2ff23db95d58bc468ae538c388edf49f2f20b5/cohere-5.20.1-py3-none-any.whl", hash = "sha256:d230fd13d95ba92ae927fce3dd497599b169883afc7954fe29b39fb8d5df5fc7", size = 318973, upload-time = "2025-12-18T16:39:49.504Z" }, + { url = "https://files.pythonhosted.org/packages/9d/86/dc991a75e3b9c2007b90dbfaf7f36fdb2457c216f799e26ce0474faf0c1f/cohere-5.20.7-py3-none-any.whl", hash = "sha256:043fef2a12c30c07e9b2c1f0b869fd66ffd911f58d1492f87e901c4190a65914", size = 323389, upload-time = "2026-02-25T01:22:16.902Z" }, ] [[package]] @@ -685,81 +784,127 @@ wheels = [ [[package]] name = "cryptography" -version = "46.0.3" +version = "48.0.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9f/33/c00162f49c0e2fe8064a62cb92b93e50c74a72bc370ab92f86112b33ff62/cryptography-46.0.3.tar.gz", hash = "sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1", size = 749258, upload-time = "2025-10-15T23:18:31.74Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/42/9c391dd801d6cf0d561b5890549d4b27bafcc53b39c31a817e69d87c625b/cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a", size = 7225004, 
upload-time = "2025-10-15T23:16:52.239Z" }, - { url = "https://files.pythonhosted.org/packages/1c/67/38769ca6b65f07461eb200e85fc1639b438bdc667be02cf7f2cd6a64601c/cryptography-46.0.3-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc", size = 4296667, upload-time = "2025-10-15T23:16:54.369Z" }, - { url = "https://files.pythonhosted.org/packages/5c/49/498c86566a1d80e978b42f0d702795f69887005548c041636df6ae1ca64c/cryptography-46.0.3-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d", size = 4450807, upload-time = "2025-10-15T23:16:56.414Z" }, - { url = "https://files.pythonhosted.org/packages/4b/0a/863a3604112174c8624a2ac3c038662d9e59970c7f926acdcfaed8d61142/cryptography-46.0.3-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb", size = 4299615, upload-time = "2025-10-15T23:16:58.442Z" }, - { url = "https://files.pythonhosted.org/packages/64/02/b73a533f6b64a69f3cd3872acb6ebc12aef924d8d103133bb3ea750dc703/cryptography-46.0.3-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849", size = 4016800, upload-time = "2025-10-15T23:17:00.378Z" }, - { url = "https://files.pythonhosted.org/packages/25/d5/16e41afbfa450cde85a3b7ec599bebefaef16b5c6ba4ec49a3532336ed72/cryptography-46.0.3-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8", size = 4984707, upload-time = "2025-10-15T23:17:01.98Z" }, - { url = "https://files.pythonhosted.org/packages/c9/56/e7e69b427c3878352c2fb9b450bd0e19ed552753491d39d7d0a2f5226d41/cryptography-46.0.3-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec", size = 4482541, upload-time = 
"2025-10-15T23:17:04.078Z" }, - { url = "https://files.pythonhosted.org/packages/78/f6/50736d40d97e8483172f1bb6e698895b92a223dba513b0ca6f06b2365339/cryptography-46.0.3-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91", size = 4299464, upload-time = "2025-10-15T23:17:05.483Z" }, - { url = "https://files.pythonhosted.org/packages/00/de/d8e26b1a855f19d9994a19c702fa2e93b0456beccbcfe437eda00e0701f2/cryptography-46.0.3-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e", size = 4950838, upload-time = "2025-10-15T23:17:07.425Z" }, - { url = "https://files.pythonhosted.org/packages/8f/29/798fc4ec461a1c9e9f735f2fc58741b0daae30688f41b2497dcbc9ed1355/cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926", size = 4481596, upload-time = "2025-10-15T23:17:09.343Z" }, - { url = "https://files.pythonhosted.org/packages/15/8d/03cd48b20a573adfff7652b76271078e3045b9f49387920e7f1f631d125e/cryptography-46.0.3-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71", size = 4426782, upload-time = "2025-10-15T23:17:11.22Z" }, - { url = "https://files.pythonhosted.org/packages/fa/b1/ebacbfe53317d55cf33165bda24c86523497a6881f339f9aae5c2e13e57b/cryptography-46.0.3-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac", size = 4698381, upload-time = "2025-10-15T23:17:12.829Z" }, - { url = "https://files.pythonhosted.org/packages/96/92/8a6a9525893325fc057a01f654d7efc2c64b9de90413adcf605a85744ff4/cryptography-46.0.3-cp311-abi3-win32.whl", hash = "sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018", size = 3055988, upload-time = "2025-10-15T23:17:14.65Z" }, - { url = 
"https://files.pythonhosted.org/packages/7e/bf/80fbf45253ea585a1e492a6a17efcb93467701fa79e71550a430c5e60df0/cryptography-46.0.3-cp311-abi3-win_amd64.whl", hash = "sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb", size = 3514451, upload-time = "2025-10-15T23:17:16.142Z" }, - { url = "https://files.pythonhosted.org/packages/2e/af/9b302da4c87b0beb9db4e756386a7c6c5b8003cd0e742277888d352ae91d/cryptography-46.0.3-cp311-abi3-win_arm64.whl", hash = "sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c", size = 2928007, upload-time = "2025-10-15T23:17:18.04Z" }, - { url = "https://files.pythonhosted.org/packages/f5/e2/a510aa736755bffa9d2f75029c229111a1d02f8ecd5de03078f4c18d91a3/cryptography-46.0.3-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:00a5e7e87938e5ff9ff5447ab086a5706a957137e6e433841e9d24f38a065217", size = 7158012, upload-time = "2025-10-15T23:17:19.982Z" }, - { url = "https://files.pythonhosted.org/packages/73/dc/9aa866fbdbb95b02e7f9d086f1fccfeebf8953509b87e3f28fff927ff8a0/cryptography-46.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c8daeb2d2174beb4575b77482320303f3d39b8e81153da4f0fb08eb5fe86a6c5", size = 4288728, upload-time = "2025-10-15T23:17:21.527Z" }, - { url = "https://files.pythonhosted.org/packages/c5/fd/bc1daf8230eaa075184cbbf5f8cd00ba9db4fd32d63fb83da4671b72ed8a/cryptography-46.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:39b6755623145ad5eff1dab323f4eae2a32a77a7abef2c5089a04a3d04366715", size = 4435078, upload-time = "2025-10-15T23:17:23.042Z" }, - { url = "https://files.pythonhosted.org/packages/82/98/d3bd5407ce4c60017f8ff9e63ffee4200ab3e23fe05b765cab805a7db008/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:db391fa7c66df6762ee3f00c95a89e6d428f4d60e7abc8328f4fe155b5ac6e54", size = 4293460, upload-time = "2025-10-15T23:17:24.885Z" }, - { url = 
"https://files.pythonhosted.org/packages/26/e9/e23e7900983c2b8af7a08098db406cf989d7f09caea7897e347598d4cd5b/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:78a97cf6a8839a48c49271cdcbd5cf37ca2c1d6b7fdd86cc864f302b5e9bf459", size = 3995237, upload-time = "2025-10-15T23:17:26.449Z" }, - { url = "https://files.pythonhosted.org/packages/91/15/af68c509d4a138cfe299d0d7ddb14afba15233223ebd933b4bbdbc7155d3/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:dfb781ff7eaa91a6f7fd41776ec37c5853c795d3b358d4896fdbb5df168af422", size = 4967344, upload-time = "2025-10-15T23:17:28.06Z" }, - { url = "https://files.pythonhosted.org/packages/ca/e3/8643d077c53868b681af077edf6b3cb58288b5423610f21c62aadcbe99f4/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:6f61efb26e76c45c4a227835ddeae96d83624fb0d29eb5df5b96e14ed1a0afb7", size = 4466564, upload-time = "2025-10-15T23:17:29.665Z" }, - { url = "https://files.pythonhosted.org/packages/0e/43/c1e8726fa59c236ff477ff2b5dc071e54b21e5a1e51aa2cee1676f1c986f/cryptography-46.0.3-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:23b1a8f26e43f47ceb6d6a43115f33a5a37d57df4ea0ca295b780ae8546e8044", size = 4292415, upload-time = "2025-10-15T23:17:31.686Z" }, - { url = "https://files.pythonhosted.org/packages/42/f9/2f8fefdb1aee8a8e3256a0568cffc4e6d517b256a2fe97a029b3f1b9fe7e/cryptography-46.0.3-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:b419ae593c86b87014b9be7396b385491ad7f320bde96826d0dd174459e54665", size = 4931457, upload-time = "2025-10-15T23:17:33.478Z" }, - { url = "https://files.pythonhosted.org/packages/79/30/9b54127a9a778ccd6d27c3da7563e9f2d341826075ceab89ae3b41bf5be2/cryptography-46.0.3-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:50fc3343ac490c6b08c0cf0d704e881d0d660be923fd3076db3e932007e726e3", size = 4466074, upload-time = "2025-10-15T23:17:35.158Z" }, - { url = 
"https://files.pythonhosted.org/packages/ac/68/b4f4a10928e26c941b1b6a179143af9f4d27d88fe84a6a3c53592d2e76bf/cryptography-46.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:22d7e97932f511d6b0b04f2bfd818d73dcd5928db509460aaf48384778eb6d20", size = 4420569, upload-time = "2025-10-15T23:17:37.188Z" }, - { url = "https://files.pythonhosted.org/packages/a3/49/3746dab4c0d1979888f125226357d3262a6dd40e114ac29e3d2abdf1ec55/cryptography-46.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d55f3dffadd674514ad19451161118fd010988540cee43d8bc20675e775925de", size = 4681941, upload-time = "2025-10-15T23:17:39.236Z" }, - { url = "https://files.pythonhosted.org/packages/fd/30/27654c1dbaf7e4a3531fa1fc77986d04aefa4d6d78259a62c9dc13d7ad36/cryptography-46.0.3-cp314-cp314t-win32.whl", hash = "sha256:8a6e050cb6164d3f830453754094c086ff2d0b2f3a897a1d9820f6139a1f0914", size = 3022339, upload-time = "2025-10-15T23:17:40.888Z" }, - { url = "https://files.pythonhosted.org/packages/f6/30/640f34ccd4d2a1bc88367b54b926b781b5a018d65f404d409aba76a84b1c/cryptography-46.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:760f83faa07f8b64e9c33fc963d790a2edb24efb479e3520c14a45741cd9b2db", size = 3494315, upload-time = "2025-10-15T23:17:42.769Z" }, - { url = "https://files.pythonhosted.org/packages/ba/8b/88cc7e3bd0a8e7b861f26981f7b820e1f46aa9d26cc482d0feba0ecb4919/cryptography-46.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:516ea134e703e9fe26bcd1277a4b59ad30586ea90c365a87781d7887a646fe21", size = 2919331, upload-time = "2025-10-15T23:17:44.468Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/45fe7f376a7df8daf6da3556603b36f53475a99ce4faacb6ba2cf3d82021/cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936", size = 7218248, upload-time = "2025-10-15T23:17:46.294Z" }, - { url = 
"https://files.pythonhosted.org/packages/27/32/b68d27471372737054cbd34c84981f9edbc24fe67ca225d389799614e27f/cryptography-46.0.3-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683", size = 4294089, upload-time = "2025-10-15T23:17:48.269Z" }, - { url = "https://files.pythonhosted.org/packages/26/42/fa8389d4478368743e24e61eea78846a0006caffaf72ea24a15159215a14/cryptography-46.0.3-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d", size = 4440029, upload-time = "2025-10-15T23:17:49.837Z" }, - { url = "https://files.pythonhosted.org/packages/5f/eb/f483db0ec5ac040824f269e93dd2bd8a21ecd1027e77ad7bdf6914f2fd80/cryptography-46.0.3-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0", size = 4297222, upload-time = "2025-10-15T23:17:51.357Z" }, - { url = "https://files.pythonhosted.org/packages/fd/cf/da9502c4e1912cb1da3807ea3618a6829bee8207456fbbeebc361ec38ba3/cryptography-46.0.3-cp38-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc", size = 4012280, upload-time = "2025-10-15T23:17:52.964Z" }, - { url = "https://files.pythonhosted.org/packages/6b/8f/9adb86b93330e0df8b3dcf03eae67c33ba89958fc2e03862ef1ac2b42465/cryptography-46.0.3-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3", size = 4978958, upload-time = "2025-10-15T23:17:54.965Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a0/5fa77988289c34bdb9f913f5606ecc9ada1adb5ae870bd0d1054a7021cc4/cryptography-46.0.3-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971", size = 4473714, upload-time = "2025-10-15T23:17:56.754Z" }, - { url = 
"https://files.pythonhosted.org/packages/14/e5/fc82d72a58d41c393697aa18c9abe5ae1214ff6f2a5c18ac470f92777895/cryptography-46.0.3-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac", size = 4296970, upload-time = "2025-10-15T23:17:58.588Z" }, - { url = "https://files.pythonhosted.org/packages/78/06/5663ed35438d0b09056973994f1aec467492b33bd31da36e468b01ec1097/cryptography-46.0.3-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04", size = 4940236, upload-time = "2025-10-15T23:18:00.897Z" }, - { url = "https://files.pythonhosted.org/packages/fc/59/873633f3f2dcd8a053b8dd1d38f783043b5fce589c0f6988bf55ef57e43e/cryptography-46.0.3-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506", size = 4472642, upload-time = "2025-10-15T23:18:02.749Z" }, - { url = "https://files.pythonhosted.org/packages/3d/39/8e71f3930e40f6877737d6f69248cf74d4e34b886a3967d32f919cc50d3b/cryptography-46.0.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963", size = 4423126, upload-time = "2025-10-15T23:18:04.85Z" }, - { url = "https://files.pythonhosted.org/packages/cd/c7/f65027c2810e14c3e7268353b1681932b87e5a48e65505d8cc17c99e36ae/cryptography-46.0.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4", size = 4686573, upload-time = "2025-10-15T23:18:06.908Z" }, - { url = "https://files.pythonhosted.org/packages/0a/6e/1c8331ddf91ca4730ab3086a0f1be19c65510a33b5a441cb334e7a2d2560/cryptography-46.0.3-cp38-abi3-win32.whl", hash = "sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df", size = 3036695, upload-time = "2025-10-15T23:18:08.672Z" }, - { url = 
"https://files.pythonhosted.org/packages/90/45/b0d691df20633eff80955a0fc7695ff9051ffce8b69741444bd9ed7bd0db/cryptography-46.0.3-cp38-abi3-win_amd64.whl", hash = "sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f", size = 3501720, upload-time = "2025-10-15T23:18:10.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, -] +sdist = { url = "https://files.pythonhosted.org/packages/12/45/870e7f4bef50e5f53b9f51d4428aee5290eedf58ba443f16b1ebb7ab8e66/cryptography-48.0.1.tar.gz", hash = "sha256:266f4ee051abb2f725b74ef8072b521ce1feacf685a3364fa6a6b45548db791a", size = 832989, upload-time = "2026-06-09T22:32:31.8Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/bc/ee4137cbbe105652c0ee4252792b78fc8e7afa4b8e61d9d5dc05a7f45731/cryptography-48.0.1-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:3e4a1a3232eef2e6c732827d5722db29a0cc8b27af2a4d865b094cf954be9ca1", size = 8008324, upload-time = "2026-06-09T22:31:00.702Z" }, + { url = "https://files.pythonhosted.org/packages/d5/85/6379d42181bfc713094f081360fc5784d6c816b599d45e7f082502d173ce/cryptography-48.0.1-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:32143b24adb918f078134e1e230f1eb8cc04886b92c28b5f0041aaf3e5699225", size = 4696243, upload-time = "2026-06-09T22:32:33.446Z" }, + { url = "https://files.pythonhosted.org/packages/9c/87/c85d147b53323c7eb4d850920c8901377323c2a0ff8d79c262d4fee89aa2/cryptography-48.0.1-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0d27a5696721ef7a672b8c810f6aded391058e0b9486e63e6d93baf765da691", size = 4713235, upload-time = "2026-06-09T22:31:40.141Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/58/67cbf8cf1ee7c54b439ca07bbecf8362c07afc11a3724fea70f745784add/cryptography-48.0.1-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eb86ce1af36fe65041b6db9a8bb064ee621a7e5fded0f80d475ec243477cd242", size = 4702323, upload-time = "2026-06-09T22:31:42.191Z" }, + { url = "https://files.pythonhosted.org/packages/89/c6/24266ac10c47f6cd2a865f4446062b466da1d1f10b27189eac00e61bf0c9/cryptography-48.0.1-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:b024e784ad6c077ee0147b35ea9cbfc1e34e1fd4c1dcca214c2794d73a12df08", size = 5300085, upload-time = "2026-06-09T22:31:58.703Z" }, + { url = "https://files.pythonhosted.org/packages/d2/bb/cc4b78784f97efc8c5874c2a9743708d172be6663024b34a0467885ae0c8/cryptography-48.0.1-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3752f2dbc8f07a30aad2932c986cea495b03bb554887828225da104f732852b6", size = 4746137, upload-time = "2026-06-09T22:31:31.01Z" }, + { url = "https://files.pythonhosted.org/packages/1f/52/0c44de3f5267f8fbe8e835138017522a333436166e406f0db9b9e6e3033f/cryptography-48.0.1-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:bd81490cd5801d755cf97bb68ac191f14b708470b1c7cf4580f669b9c9264cd8", size = 4333867, upload-time = "2026-06-09T22:32:28.096Z" }, + { url = "https://files.pythonhosted.org/packages/9a/2e/772d7adbfa931537bc401640b7cac9976bff689bda187833e5d63b428e49/cryptography-48.0.1-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:66fd0771e7b9c6dcd44cf1120690d2338d16d72795cf40cae2786a39eba65429", size = 4701805, upload-time = "2026-06-09T22:31:38.284Z" }, + { url = "https://files.pythonhosted.org/packages/f8/a3/b06844f303873493c963caf581c04df31c7035e0c1b0f02c4814d319ec80/cryptography-48.0.1-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:3fd2ca57062b241c856670b073487d2e86c4637937ca5601e48f97bf8e11fc8f", size = 5258461, upload-time = "2026-06-09T22:31:04.187Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/13/8b765e2e12b07c74941caadb9d1c8fdc006c4dfbf2b8f2d610519758954d/cryptography-48.0.1-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:0ee6ea481db1ab889cba043ec1eda17bb9c1ea79db6722f779c3667f9f70322f", size = 4745488, upload-time = "2026-06-09T22:32:30.07Z" }, + { url = "https://files.pythonhosted.org/packages/2e/aa/48972bce55049b32a94f4907eda4d75fa385aad8a39506cc2fc72196ecf0/cryptography-48.0.1-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:f2ceef93cb096aa3c4cc4b5c94ca6131f9196d28c64d6111533402a9b2054d41", size = 4830256, upload-time = "2026-06-09T22:31:43.868Z" }, + { url = "https://files.pythonhosted.org/packages/47/a2/e5079a032fb85cf6005046ca92bbd78b0c82dad2b5751ab8c311659da06f/cryptography-48.0.1-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9bd3f92d76217892b15df84ca256c2c113d386fdda7a7d8691aeeced976507c6", size = 4979117, upload-time = "2026-06-09T22:31:05.845Z" }, + { url = "https://files.pythonhosted.org/packages/b7/a0/8f50cae9c74e718ed769d63ed5c74bd0ea830c9550a74629cebd1b9c7bc7/cryptography-48.0.1-cp311-abi3-win32.whl", hash = "sha256:b9a32b876490d66c8bcc9963ef220199569748434ab01a9d6aaeabf88e7f5158", size = 3304154, upload-time = "2026-06-09T22:32:16.845Z" }, + { url = "https://files.pythonhosted.org/packages/c5/69/0572c77dbace6fef72f33755bd52ea399c71367250d366237f8691826b9e/cryptography-48.0.1-cp311-abi3-win_amd64.whl", hash = "sha256:39489bfca54c7a1f6b297efcd8bc608ab92d16c4ca631b0cad4da46724588b24", size = 3817138, upload-time = "2026-06-09T22:32:00.388Z" }, + { url = "https://files.pythonhosted.org/packages/42/06/3e768b4c3bc78201583fa35a0e18f640dd782ff41afba88f8545481a8874/cryptography-48.0.1-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:f817adc181390bd54f2f700107a7419040fb7c1bdf2fc26f36551a06a68c3345", size = 7989830, upload-time = "2026-06-09T22:31:07.8Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/13/6476736484b94041110c8340a3eb63962fea4975baea8cb4a512adb44d4d/cryptography-48.0.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d5d30989c6917b478b5817902e85fddaea2261efa8648383d965381ccb9e1ac4", size = 4689201, upload-time = "2026-06-09T22:31:09.745Z" }, + { url = "https://files.pythonhosted.org/packages/79/62/65a87f34d2a431546e2509b85d55e8c90df86d668f6731da64d538512ac2/cryptography-48.0.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:df637c05205ea7c1d7fbcbe54bbfea648a52951155f997af13d895d0ecc96991", size = 4702822, upload-time = "2026-06-09T22:32:24.409Z" }, + { url = "https://files.pythonhosted.org/packages/7f/59/810b5204b0a9b10f4b6bc06bd551a8b609803cd931806bc3b71884b225e5/cryptography-48.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:869c3b8a53bfe27147832df48b32adadf558249d50e76cb3769d40e986b13265", size = 4694875, upload-time = "2026-06-09T22:32:08.737Z" }, + { url = "https://files.pythonhosted.org/packages/24/dc/d8ca05ffea724eec6d232ea6f18e74c269eb6bdfdcc9bfba689790d1325f/cryptography-48.0.1-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:e361afba8918070d376df76f408a4f67fec0ee9cff81a99e48fe9a233ef59e17", size = 5290385, upload-time = "2026-06-09T22:31:15.212Z" }, + { url = "https://files.pythonhosted.org/packages/03/8c/3be6cb4da181f5bb6c19cf560c2359d60644a6b5fc5b57854e528f47b296/cryptography-48.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:d069066deead00ac7f090be101be875a06855908f7ec004c27b8fefb4acfb411", size = 4737082, upload-time = "2026-06-09T22:32:22.66Z" }, + { url = "https://files.pythonhosted.org/packages/aa/f6/d5f60a5a1434dbfd949e227fd0065d194c7e6b6ac526b17f5c06152b8231/cryptography-48.0.1-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:09f73a725d582cef64b91281a322cd798d14a33b2b6f2b7ad9531dc336d84c02", size = 4325328, upload-time = "2026-06-09T22:32:10.777Z" }, + { url = 
"https://files.pythonhosted.org/packages/17/b7/ba75dd947a14b6ad907b01ae8f6b5b348cdd1b48142f0063dee9e20c1d9d/cryptography-48.0.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:15254441469dd6bf027039453288e2072124f8b6603563f5d759e1c9b69273fa", size = 4694530, upload-time = "2026-06-09T22:31:53.105Z" }, + { url = "https://files.pythonhosted.org/packages/62/29/50d6b9e8aff12d8b67afaeb3569335e32dc83a5723e3bbded24fdac9f809/cryptography-48.0.1-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:8ace4507d1e6533c125f4fac754f8bb8b6a74c08e92179dabd7e16571a3efbf3", size = 5245046, upload-time = "2026-06-09T22:31:25.774Z" }, + { url = "https://files.pythonhosted.org/packages/9f/04/618f4115cfc0add0838c82507aa18a346089428da8653ad38b3ff36f5cb3/cryptography-48.0.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:b4e391975f038e66432328639620a4aff2d307513b004f1ca06d6225bced815c", size = 4736660, upload-time = "2026-06-09T22:32:12.676Z" }, + { url = "https://files.pythonhosted.org/packages/24/9c/06e062462a0de28a3b3911322eded4c16deb9f441b1b7575d3dc59488ab5/cryptography-48.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:42fcd8e26fe555d9b3577a135f5091fefa0aa4e99129c23fb56787a1bd4ada72", size = 4822229, upload-time = "2026-06-09T22:31:17.062Z" }, + { url = "https://files.pythonhosted.org/packages/f4/be/0561971eaaee4b8a0e7d5113c536921063ab91aaf23278ac374eaf881e11/cryptography-48.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c1400da5e32a43253392277eac7490a60e497d810a63dd5608d71bbd7af507c9", size = 4966364, upload-time = "2026-06-09T22:31:32.842Z" }, + { url = "https://files.pythonhosted.org/packages/a4/27/728c77876f12b000820b69ae490f3c4083775e79e07827e9e60be07ad209/cryptography-48.0.1-cp314-cp314t-win32.whl", hash = "sha256:0df56b056bc17c1b7d6821dfa65216e62bd232d8ab05eb3db44e71d235651471", size = 3278498, upload-time = "2026-06-09T22:31:29.154Z" }, + { url = 
"https://files.pythonhosted.org/packages/06/e3/79a612c6d7b1e6ee0edd43633d53035bec2cfb78c82b76f7864f39e36f34/cryptography-48.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:9de21387aa95e2a895823d0745b430bed4f33503ba9ab5e0b5311f33e37d66d2", size = 3798790, upload-time = "2026-06-09T22:31:56.697Z" }, + { url = "https://files.pythonhosted.org/packages/ca/6c/00fa2a95997164c8b2072ce327c23d4ab20809ccc323ea5fab91e53a4bba/cryptography-48.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:4fdc69f8e4316bcf0c8c8ec1f26f285d12e8142d88d96c876a59a03be3f6ae67", size = 7987408, upload-time = "2026-06-09T22:32:20.777Z" }, + { url = "https://files.pythonhosted.org/packages/b0/d9/45f309a7e4e5f3f8f121d6d3be9e94024a7726ec598d6e08ae04edb2f04d/cryptography-48.0.1-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48fe40804d4caa2288f24e70ca8c64c42dd826da0ad7e4f1b41b2128d679e6c8", size = 4690196, upload-time = "2026-06-09T22:31:54.74Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9f/a1bc8bcc798811b8527eb374bbccf30a3f3e806829d967118222bf1125eb/cryptography-48.0.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:86be3b1b0b6bf09482fb50a979c508d2950ed95f5621ec77f4e385962006b83a", size = 4696782, upload-time = "2026-06-09T22:31:45.615Z" }, + { url = "https://files.pythonhosted.org/packages/66/c2/81a4fb4e4373c500bb526bc337ac5719dd31dd15b970b84a238168c6aa08/cryptography-48.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab0a343c807bbcd90c971cd1ecf072937cd01847a9e002bef88fb47ac6be577", size = 4696618, upload-time = "2026-06-09T22:31:11.564Z" }, + { url = "https://files.pythonhosted.org/packages/e5/0b/aa68b221dde92d09cb29a024ede17550ee21e77a404e59fc093c82bb51e1/cryptography-48.0.1-cp39-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:9621de99d2da096006b629979efd8ae7eb2d8b822488d0c89ee4000c306c59b1", size = 5289970, upload-time = "2026-06-09T22:31:20.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/78/13/fba657f958d2af66ea959a4ba01212632089249d34af1ae48054136344d7/cryptography-48.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:88c852a0ae366e262e5a1744b685e6a433dc8788dd2a277e418bf4904203609d", size = 4731873, upload-time = "2026-06-09T22:31:22.253Z" }, + { url = "https://files.pythonhosted.org/packages/4c/4c/9a964756d24a26b3e34dfcb16f961b89838786e6700b635b0d1e3adff4b6/cryptography-48.0.1-cp39-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:43c5835e2cb98c8733d86f57d6fc879b613f5c3478607281c3e36daffc6dd8a6", size = 4330804, upload-time = "2026-06-09T22:31:36.56Z" }, + { url = "https://files.pythonhosted.org/packages/4b/0f/a10f3a6eb12950a10e3a874070283aa2dd5875b2bfd15fad8a3e17b3f13e/cryptography-48.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:fe0180af5bf9236518a087e35bf2d9a347d5f5f51e63c579d683ddff424e3d46", size = 4696217, upload-time = "2026-06-09T22:31:13.351Z" }, + { url = "https://files.pythonhosted.org/packages/f3/6f/5cd12f951165ea73ef85266775d97e4c763b2474ccfd816dd69d3a18d6f8/cryptography-48.0.1-cp39-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:b7a2d1a937a738a881737cec135a38bb61470589b17515b9f73f571d0ae10401", size = 5245252, upload-time = "2026-06-09T22:32:02.193Z" }, + { url = "https://files.pythonhosted.org/packages/68/ab/8aaa12e4516ec4464033ab79b6f3b592bd5a92102467c4ace8a0d970203f/cryptography-48.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:b74ca3b8e5ecdd833bf6a002ca41b4793bb27fb8f1c06ffaf2643c9e9140e31b", size = 4731388, upload-time = "2026-06-09T22:32:04.019Z" }, + { url = "https://files.pythonhosted.org/packages/1b/24/50027ea4dca85ec1f40688f3c24fb32ccacd520583c9592c3cc95628e6fb/cryptography-48.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2c37f2461406063b417837f5f3daab668652acd82423efcd7f0a9f04be972de1", size = 4824186, upload-time = "2026-06-09T22:32:18.707Z" }, + { url = 
"https://files.pythonhosted.org/packages/52/41/04cb5eb17085ade6f50cc611fb657df6a0f5885350de8764ece89c050197/cryptography-48.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86fe77abb1bd87afb251d4d02ada7ecf53a32cee9b67d976abb2e45a13297475", size = 4964539, upload-time = "2026-06-09T22:31:18.793Z" }, + { url = "https://files.pythonhosted.org/packages/36/bf/ed70785c496e89d7e73b7cda2d21f2447fd6d4e821714b8d04ff217fed92/cryptography-48.0.1-cp39-abi3-win32.whl", hash = "sha256:6b2c0c3e6ccf3ade7750f836ef3ee36eea250cc467d45c256895573ac08cc6f1", size = 3282307, upload-time = "2026-06-09T22:30:53.162Z" }, + { url = "https://files.pythonhosted.org/packages/b3/ff/371ea7d252656ee1eb6d83eeeef3d1d0c6baf1d6497687d081ea03814670/cryptography-48.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:9a49ca6c81417f6a5edb50375a60cccdd70fa0a91a5211829dbea74eba94d2ac", size = 3793408, upload-time = "2026-06-09T22:32:15.191Z" }, +] + +[[package]] +name = "csscompressor" +version = "0.9.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/2a/8c3ac3d8bc94e6de8d7ae270bb5bc437b210bb9d6d9e46630c98f4abd20c/csscompressor-0.9.5.tar.gz", hash = "sha256:afa22badbcf3120a4f392e4d22f9fff485c044a1feda4a950ecc5eba9dd31a05", size = 237808, upload-time = "2017-11-26T21:13:08.238Z" } [[package]] name = "cuda-bindings" -version = "12.9.4" +version = "13.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cuda-pathfinder" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" }, - { url = 
"https://files.pythonhosted.org/packages/63/56/e465c31dc9111be3441a9ba7df1941fe98f4aa6e71e8788a3fb4534ce24d/cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:32bdc5a76906be4c61eb98f546a6786c5773a881f3b166486449b5d141e4a39f", size = 11906628, upload-time = "2025-10-21T14:51:49.905Z" }, - { url = "https://files.pythonhosted.org/packages/a3/84/1e6be415e37478070aeeee5884c2022713c1ecc735e6d82d744de0252eee/cuda_bindings-12.9.4-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56e0043c457a99ac473ddc926fe0dc4046694d99caef633e92601ab52cbe17eb", size = 11925991, upload-time = "2025-10-21T14:51:56.535Z" }, - { url = "https://files.pythonhosted.org/packages/d1/af/6dfd8f2ed90b1d4719bc053ff8940e494640fe4212dc3dd72f383e4992da/cuda_bindings-12.9.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b72ee72a9cc1b531db31eebaaee5c69a8ec3500e32c6933f2d3b15297b53686", size = 11922703, upload-time = "2025-10-21T14:52:03.585Z" }, - { url = "https://files.pythonhosted.org/packages/6c/19/90ac264acc00f6df8a49378eedec9fd2db3061bf9263bf9f39fd3d8377c3/cuda_bindings-12.9.4-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d80bffc357df9988dca279734bc9674c3934a654cab10cadeed27ce17d8635ee", size = 11924658, upload-time = "2025-10-21T14:52:10.411Z" }, + { url = "https://files.pythonhosted.org/packages/ce/67/5e7dba1ba576dd73da5dee894ca076ca5e959450dfff66d6d510a255d1f7/cuda_bindings-13.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7855c4868aabc0cfae28abbe83d56734bdfbd08f08fc234ac1912a12858bf49", size = 6025351, upload-time = "2026-05-29T23:11:49.685Z" }, + { url = "https://files.pythonhosted.org/packages/39/2a/6d2e9047d1fb243dbaa364b01e0297534b9ed7fd27dba1c9f361519cf69b/cuda_bindings-13.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e32d08f71ebcdf00f0f41eab2eb37e8da94c8ed411cc9f7f7a019ce6b34abe3a", 
size = 6657965, upload-time = "2026-05-29T23:11:52.227Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6e/2394f8163360f8391f8f1b7e72d300a82724edb81a7b7084c799fbd4c91f/cuda_bindings-13.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9efb21c1ee64981e184b9e0ba5eb3179e5ba3d4b51665a6cb52b8ef3d01a7cbf", size = 5920504, upload-time = "2026-05-29T23:11:56.883Z" }, + { url = "https://files.pythonhosted.org/packages/34/c2/ef9b6a63f7dc432712a462c816662e662e00d38caa9b861c8c2588195d03/cuda_bindings-13.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2732904099e0a4d4db774a5fc6d91ee95fae065b4d2ecabb4968c5fe2406c9d7", size = 6476660, upload-time = "2026-05-29T23:11:59.188Z" }, + { url = "https://files.pythonhosted.org/packages/b1/81/bff68ce829999c1e4209c761bbf903b1c06ec570416ddb25020864ad5907/cuda_bindings-13.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ab2f74ed65bfef4163ba07a8db16f1085e0729291db12a2423aff84ee8278b8", size = 6013639, upload-time = "2026-05-29T23:12:03.509Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e0/c8a1f0c8f9ffdea4f5fe6dbab89b326cef4d85caf489dad39e209da89416/cuda_bindings-13.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:efd4c814d311ec08c981f6dded1dbe7d4b371067ee4f6c14cccec4bde9590f80", size = 6534419, upload-time = "2026-05-29T23:12:05.633Z" }, + { url = "https://files.pythonhosted.org/packages/52/b8/83b1f563925b290f2d11a01a77a84013ba56052fe3653a5bef3ccfbb43d6/cuda_bindings-13.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3c772dfff49681541d59630c90f858e173ac926b9c593a2b7123f2a1043cc76", size = 5809771, upload-time = "2026-05-29T23:12:10.422Z" }, + { url = "https://files.pythonhosted.org/packages/12/20/e79b4bfe98f075195afb6343d41c498f9dbd2d161d7021d4d28bceb83581/cuda_bindings-13.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:36febb7c1079d68a981dbbd8d5a67235b399802b82075c9388624719607e52b9", size = 6358584, upload-time = "2026-05-29T23:12:12.767Z" }, ] [[package]] name = "cuda-pathfinder" -version = "1.3.3" +version = "1.5.5" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/02/4dbe7568a42e46582248942f54dc64ad094769532adbe21e525e4edf7bc4/cuda_pathfinder-1.3.3-py3-none-any.whl", hash = "sha256:9984b664e404f7c134954a771be8775dfd6180ea1e1aef4a5a37d4be05d9bbb1", size = 27154, upload-time = "2025-12-04T22:35:08.996Z" }, + { url = "https://files.pythonhosted.org/packages/11/c8/26f2e4aae92f11522a96043892ba39a90eac610d5242523aa863212bc1c7/cuda_pathfinder-1.5.5-py3-none-any.whl", hash = "sha256:0228c023f95d1480f143ef5c8922d27a2ab052087a942e81dc289c9eb8f91689", size = 51671, upload-time = "2026-05-27T01:21:25.413Z" }, +] + +[[package]] +name = "cuda-toolkit" +version = "13.0.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/b2/453099f5f3b698d7d0eab38916aac44c7f76229f451709e2eb9db6615dcd/cuda_toolkit-13.0.2-py2.py3-none-any.whl", hash = "sha256:b198824cf2f54003f50d64ada3a0f184b42ca0846c1c94192fa269ecd97a66eb", size = 2364, upload-time = "2025-12-19T23:24:07.328Z" }, +] + +[package.optional-dependencies] +cudart = [ + { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cufft = [ + { name = "nvidia-cufft", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cufile = [ + { name = "nvidia-cufile", marker = "sys_platform == 'linux'" }, +] +cupti = [ + { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +curand = [ + { name = "nvidia-curand", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cusolver = [ + { name = "nvidia-cusolver", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cusparse = [ + { name = 
"nvidia-cusparse", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvjitlink = [ + { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvrtc = [ + { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvtx = [ + { name = "nvidia-nvtx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] [[package]] @@ -804,15 +949,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, ] -[[package]] -name = "diskcache" -version = "5.6.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, -] - [[package]] name = "distlib" version = "0.4.0" @@ -933,24 +1069,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/47/21867c2e5fd006c8d36a560df9e32cb4f1f566b20c5dd41f5f8a2124f7de/face-24.0.0-py3-none-any.whl", hash = "sha256:0e2c17b426fa4639a4e77d1de9580f74a98f4869ba4c7c8c175b810611622cd3", size = 54742, upload-time = "2024-11-02T05:24:24.939Z" }, ] -[[package]] -name = "fakeredis" -version = "2.33.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "redis" }, - { name = "sortedcontainers" }, -] 
-sdist = { url = "https://files.pythonhosted.org/packages/5f/f9/57464119936414d60697fcbd32f38909bb5688b616ae13de6e98384433e0/fakeredis-2.33.0.tar.gz", hash = "sha256:d7bc9a69d21df108a6451bbffee23b3eba432c21a654afc7ff2d295428ec5770", size = 175187, upload-time = "2025-12-16T19:45:52.269Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/78/a850fed8aeef96d4a99043c90b818b2ed5419cd5b24a4049fd7cfb9f1471/fakeredis-2.33.0-py3-none-any.whl", hash = "sha256:de535f3f9ccde1c56672ab2fdd6a8efbc4f2619fc2f1acc87b8737177d71c965", size = 119605, upload-time = "2025-12-16T19:45:51.08Z" }, -] - -[package.optional-dependencies] -lua = [ - { name = "lupa" }, -] - [[package]] name = "fastavro" version = "1.12.1" @@ -988,32 +1106,63 @@ wheels = [ [[package]] name = "fastmcp" -version = "2.14.4" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastmcp-slim", extra = ["client", "server"] }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3b/a9/5c5a01b6abd5346bf60b97cfd29e4a86661940c27dd562bfcda07fd03519/fastmcp-3.3.1.tar.gz", hash = "sha256:979362ea557de42a5f40342563c7e4b236bcc8e7cd192715f50030695d1a71cd", size = 28681699, upload-time = "2026-05-15T15:50:39.673Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/11/6b1bdada6ccfe647d615ae63f9106f8136aec17971e9361546af01c7d38e/fastmcp-3.3.1-py3-none-any.whl", hash = "sha256:862440c5c4d281363a5995eee59d77f0f7cac1f18869038729cecf03b02fc522", size = 7903, upload-time = "2026-05-15T15:50:36.424Z" }, +] + +[[package]] +name = "fastmcp-slim" +version = "3.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "platformdirs" }, + { name = "pydantic", extra = ["email"] }, + { name = "pydantic-settings" }, + { name = "python-dotenv" }, + { name = "rich" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/d1/a0/627103e517e1d0d6f1eec633d5662d13e776f01b45ad188e4f5f7478b438/fastmcp_slim-3.3.1.tar.gz", hash = "sha256:0957835fc59452e143ab2f4b7836d2d2df9b2d9958408edc79ba8b56232b2a88", size = 567007, upload-time = "2026-05-15T15:50:10.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/ee/97047f4cc2d7b1d46670d08d8ad01a96e7a748cc01c0b4b351ad8eddbc7a/fastmcp_slim-3.3.1-py3-none-any.whl", hash = "sha256:6cf1c2d77e3adb0d409d6825ed6b0b2a999062973e00b8eea03bd48bf9b4c043", size = 738644, upload-time = "2026-05-15T15:50:08.336Z" }, +] + +[package.optional-dependencies] +client = [ + { name = "authlib" }, + { name = "exceptiongroup" }, + { name = "httpx" }, + { name = "mcp" }, + { name = "opentelemetry-api" }, + { name = "py-key-value-aio", extra = ["filetree", "keyring", "memory"] }, +] +server = [ { name = "authlib" }, { name = "cyclopts" }, { name = "exceptiongroup" }, + { name = "griffelib" }, { name = "httpx" }, { name = "jsonref" }, { name = "jsonschema-path" }, { name = "mcp" }, { name = "openapi-pydantic" }, + { name = "opentelemetry-api" }, { name = "packaging" }, - { name = "platformdirs" }, - { name = "py-key-value-aio", extra = ["disk", "keyring", "memory"] }, - { name = "pydantic", extra = ["email"] }, - { name = "pydocket" }, + { name = "py-key-value-aio", extra = ["filetree", "keyring", "memory"] }, { name = "pyperclip" }, - { name = "python-dotenv" }, - { name = "rich" }, + { name = "python-multipart" }, + { name = "pyyaml" }, + { name = "uncalled-for" }, { name = "uvicorn" }, + { name = "watchfiles" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fd/a9/a57d5e5629ebd4ef82b495a7f8e346ce29ef80cc86b15c8c40570701b94d/fastmcp-2.14.4.tar.gz", hash = "sha256:c01f19845c2adda0a70d59525c9193be64a6383014c8d40ce63345ac664053ff", size = 8302239, upload-time = "2026-01-22T17:29:37.024Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/3e/41/c4d407e2218fd60d84acb6cc5131d28ff876afecf325e3fd9d27b8318581/fastmcp-2.14.4-py3-none-any.whl", hash = "sha256:5858cff5e4c8ea8107f9bca2609d71d6256e0fce74495912f6e51625e466c49a", size = 417788, upload-time = "2026-01-22T17:29:35.159Z" }, -] [[package]] name = "filelock" @@ -1124,15 +1273,27 @@ wheels = [ [[package]] name = "genai-prices" -version = "0.0.51" +version = "0.0.61" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, { name = "pydantic" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/22/427934ef8e7ed29c35afc274666b87fe01a3a27ec7ff102f5839ce4723c0/genai_prices-0.0.51.tar.gz", hash = "sha256:003da98172641c94d7516b0fd8cec5ecf2dbab64a884996c26cc194c5e0b592e", size = 58071, upload-time = "2026-01-13T12:49:11.872Z" } +sdist = { url = "https://files.pythonhosted.org/packages/65/71/0c76010eec75f4b3623d521044785c0977c14adabe1cac72b004349567fb/genai_prices-0.0.61.tar.gz", hash = "sha256:4b3bcfd49f174c05831b09f9ee36557d3648569e2f594af6c24b72031b3f0e52", size = 67806, upload-time = "2026-05-19T17:01:36.902Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4a/af/b11b80d02aaefc2fc6bfaabb3ae873439c90dc464b3a29eda51b969842b0/genai_prices-0.0.51-py3-none-any.whl", hash = "sha256:4e0f5892a7ec757d59f343c5dbf9675b0f9e8ed65f4fe26ac7df600e34788ca0", size = 60656, upload-time = "2026-01-13T12:49:12.867Z" }, + { url = "https://files.pythonhosted.org/packages/de/ec/b08dc2e834ca00fd8dfedcb17ae2e920667adaad617b45e32b7a3b146f24/genai_prices-0.0.61-py3-none-any.whl", hash = "sha256:d77142f61c13e69909ac19c8e44fd315fd65f3afd714e8d55e914fab0eaf47a2", size = 70853, upload-time = "2026-05-19T17:01:37.858Z" }, +] + +[[package]] +name = "ghp-import" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/d9/29/d40217cbe2f6b1359e00c6c307bb3fc876ba74068cbab3dde77f03ca0dc4/ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343", size = 10943, upload-time = "2022-05-02T15:47:16.11Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, ] [[package]] @@ -1151,15 +1312,15 @@ wheels = [ [[package]] name = "google-auth" -version = "2.47.0" +version = "2.53.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "cryptography" }, { name = "pyasn1-modules" }, - { name = "rsa" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/60/3c/ec64b9a275ca22fa1cd3b6e77fefcf837b0732c890aa32d2bd21313d9b33/google_auth-2.47.0.tar.gz", hash = "sha256:833229070a9dfee1a353ae9877dcd2dec069a8281a4e72e72f77d4a70ff945da", size = 323719, upload-time = "2026-01-06T21:55:31.045Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c6/ad/ff781329bbbdc0974a098d996e89c9e1f7024262f9e3eec442fbb9ad1ac6/google_auth-2.53.0.tar.gz", hash = "sha256:e7e6aa16f6bee7b2b264830fd04f08087a1d5a836df516251a5d15327b246c9c", size = 335844, upload-time = "2026-05-15T20:53:07.928Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/db/18/79e9008530b79527e0d5f79e7eef08d3b179b7f851cfd3a2f27822fbdfa9/google_auth-2.47.0-py3-none-any.whl", hash = "sha256:c516d68336bfde7cf0da26aab674a36fedcf04b37ac4edd59c597178760c3498", size = 234867, upload-time = "2026-01-06T21:55:28.6Z" }, + { url = "https://files.pythonhosted.org/packages/4a/c9/db44165ba7c581268c6d46017ef63339110378305062830104fc7fa144cb/google_auth-2.53.0-py3-none-any.whl", hash = "sha256:6e7449917c599b35126a99ec268ec6880301f2fea41dce198fe8fd83ff642b68", size = 246071, 
upload-time = "2026-05-15T20:53:05.609Z" }, ] [package.optional-dependencies] @@ -1169,7 +1330,7 @@ requests = [ [[package]] name = "google-genai" -version = "1.60.0" +version = "2.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1183,9 +1344,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0a/3f/a753be0dcee352b7d63bc6d1ba14a72591d63b6391dac0cdff7ac168c530/google_genai-1.60.0.tar.gz", hash = "sha256:9768061775fddfaecfefb0d6d7a6cabefb3952ebd246cd5f65247151c07d33d1", size = 487721, upload-time = "2026-01-21T22:17:30.398Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/ec/6e49f50f5c70588d97c6ed25e0b8c18828bf4d58895f397b53a7522168a1/google_genai-2.6.0.tar.gz", hash = "sha256:7d4f777234002f2e94be499dbdfb43b506a6aca9dbbec13e61d3dc6ce640ffa7", size = 554809, upload-time = "2026-05-22T01:34:33.581Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/e5/384b1f383917b5f0ae92e28f47bc27b16e3d26cd9bacb25e9f8ecab3c8fe/google_genai-1.60.0-py3-none-any.whl", hash = "sha256:967338378ffecebec19a8ed90cf8797b26818bacbefd7846a9280beb1099f7f3", size = 719431, upload-time = "2026-01-21T22:17:28.086Z" }, + { url = "https://files.pythonhosted.org/packages/b3/9e/e8ba4e58a9d5daf42343f3ea1cb0efb721eba36a1d6624e9873d039a5c1e/google_genai-2.6.0-py3-none-any.whl", hash = "sha256:272b6f6320f5d355735241ad441f972af095ec80dc10cb075cb430d96721648a", size = 821003, upload-time = "2026-05-22T01:34:31.55Z" }, ] [[package]] @@ -1212,6 +1373,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" }, ] +[[package]] +name = "griffelib" +version = "2.0.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ad/06/eccbd311c9e2b3ca45dbc063b93134c57a1ccc7607c5e545264ad092c4a9/griffelib-2.0.0.tar.gz", hash = "sha256:e504d637a089f5cab9b5daf18f7645970509bf4f53eda8d79ed71cce8bd97934", size = 166312, upload-time = "2026-03-23T21:06:55.954Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/51/c936033e16d12b627ea334aaaaf42229c37620d0f15593456ab69ab48161/griffelib-2.0.0-py3-none-any.whl", hash = "sha256:01284878c966508b6d6f1dbff9b6fa607bc062d8261c5c7253cb285b06422a7f", size = 142004, upload-time = "2026-02-09T19:09:40.561Z" }, +] + [[package]] name = "groq" version = "1.0.0" @@ -1294,31 +1464,34 @@ wheels = [ [[package]] name = "hf-xet" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" }, - { url = "https://files.pythonhosted.org/packages/ea/40/e2e0a7eb9a51fe8828ba2d47fe22a7e74914ea8a0db68a18c3aa7449c767/hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813", size = 2717584, upload-time = "2025-10-24T19:04:09.586Z" }, - { url = "https://files.pythonhosted.org/packages/a5/7d/daf7f8bc4594fdd59a8a596f9e3886133fdc68e675292218a5e4c1b7e834/hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc", size = 3315004, upload-time = "2025-10-24T19:04:00.314Z" }, - { url = "https://files.pythonhosted.org/packages/b1/ba/45ea2f605fbf6d81c8b21e4d970b168b18a53515923010c312c06cd83164/hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5", size = 3222636, upload-time = "2025-10-24T19:03:58.111Z" }, - { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload-time = "2025-10-24T19:04:20.951Z" }, - { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload-time = "2025-10-24T19:04:22.549Z" }, - { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload-time = "2025-10-24T19:04:33.461Z" }, - { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" }, - { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" }, 
- { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" }, - { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" }, - { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" }, - { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" }, - { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" }, - { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" }, - { url = 
"https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" }, - { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" }, - { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" }, - { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" }, - { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" }, +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/09/08/23c84a26716382c89151b5b447b4beb19e3345f3a93d3b73009a71a57ad3/hf_xet-1.4.2.tar.gz", hash = "sha256:b7457b6b482d9e0743bd116363239b1fa904a5e65deede350fbc0c4ea67c71ea", size = 672357, upload-time = "2026-03-13T06:58:51.077Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/06/e8cf74c3c48e5485c7acc5a990d0d8516cdfb5fdf80f799174f1287cc1b5/hf_xet-1.4.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ac8202ae1e664b2c15cdfc7298cbb25e80301ae596d602ef7870099a126fcad4", size = 3796125, upload-time = "2026-03-13T06:58:33.177Z" }, + { url = "https://files.pythonhosted.org/packages/66/d4/b73ebab01cbf60777323b7de9ef05550790451eb5172a220d6b9845385ec/hf_xet-1.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6d2f8ee39fa9fba9af929f8c0d0482f8ee6e209179ad14a909b6ad78ffcb7c81", size = 3555985, upload-time = "2026-03-13T06:58:31.797Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e7/ded6d1bd041c3f2bca9e913a0091adfe32371988e047dd3a68a2463c15a2/hf_xet-1.4.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4642a6cf249c09da8c1f87fe50b24b2a3450b235bf8adb55700b52f0ea6e2eb6", size = 4212085, upload-time = "2026-03-13T06:58:24.323Z" }, + { url = "https://files.pythonhosted.org/packages/97/c1/a0a44d1f98934f7bdf17f7a915b934f9fca44bb826628c553589900f6df8/hf_xet-1.4.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:769431385e746c92dc05492dde6f687d304584b89c33d79def8367ace06cb555", size = 3988266, upload-time = "2026-03-13T06:58:22.887Z" }, + { url = "https://files.pythonhosted.org/packages/7a/82/be713b439060e7d1f1d93543c8053d4ef2fe7e6922c5b31642eaa26f3c4b/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c9dd1c1bc4cc56168f81939b0e05b4c36dd2d28c13dc1364b17af89aa0082496", size = 4188513, upload-time = "2026-03-13T06:58:40.858Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/a6/cbd4188b22abd80ebd0edbb2b3e87f2633e958983519980815fb8314eae5/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fca58a2ae4e6f6755cc971ac6fcdf777ea9284d7e540e350bb000813b9a3008d", size = 4428287, upload-time = "2026-03-13T06:58:42.601Z" }, + { url = "https://files.pythonhosted.org/packages/b2/4e/84e45b25e2e3e903ed3db68d7eafa96dae9a1d1f6d0e7fc85120347a852f/hf_xet-1.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:163aab46854ccae0ab6a786f8edecbbfbaa38fcaa0184db6feceebf7000c93c0", size = 3665574, upload-time = "2026-03-13T06:58:53.881Z" }, + { url = "https://files.pythonhosted.org/packages/ee/71/c5ac2b9a7ae39c14e91973035286e73911c31980fe44e7b1d03730c00adc/hf_xet-1.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:09b138422ecbe50fd0c84d4da5ff537d27d487d3607183cd10e3e53f05188e82", size = 3528760, upload-time = "2026-03-13T06:58:52.187Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0f/fcd2504015eab26358d8f0f232a1aed6b8d363a011adef83fe130bff88f7/hf_xet-1.4.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:949dcf88b484bb9d9276ca83f6599e4aa03d493c08fc168c124ad10b2e6f75d7", size = 3796493, upload-time = "2026-03-13T06:58:39.267Z" }, + { url = "https://files.pythonhosted.org/packages/82/56/19c25105ff81731ca6d55a188b5de2aa99d7a2644c7aa9de1810d5d3b726/hf_xet-1.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41659966020d59eb9559c57de2cde8128b706a26a64c60f0531fa2318f409418", size = 3555797, upload-time = "2026-03-13T06:58:37.546Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/8933c073186849b5e06762aa89847991d913d10a95d1603eb7f2c3834086/hf_xet-1.4.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c588e21d80010119458dd5d02a69093f0d115d84e3467efe71ffb2c67c19146", size = 4212127, upload-time = "2026-03-13T06:58:30.539Z" }, + { url = 
"https://files.pythonhosted.org/packages/eb/01/f89ebba4e369b4ed699dcb60d3152753870996f41c6d22d3d7cac01310e1/hf_xet-1.4.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a296744d771a8621ad1d50c098d7ab975d599800dae6d48528ba3944e5001ba0", size = 3987788, upload-time = "2026-03-13T06:58:29.139Z" }, + { url = "https://files.pythonhosted.org/packages/84/4d/8a53e5ffbc2cc33bbf755382ac1552c6d9af13f623ed125fe67cc3e6772f/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f563f7efe49588b7d0629d18d36f46d1658fe7e08dce3fa3d6526e1c98315e2d", size = 4188315, upload-time = "2026-03-13T06:58:48.017Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b8/b7a1c1b5592254bd67050632ebbc1b42cc48588bf4757cb03c2ef87e704a/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5b2e0132c56d7ee1bf55bdb638c4b62e7106f6ac74f0b786fed499d5548c5570", size = 4428306, upload-time = "2026-03-13T06:58:49.502Z" }, + { url = "https://files.pythonhosted.org/packages/a0/0c/40779e45b20e11c7c5821a94135e0207080d6b3d76e7b78ccb413c6f839b/hf_xet-1.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2f45c712c2fa1215713db10df6ac84b49d0e1c393465440e9cb1de73ecf7bbf6", size = 3665826, upload-time = "2026-03-13T06:58:59.88Z" }, + { url = "https://files.pythonhosted.org/packages/51/4c/e2688c8ad1760d7c30f7c429c79f35f825932581bc7c9ec811436d2f21a0/hf_xet-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:6d53df40616f7168abfccff100d232e9d460583b9d86fa4912c24845f192f2b8", size = 3529113, upload-time = "2026-03-13T06:58:58.491Z" }, + { url = "https://files.pythonhosted.org/packages/b4/86/b40b83a2ff03ef05c4478d2672b1fc2b9683ff870e2b25f4f3af240f2e7b/hf_xet-1.4.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:71f02d6e4cdd07f344f6844845d78518cc7186bd2bc52d37c3b73dc26a3b0bc5", size = 3800339, upload-time = "2026-03-13T06:58:36.245Z" }, + { url = 
"https://files.pythonhosted.org/packages/64/2e/af4475c32b4378b0e92a587adb1aa3ec53e3450fd3e5fe0372a874531c00/hf_xet-1.4.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e9b38d876e94d4bdcf650778d6ebbaa791dd28de08db9736c43faff06ede1b5a", size = 3559664, upload-time = "2026-03-13T06:58:34.787Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4c/781267da3188db679e601de18112021a5cb16506fe86b246e22c5401a9c4/hf_xet-1.4.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:77e8c180b7ef12d8a96739a4e1e558847002afe9ea63b6f6358b2271a8bdda1c", size = 4217422, upload-time = "2026-03-13T06:58:27.472Z" }, + { url = "https://files.pythonhosted.org/packages/68/47/d6cf4a39ecf6c7705f887a46f6ef5c8455b44ad9eb0d391aa7e8a2ff7fea/hf_xet-1.4.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c3b3c6a882016b94b6c210957502ff7877802d0dbda8ad142c8595db8b944271", size = 3992847, upload-time = "2026-03-13T06:58:25.989Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ef/e80815061abff54697239803948abc665c6b1d237102c174f4f7a9a5ffc5/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d9a634cc929cfbaf2e1a50c0e532ae8c78fa98618426769480c58501e8c8ac2", size = 4193843, upload-time = "2026-03-13T06:58:44.59Z" }, + { url = "https://files.pythonhosted.org/packages/54/75/07f6aa680575d9646c4167db6407c41340cbe2357f5654c4e72a1b01ca14/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6b0932eb8b10317ea78b7da6bab172b17be03bbcd7809383d8d5abd6a2233e04", size = 4432751, upload-time = "2026-03-13T06:58:46.533Z" }, + { url = "https://files.pythonhosted.org/packages/cd/71/193eabd7e7d4b903c4aa983a215509c6114915a5a237525ec562baddb868/hf_xet-1.4.2-cp37-abi3-win_amd64.whl", hash = "sha256:ad185719fb2e8ac26f88c8100562dbf9dbdcc3d9d2add00faa94b5f106aea53f", size = 3671149, upload-time = "2026-03-13T06:58:57.07Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/7e/ccf239da366b37ba7f0b36095450efae4a64980bdc7ec2f51354205fdf39/hf_xet-1.4.2-cp37-abi3-win_arm64.whl", hash = "sha256:32c012286b581f783653e718c1862aea5b9eb140631685bb0c5e7012c8719a87", size = 3533426, upload-time = "2026-03-13T06:58:55.46Z" }, ] [[package]] @@ -1330,6 +1503,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, ] +[[package]] +name = "htmlmin2" +version = "0.1.13" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/31/a76f4bfa885f93b8167cb4c85cf32b54d1f64384d0b897d45bc6d19b7b45/htmlmin2-0.1.13-py3-none-any.whl", hash = "sha256:75609f2a42e64f7ce57dbff28a39890363bde9e7e5885db633317efbdf8c79a2", size = 34486, upload-time = "2023-03-14T21:28:30.388Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -1374,30 +1555,28 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.36.0" +version = "1.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "httpx" }, { name = "packaging" }, { name = "pyyaml" }, - { name = "requests" }, { name = "tqdm" }, + { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = 
"sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" } +sdist = { url = "https://files.pythonhosted.org/packages/19/15/eafc1c57bf0f8afffb243dcd4c0cceb785e956acc17bba4d9bf2ae21fc9c/huggingface_hub-1.7.2.tar.gz", hash = "sha256:7f7e294e9bbb822e025bdb2ada025fa4344d978175a7f78e824d86e35f7ab43b", size = 724684, upload-time = "2026-03-20T10:36:08.767Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" }, + { url = "https://files.pythonhosted.org/packages/08/de/3ad061a05f74728927ded48c90b73521b9a9328c85d841bdefb30e01fb85/huggingface_hub-1.7.2-py3-none-any.whl", hash = "sha256:288f33a0a17b2a73a1359e2a5fd28d1becb2c121748c6173ab8643fb342c850e", size = 618036, upload-time = "2026-03-20T10:36:06.824Z" }, ] [package.optional-dependencies] hf-xet = [ { name = "hf-xet" }, ] -inference = [ - { name = "aiohttp" }, -] [[package]] name = "hyperframe" @@ -1419,11 +1598,11 @@ wheels = [ [[package]] name = "idna" -version = "3.11" +version = "3.15" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, + { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, ] [[package]] @@ -1447,15 +1626,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] -[[package]] -name = "invoke" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/de/bd/b461d3424a24c80490313fd77feeb666ca4f6a28c7e72713e3d9095719b4/invoke-2.2.1.tar.gz", hash = "sha256:515bf49b4a48932b79b024590348da22f39c4942dff991ad1fb8b8baea1be707", size = 304762, upload-time = "2025-10-11T00:36:35.172Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/4b/b99e37f88336009971405cbb7630610322ed6fbfa31e1d7ab3fbf3049a2d/invoke-2.2.1-py3-none-any.whl", hash = "sha256:2413bc441b376e5cd3f55bb5d364f973ad8bdd7bf87e53c79de3c11bf3feecc8", size = 160287, upload-time = "2025-10-11T00:36:33.703Z" }, -] - [[package]] name = "isort" version = "7.0.0" @@ -1596,6 +1766,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, ] +[[package]] +name = 
"joserfc" +version = "1.6.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1b/cb/52e479f20804904f5df20ac4539d292dcecd1287aaa33cba1d1def1d9d8e/joserfc-1.6.7.tar.gz", hash = "sha256:6999fe89457069ecacd8cc797c88a805f83054dd883333fa0409f74b46479fd7", size = 232158, upload-time = "2026-05-23T01:46:44.069Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/e4/bcf6718b5662894c6831f46296b73cd4b1a2e90c20b6d437e20c4997388c/joserfc-1.6.7-py3-none-any.whl", hash = "sha256:9e51e4a64840aa1734a058258e80a4480e2ff2d5686e480e7c92c954a92fbe05", size = 70603, upload-time = "2026-05-23T01:46:42.129Z" }, +] + +[[package]] +name = "jsmin" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/73/e01e4c5e11ad0494f4407a3f623ad4d87714909f50b17a06ed121034ff6e/jsmin-3.0.1.tar.gz", hash = "sha256:c0959a121ef94542e807a674142606f7e90214a2b3d1eb17300244bbb5cc2bfc", size = 13925, upload-time = "2022-01-16T20:35:59.13Z" } + +[[package]] +name = "jsonpath-python" +version = "1.1.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/18/4ca8742534a5993ff383f7602e325ce2d5d7cc93d72ac5e1cdedbea8a458/jsonpath_python-1.1.6.tar.gz", hash = "sha256:dded9932b4ec41fb8726e09c83afa4e6be618f938c2db287cc2a81723c639671", size = 88178, upload-time = "2026-05-07T01:26:34.482Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/8a/1270a6803bd821cbfcdda387eaa13cb41a7b1f7b9bd145979b3bfb9d6cb7/jsonpath_python-1.1.6-py3-none-any.whl", hash = "sha256:a1c50afd8d3fbbaf47a4873bc890dcb3c15da96f5c020327977d844d8731a2d4", size = 14453, upload-time = "2026-05-07T01:26:33.306Z" }, +] + [[package]] name = "jsonref" version = "1.1.0" @@ -1664,6 +1861,23 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/81/db/e655086b7f3a705df045bf0933bdd9c2f79bb3c97bfef1384598bb79a217/keyring-25.7.0-py3-none-any.whl", hash = "sha256:be4a0b195f149690c166e850609a477c532ddbfbaed96a404d4e43f8d5e2689f", size = 39160, upload-time = "2025-11-16T16:26:08.402Z" }, ] +[[package]] +name = "libclang" +version = "18.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/5c/ca35e19a4f142adffa27e3d652196b7362fa612243e2b916845d801454fc/libclang-18.1.1.tar.gz", hash = "sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250", size = 39612, upload-time = "2024-03-17T16:04:37.434Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/49/f5e3e7e1419872b69f6f5e82ba56e33955a74bd537d8a1f5f1eff2f3668a/libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a", size = 25836045, upload-time = "2024-06-30T17:40:31.646Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e5/fc61bbded91a8830ccce94c5294ecd6e88e496cc85f6704bf350c0634b70/libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5", size = 26502641, upload-time = "2024-03-18T15:52:26.722Z" }, + { url = "https://files.pythonhosted.org/packages/db/ed/1df62b44db2583375f6a8a5e2ca5432bbdc3edb477942b9b7c848c720055/libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8", size = 26420207, upload-time = "2024-03-17T15:00:26.63Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fc/716c1e62e512ef1c160e7984a73a5fc7df45166f2ff3f254e71c58076f7c/libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b", size = 24515943, upload-time = "2024-03-17T16:03:45.942Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/3d/f0ac1150280d8d20d059608cf2d5ff61b7c3b7f7bcf9c0f425ab92df769a/libclang-18.1.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592", size = 23784972, upload-time = "2024-03-17T16:12:47.677Z" }, + { url = "https://files.pythonhosted.org/packages/fe/2f/d920822c2b1ce9326a4c78c0c2b4aa3fde610c7ee9f631b600acb5376c26/libclang-18.1.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe", size = 20259606, upload-time = "2024-03-17T16:17:42.437Z" }, + { url = "https://files.pythonhosted.org/packages/2d/c2/de1db8c6d413597076a4259cea409b83459b2db997c003578affdd32bf66/libclang-18.1.1-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f", size = 24921494, upload-time = "2024-03-17T16:14:20.132Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2d/3f480b1e1d31eb3d6de5e3ef641954e5c67430d5ac93b7fa7e07589576c7/libclang-18.1.1-py2.py3-none-win_amd64.whl", hash = "sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb", size = 26415083, upload-time = "2024-03-17T16:42:21.703Z" }, + { url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112, upload-time = "2024-03-17T16:42:59.565Z" }, +] + [[package]] name = "logfire" version = "4.19.0" @@ -1709,58 +1923,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, ] -[[package]] -name = "lupa" -version = "2.6" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b8/1c/191c3e6ec6502e3dbe25a53e27f69a5daeac3e56de1f73c0138224171ead/lupa-2.6.tar.gz", hash = "sha256:9a770a6e89576be3447668d7ced312cd6fd41d3c13c2462c9dc2c2ab570e45d9", size = 7240282, upload-time = "2025-10-24T07:20:29.738Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/86/ce243390535c39d53ea17ccf0240815e6e457e413e40428a658ea4ee4b8d/lupa-2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47ce718817ef1cc0c40d87c3d5ae56a800d61af00fbc0fad1ca9be12df2f3b56", size = 951707, upload-time = "2025-10-24T07:18:03.884Z" }, - { url = "https://files.pythonhosted.org/packages/86/85/cedea5e6cbeb54396fdcc55f6b741696f3f036d23cfaf986d50d680446da/lupa-2.6-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7aba985b15b101495aa4b07112cdc08baa0c545390d560ad5cfde2e9e34f4d58", size = 1916703, upload-time = "2025-10-24T07:18:05.6Z" }, - { url = "https://files.pythonhosted.org/packages/24/be/3d6b5f9a8588c01a4d88129284c726017b2089f3a3fd3ba8bd977292fea0/lupa-2.6-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:b766f62f95b2739f2248977d29b0722e589dcf4f0ccfa827ccbd29f0148bd2e5", size = 985152, upload-time = "2025-10-24T07:18:08.561Z" }, - { url = "https://files.pythonhosted.org/packages/eb/23/9f9a05beee5d5dce9deca4cb07c91c40a90541fc0a8e09db4ee670da550f/lupa-2.6-cp312-cp312-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:00a934c23331f94cb51760097ebfab14b005d55a6b30a2b480e3c53dd2fa290d", size = 1159599, upload-time = "2025-10-24T07:18:10.346Z" }, - { url = "https://files.pythonhosted.org/packages/40/4e/e7c0583083db9d7f1fd023800a9767d8e4391e8330d56c2373d890ac971b/lupa-2.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21de9f38bd475303e34a042b7081aabdf50bd9bafd36ce4faea2f90fd9f15c31", size = 1038686, upload-time = "2025-10-24T07:18:12.112Z" }, - { url = 
"https://files.pythonhosted.org/packages/1c/9f/5a4f7d959d4feba5e203ff0c31889e74d1ca3153122be4a46dca7d92bf7c/lupa-2.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf3bda96d3fc41237e964a69c23647d50d4e28421111360274d4799832c560e9", size = 2071956, upload-time = "2025-10-24T07:18:14.572Z" }, - { url = "https://files.pythonhosted.org/packages/92/34/2f4f13ca65d01169b1720176aedc4af17bc19ee834598c7292db232cb6dc/lupa-2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a76ead245da54801a81053794aa3975f213221f6542d14ec4b859ee2e7e0323", size = 1057199, upload-time = "2025-10-24T07:18:16.379Z" }, - { url = "https://files.pythonhosted.org/packages/35/2a/5f7d2eebec6993b0dcd428e0184ad71afb06a45ba13e717f6501bfed1da3/lupa-2.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8dd0861741caa20886ddbda0a121d8e52fb9b5bb153d82fa9bba796962bf30e8", size = 1173693, upload-time = "2025-10-24T07:18:18.153Z" }, - { url = "https://files.pythonhosted.org/packages/e4/29/089b4d2f8e34417349af3904bb40bec40b65c8731f45e3fd8d497ca573e5/lupa-2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:239e63948b0b23023f81d9a19a395e768ed3da6a299f84e7963b8f813f6e3f9c", size = 2164394, upload-time = "2025-10-24T07:18:20.403Z" }, - { url = "https://files.pythonhosted.org/packages/f3/1b/79c17b23c921f81468a111cad843b076a17ef4b684c4a8dff32a7969c3f0/lupa-2.6-cp312-cp312-win32.whl", hash = "sha256:325894e1099499e7a6f9c351147661a2011887603c71086d36fe0f964d52d1ce", size = 1420647, upload-time = "2025-10-24T07:18:23.368Z" }, - { url = "https://files.pythonhosted.org/packages/b8/15/5121e68aad3584e26e1425a5c9a79cd898f8a152292059e128c206ee817c/lupa-2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c735a1ce8ee60edb0fe71d665f1e6b7c55c6021f1d340eb8c865952c602cd36f", size = 1688529, upload-time = "2025-10-24T07:18:25.523Z" }, - { url = 
"https://files.pythonhosted.org/packages/28/1d/21176b682ca5469001199d8b95fa1737e29957a3d185186e7a8b55345f2e/lupa-2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:663a6e58a0f60e7d212017d6678639ac8df0119bc13c2145029dcba084391310", size = 947232, upload-time = "2025-10-24T07:18:27.878Z" }, - { url = "https://files.pythonhosted.org/packages/ce/4c/d327befb684660ca13cf79cd1f1d604331808f9f1b6fb6bf57832f8edf80/lupa-2.6-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:d1f5afda5c20b1f3217a80e9bc1b77037f8a6eb11612fd3ada19065303c8f380", size = 1908625, upload-time = "2025-10-24T07:18:29.944Z" }, - { url = "https://files.pythonhosted.org/packages/66/8e/ad22b0a19454dfd08662237a84c792d6d420d36b061f239e084f29d1a4f3/lupa-2.6-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:26f2b3c085fe76e9119e48c1013c1cccdc1f51585d456858290475aa38e7089e", size = 981057, upload-time = "2025-10-24T07:18:31.553Z" }, - { url = "https://files.pythonhosted.org/packages/5c/48/74859073ab276bd0566c719f9ca0108b0cfc1956ca0d68678d117d47d155/lupa-2.6-cp313-cp313-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:60d2f902c7b96fb8ab98493dcff315e7bb4d0b44dc9dd76eb37de575025d5685", size = 1156227, upload-time = "2025-10-24T07:18:33.981Z" }, - { url = "https://files.pythonhosted.org/packages/09/6c/0e9ded061916877253c2266074060eb71ed99fb21d73c8c114a76725bce2/lupa-2.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a02d25dee3a3250967c36590128d9220ae02f2eda166a24279da0b481519cbff", size = 1035752, upload-time = "2025-10-24T07:18:36.32Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ef/f8c32e454ef9f3fe909f6c7d57a39f950996c37a3deb7b391fec7903dab7/lupa-2.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6eae1ee16b886b8914ff292dbefbf2f48abfbdee94b33a88d1d5475e02423203", size = 2069009, upload-time = "2025-10-24T07:18:38.072Z" }, - { url = 
"https://files.pythonhosted.org/packages/53/dc/15b80c226a5225815a890ee1c11f07968e0aba7a852df41e8ae6fe285063/lupa-2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0edd5073a4ee74ab36f74fe61450148e6044f3952b8d21248581f3c5d1a58be", size = 1056301, upload-time = "2025-10-24T07:18:40.165Z" }, - { url = "https://files.pythonhosted.org/packages/31/14/2086c1425c985acfb30997a67e90c39457122df41324d3c179d6ee2292c6/lupa-2.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0c53ee9f22a8a17e7d4266ad48e86f43771951797042dd51d1494aaa4f5f3f0a", size = 1170673, upload-time = "2025-10-24T07:18:42.426Z" }, - { url = "https://files.pythonhosted.org/packages/10/e5/b216c054cf86576c0191bf9a9f05de6f7e8e07164897d95eea0078dca9b2/lupa-2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:de7c0f157a9064a400d828789191a96da7f4ce889969a588b87ec80de9b14772", size = 2162227, upload-time = "2025-10-24T07:18:46.112Z" }, - { url = "https://files.pythonhosted.org/packages/59/2f/33ecb5bedf4f3bc297ceacb7f016ff951331d352f58e7e791589609ea306/lupa-2.6-cp313-cp313-win32.whl", hash = "sha256:ee9523941ae0a87b5b703417720c5d78f72d2f5bc23883a2ea80a949a3ed9e75", size = 1419558, upload-time = "2025-10-24T07:18:48.371Z" }, - { url = "https://files.pythonhosted.org/packages/f9/b4/55e885834c847ea610e111d87b9ed4768f0afdaeebc00cd46810f25029f6/lupa-2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b1335a5835b0a25ebdbc75cf0bda195e54d133e4d994877ef025e218c2e59db9", size = 1683424, upload-time = "2025-10-24T07:18:50.976Z" }, - { url = "https://files.pythonhosted.org/packages/66/9d/d9427394e54d22a35d1139ef12e845fd700d4872a67a34db32516170b746/lupa-2.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:dcb6d0a3264873e1653bc188499f48c1fb4b41a779e315eba45256cfe7bc33c1", size = 953818, upload-time = "2025-10-24T07:18:53.378Z" }, - { url = "https://files.pythonhosted.org/packages/10/41/27bbe81953fb2f9ecfced5d9c99f85b37964cfaf6aa8453bb11283983721/lupa-2.6-cp314-cp314-macosx_11_0_universal2.whl", hash = 
"sha256:a37e01f2128f8c36106726cb9d360bac087d58c54b4522b033cc5691c584db18", size = 1915850, upload-time = "2025-10-24T07:18:55.259Z" }, - { url = "https://files.pythonhosted.org/packages/a3/98/f9ff60db84a75ba8725506bbf448fb085bc77868a021998ed2a66d920568/lupa-2.6-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:458bd7e9ff3c150b245b0fcfbb9bd2593d1152ea7f0a7b91c1d185846da033fe", size = 982344, upload-time = "2025-10-24T07:18:57.05Z" }, - { url = "https://files.pythonhosted.org/packages/41/f7/f39e0f1c055c3b887d86b404aaf0ca197b5edfd235a8b81b45b25bac7fc3/lupa-2.6-cp314-cp314-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:052ee82cac5206a02df77119c325339acbc09f5ce66967f66a2e12a0f3211cad", size = 1156543, upload-time = "2025-10-24T07:18:59.251Z" }, - { url = "https://files.pythonhosted.org/packages/9e/9c/59e6cffa0d672d662ae17bd7ac8ecd2c89c9449dee499e3eb13ca9cd10d9/lupa-2.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96594eca3c87dd07938009e95e591e43d554c1dbd0385be03c100367141db5a8", size = 1047974, upload-time = "2025-10-24T07:19:01.449Z" }, - { url = "https://files.pythonhosted.org/packages/23/c6/a04e9cef7c052717fcb28fb63b3824802488f688391895b618e39be0f684/lupa-2.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8faddd9d198688c8884091173a088a8e920ecc96cda2ffed576a23574c4b3f6", size = 2073458, upload-time = "2025-10-24T07:19:03.369Z" }, - { url = "https://files.pythonhosted.org/packages/e6/10/824173d10f38b51fc77785228f01411b6ca28826ce27404c7c912e0e442c/lupa-2.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:daebb3a6b58095c917e76ba727ab37b27477fb926957c825205fbda431552134", size = 1067683, upload-time = "2025-10-24T07:19:06.2Z" }, - { url = "https://files.pythonhosted.org/packages/b6/dc/9692fbcf3c924d9c4ece2d8d2f724451ac2e09af0bd2a782db1cef34e799/lupa-2.6-cp314-cp314-musllinux_1_2_i686.whl", hash = 
"sha256:f3154e68972befe0f81564e37d8142b5d5d79931a18309226a04ec92487d4ea3", size = 1171892, upload-time = "2025-10-24T07:19:08.544Z" }, - { url = "https://files.pythonhosted.org/packages/84/ff/e318b628d4643c278c96ab3ddea07fc36b075a57383c837f5b11e537ba9d/lupa-2.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e4dadf77b9fedc0bfa53417cc28dc2278a26d4cbd95c29f8927ad4d8fe0a7ef9", size = 2166641, upload-time = "2025-10-24T07:19:10.485Z" }, - { url = "https://files.pythonhosted.org/packages/12/f7/a6f9ec2806cf2d50826980cdb4b3cffc7691dc6f95e13cc728846d5cb793/lupa-2.6-cp314-cp314-win32.whl", hash = "sha256:cb34169c6fa3bab3e8ac58ca21b8a7102f6a94b6a5d08d3636312f3f02fafd8f", size = 1456857, upload-time = "2025-10-24T07:19:37.989Z" }, - { url = "https://files.pythonhosted.org/packages/c5/de/df71896f25bdc18360fdfa3b802cd7d57d7fede41a0e9724a4625b412c85/lupa-2.6-cp314-cp314-win_amd64.whl", hash = "sha256:b74f944fe46c421e25d0f8692aef1e842192f6f7f68034201382ac440ef9ea67", size = 1731191, upload-time = "2025-10-24T07:19:40.281Z" }, - { url = "https://files.pythonhosted.org/packages/47/3c/a1f23b01c54669465f5f4c4083107d496fbe6fb45998771420e9aadcf145/lupa-2.6-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0e21b716408a21ab65723f8841cf7f2f37a844b7a965eeabb785e27fca4099cf", size = 999343, upload-time = "2025-10-24T07:19:12.519Z" }, - { url = "https://files.pythonhosted.org/packages/c5/6d/501994291cb640bfa2ccf7f554be4e6914afa21c4026bd01bff9ca8aac57/lupa-2.6-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:589db872a141bfff828340079bbdf3e9a31f2689f4ca0d88f97d9e8c2eae6142", size = 2000730, upload-time = "2025-10-24T07:19:14.869Z" }, - { url = "https://files.pythonhosted.org/packages/53/a5/457ffb4f3f20469956c2d4c4842a7675e884efc895b2f23d126d23e126cc/lupa-2.6-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:cd852a91a4a9d4dcbb9a58100f820a75a425703ec3e3f049055f60b8533b7953", size = 1021553, upload-time = "2025-10-24T07:19:17.123Z" }, - { url = 
"https://files.pythonhosted.org/packages/51/6b/36bb5a5d0960f2a5c7c700e0819abb76fd9bf9c1d8a66e5106416d6e9b14/lupa-2.6-cp314-cp314t-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:0334753be028358922415ca97a64a3048e4ed155413fc4eaf87dd0a7e2752983", size = 1133275, upload-time = "2025-10-24T07:19:20.51Z" }, - { url = "https://files.pythonhosted.org/packages/19/86/202ff4429f663013f37d2229f6176ca9f83678a50257d70f61a0a97281bf/lupa-2.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:661d895cd38c87658a34780fac54a690ec036ead743e41b74c3fb81a9e65a6aa", size = 1038441, upload-time = "2025-10-24T07:19:22.509Z" }, - { url = "https://files.pythonhosted.org/packages/a7/42/d8125f8e420714e5b52e9c08d88b5329dfb02dcca731b4f21faaee6cc5b5/lupa-2.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aa58454ccc13878cc177c62529a2056be734da16369e451987ff92784994ca7", size = 2058324, upload-time = "2025-10-24T07:19:24.979Z" }, - { url = "https://files.pythonhosted.org/packages/2b/2c/47bf8b84059876e877a339717ddb595a4a7b0e8740bacae78ba527562e1c/lupa-2.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1425017264e470c98022bba8cff5bd46d054a827f5df6b80274f9cc71dafd24f", size = 1060250, upload-time = "2025-10-24T07:19:27.262Z" }, - { url = "https://files.pythonhosted.org/packages/c2/06/d88add2b6406ca1bdec99d11a429222837ca6d03bea42ca75afa169a78cb/lupa-2.6-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:224af0532d216e3105f0a127410f12320f7c5f1aa0300bdf9646b8d9afb0048c", size = 1151126, upload-time = "2025-10-24T07:19:29.522Z" }, - { url = "https://files.pythonhosted.org/packages/b4/a0/89e6a024c3b4485b89ef86881c9d55e097e7cb0bdb74efb746f2fa6a9a76/lupa-2.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9abb98d5a8fd27c8285302e82199f0e56e463066f88f619d6594a450bf269d80", size = 2153693, upload-time = "2025-10-24T07:19:31.379Z" }, - { url = 
"https://files.pythonhosted.org/packages/b6/36/a0f007dc58fc1bbf51fb85dcc82fcb1f21b8c4261361de7dab0e3d8521ef/lupa-2.6-cp314-cp314t-win32.whl", hash = "sha256:1849efeba7a8f6fb8aa2c13790bee988fd242ae404bd459509640eeea3d1e291", size = 1590104, upload-time = "2025-10-24T07:19:33.514Z" }, - { url = "https://files.pythonhosted.org/packages/7d/5e/db903ce9cf82c48d6b91bf6d63ae4c8d0d17958939a4e04ba6b9f38b8643/lupa-2.6-cp314-cp314t-win_amd64.whl", hash = "sha256:fc1498d1a4fc028bc521c26d0fad4ca00ed63b952e32fb95949bda76a04bad52", size = 1913818, upload-time = "2025-10-24T07:19:36.039Z" }, -] - [[package]] name = "macholib" version = "1.16.4" @@ -1785,6 +1947,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/f0/834e479e47e499b6478e807fb57b31cc2db696c4db30557bb6f5aea4a90b/mando-0.7.1-py2.py3-none-any.whl", hash = "sha256:26ef1d70928b6057ee3ca12583d73c63e05c49de8972d620c278a7b206581a8a", size = 28149, upload-time = "2022-02-24T08:12:25.24Z" }, ] +[[package]] +name = "markdown" +version = "3.10.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, +] + [[package]] name = "markdown-it-py" version = "4.0.0" @@ -1903,22 +2074,116 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, 
upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "mergedeep" +version = "1.3.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3a/41/580bb4006e3ed0361b8151a01d324fb03f420815446c7def45d02f74c270/mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8", size = 4661, upload-time = "2021-02-05T18:55:30.623Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307", size = 6354, upload-time = "2021-02-05T18:55:29.583Z" }, +] + [[package]] name = "mistralai" -version = "1.9.11" +version = "2.4.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "eval-type-backport" }, { name = "httpx" }, - { name = "invoke" }, + { name = "jsonpath-python" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, { name = "pydantic" }, { name = "python-dateutil" }, - { name = "pyyaml" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/5a/8d/d8b7af67a966b6f227024e1cb7287fc19901a434f87a5a391dcfe635d338/mistralai-1.9.11.tar.gz", hash = "sha256:3df9e403c31a756ec79e78df25ee73cea3eb15f86693773e16b16adaf59c9b8a", size = 208051, upload-time = "2025-10-02T15:53:40.473Z" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/3f/5624d57c5897c83c55d3e4c7dd4127de42ad14fd3183e26566cdc7dca1bf/mistralai-2.4.5.tar.gz", hash = "sha256:ef165bb004ec4423cbf19a440bf0983ca0c3fc92ab12a35ebca097bdf418e33a", size = 424611, upload-time = "2026-05-07T11:46:43.888Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fe/76/4ce12563aea5a76016f8643eff30ab731e6656c845e9e4d090ef10c7b925/mistralai-1.9.11-py3-none-any.whl", hash = 
"sha256:7a3dc2b8ef3fceaa3582220234261b5c4e3e03a972563b07afa150e44a25a6d3", size = 442796, upload-time = "2025-10-02T15:53:39.134Z" }, + { url = "https://files.pythonhosted.org/packages/1b/48/2c5c4f853dec32a625c1a3d23809b80cf2e135c3441fe1764f72910dfea9/mistralai-2.4.5-py3-none-any.whl", hash = "sha256:bf3b6550258ab16dec8547b90e9c18bebf9099f55b7fc25a884bf0bbeffced0f", size = 995999, upload-time = "2026-05-07T11:46:41.915Z" }, +] + +[[package]] +name = "mkdocs" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "ghp-import" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mergedeep" }, + { name = "mkdocs-get-deps" }, + { name = "packaging" }, + { name = "pathspec" }, + { name = "pyyaml" }, + { name = "pyyaml-env-tag" }, + { name = "watchdog" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bc/c6/bbd4f061bd16b378247f12953ffcb04786a618ce5e904b8c5a01a0309061/mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2", size = 3889159, upload-time = "2024-08-30T12:24:06.899Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, +] + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mergedeep" }, + { name = "platformdirs" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/f5/ed29cd50067784976f25ed0ed6fcd3c2ce9eb90650aa3b2796ddf7b6870b/mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c", size = 10239, upload-time = 
"2023-11-20T17:51:09.981Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" }, +] + +[[package]] +name = "mkdocs-material" +version = "9.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "babel" }, + { name = "backrefs" }, + { name = "colorama" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "mkdocs" }, + { name = "mkdocs-material-extensions" }, + { name = "paginate" }, + { name = "pygments" }, + { name = "pymdown-extensions" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/b4/f900fcb8e6f510241e334ca401eddcb61ed880fb6572f7f32e4228472ca1/mkdocs_material-9.7.3.tar.gz", hash = "sha256:e5f0a18319699da7e78c35e4a8df7e93537a888660f61a86bd773a7134798f22", size = 4097748, upload-time = "2026-02-24T12:06:22.646Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/1b/16ad0193079bb8a15aa1d2620813a9cd15b18de150a4ea1b2c607fb4c74d/mkdocs_material-9.7.3-py3-none-any.whl", hash = "sha256:37ebf7b4788c992203faf2e71900be3c197c70a4be9b0d72aed537b08a91dd9d", size = 9305078, upload-time = "2026-02-24T12:06:19.155Z" }, +] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/9b/9b4c96d6593b2a541e1cb8b34899a6d021d208bb357042823d4d2cabdbe7/mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443", size = 11847, upload-time = "2023-11-22T19:09:45.208Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = 
"sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, +] + +[[package]] +name = "mkdocs-minify-plugin" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "csscompressor" }, + { name = "htmlmin2" }, + { name = "jsmin" }, + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/67/fe4b77e7a8ae7628392e28b14122588beaf6078b53eb91c7ed000fd158ac/mkdocs-minify-plugin-0.8.0.tar.gz", hash = "sha256:bc11b78b8120d79e817308e2b11539d790d21445eb63df831e393f76e52e753d", size = 8366, upload-time = "2024-01-29T16:11:32.982Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/cd/2e8d0d92421916e2ea4ff97f10a544a9bd5588eb747556701c983581df13/mkdocs_minify_plugin-0.8.0-py3-none-any.whl", hash = "sha256:5fba1a3f7bd9a2142c9954a6559a57e946587b21f133165ece30ea145c66aee6", size = 6723, upload-time = "2024-01-29T16:11:31.851Z" }, ] [[package]] @@ -2049,14 +2314,14 @@ wheels = [ [[package]] name = "nexus-rpc" -version = "1.2.0" +version = "1.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/06/50/95d7bc91f900da5e22662c82d9bf0f72a4b01f2a552708bf2f43807707a1/nexus_rpc-1.2.0.tar.gz", hash = "sha256:b4ddaffa4d3996aaeadf49b80dfcdfbca48fe4cb616defaf3b3c5c2c8fc61890", size = 74142, upload-time = "2025-11-17T19:17:06.798Z" } +sdist = { url = "https://files.pythonhosted.org/packages/35/d5/cd1ffb202b76ebc1b33c1332a3416e55a39929006982adc2b1eb069aaa9b/nexus_rpc-1.4.0.tar.gz", hash = "sha256:3b8b373d4865671789cc43623e3dc0bcbf192562e40e13727e17f1c149050fba", size = 82367, upload-time = "2026-02-25T22:01:34.053Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/13/04/eaac430d0e6bf21265ae989427d37e94be5e41dc216879f1fbb6c5339942/nexus_rpc-1.2.0-py3-none-any.whl", hash = 
"sha256:977876f3af811ad1a09b2961d3d1ac9233bda43ff0febbb0c9906483b9d9f8a3", size = 28166, upload-time = "2025-11-17T19:17:05.64Z" }, + { url = "https://files.pythonhosted.org/packages/11/52/6327a5f4fda01207205038a106a99848a41c83e933cd23ea2cab3d2ebc6c/nexus_rpc-1.4.0-py3-none-any.whl", hash = "sha256:14c953d3519113f8ccec533a9efdb6b10c28afef75d11cdd6d422640c40b3a49", size = 29645, upload-time = "2026-02-25T22:01:33.122Z" }, ] [[package]] @@ -2130,142 +2395,160 @@ wheels = [ ] [[package]] -name = "nvidia-cublas-cu12" -version = "12.8.4.1" +name = "nvidia-cublas" +version = "13.1.1.3" source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cuda-nvrtc" }, +] wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, + { url = "https://files.pythonhosted.org/packages/a7/a1/0bd24ee8c8d03adac032fd2909426a00c88f8c57961b1277ded97f91119f/nvidia_cublas-13.1.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b7a210458267ac818974c53038fbec2e969d5c99f305ab15c72522fa9f001dd5", size = 542848918, upload-time = "2026-04-08T18:46:22.985Z" }, + { url = "https://files.pythonhosted.org/packages/3b/cd/154ca20c38269e05eff77c1464e6c1da89f50a6390b565e9d82e06bc11e1/nvidia_cublas-13.1.1.3-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:37936a16db8fe4ac1f065c2139360608a543a09275cb1a1af612e08cfa065436", size = 423138758, upload-time = "2026-04-08T18:46:58.655Z" }, ] [[package]] -name = "nvidia-cuda-cupti-cu12" -version = "12.8.90" +name = "nvidia-cuda-cupti" +version = "13.0.85" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, + { url = "https://files.pythonhosted.org/packages/2a/2a/80353b103fc20ce05ef51e928daed4b6015db4aaa9162ed0997090fe2250/nvidia_cuda_cupti-13.0.85-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:796bd679890ee55fb14a94629b698b6db54bcfd833d391d5e94017dd9d7d3151", size = 10310827, upload-time = "2025-09-04T08:26:42.012Z" }, + { url = "https://files.pythonhosted.org/packages/33/6d/737d164b4837a9bbd202f5ae3078975f0525a55730fe871d8ed4e3b952b0/nvidia_cuda_cupti-13.0.85-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:4eb01c08e859bf924d222250d2e8f8b8ff6d3db4721288cf35d14252a4d933c8", size = 10715597, upload-time = "2025-09-04T08:26:51.312Z" }, ] [[package]] -name = "nvidia-cuda-nvrtc-cu12" -version = "12.8.93" +name = "nvidia-cuda-nvrtc" +version = "13.0.88" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = "https://files.pythonhosted.org/packages/c3/68/483a78f5e8f31b08fb1bb671559968c0ca3a065ac7acabfc7cee55214fd6/nvidia_cuda_nvrtc-13.0.88-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:ad9b6d2ead2435f11cbb6868809d2adeeee302e9bb94bcf0539c7a40d80e8575", size = 90215200, upload-time = "2025-09-04T08:28:44.204Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/dc/6bb80850e0b7edd6588d560758f17e0550893a1feaf436807d64d2da040f/nvidia_cuda_nvrtc-13.0.88-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d27f20a0ca67a4bb34268a5e951033496c5b74870b868bacd046b1b8e0c3267b", size = 43015449, upload-time = "2025-09-04T08:28:20.239Z" }, ] [[package]] -name = "nvidia-cuda-runtime-cu12" -version = "12.8.90" +name = "nvidia-cuda-runtime" +version = "13.0.96" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, + { url = "https://files.pythonhosted.org/packages/87/4f/17d7b9b8e285199c58ce28e31b5c5bbaa4d8271af06a89b6405258245de2/nvidia_cuda_runtime-13.0.96-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ef9bcbe90493a2b9d810e43d249adb3d02e98dd30200d86607d8d02687c43f55", size = 2261060, upload-time = "2025-10-09T08:55:15.78Z" }, + { url = "https://files.pythonhosted.org/packages/2e/24/d1558f3b68b1d26e706813b1d10aa1d785e4698c425af8db8edc3dced472/nvidia_cuda_runtime-13.0.96-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7f82250d7782aa23b6cfe765ecc7db554bd3c2870c43f3d1821f1d18aebf0548", size = 2243632, upload-time = "2025-10-09T08:55:36.117Z" }, ] [[package]] -name = "nvidia-cudnn-cu12" -version = "9.10.2.21" +name = "nvidia-cudnn-cu13" +version = "9.20.0.48" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas" }, ] wheels = [ - { url = 
"https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, + { url = "https://files.pythonhosted.org/packages/56/c5/83384d846b2fd17c44bd499b36c75a45ed4f095fbbb2252294e89cea5c5c/nvidia_cudnn_cu13-9.20.0.48-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:e31454ae00094b0c55319d9d15b6fa2fc50a9e1c0f5c8c80fb75258234e731e1", size = 444574296, upload-time = "2026-03-09T19:28:27.751Z" }, + { url = "https://files.pythonhosted.org/packages/6e/5e/edb9c0ae051602c3ccaffe424256463636d639e27d7f302dde9975ef9e7a/nvidia_cudnn_cu13-9.20.0.48-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:0c45dd8eeb50b603f07995b1b300c62ffe6a1980482b82b3bcf94a4ca9d49304", size = 366173588, upload-time = "2026-03-09T19:29:34.474Z" }, ] [[package]] -name = "nvidia-cufft-cu12" -version = "11.3.3.83" +name = "nvidia-cufft" +version = "12.0.0.61" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554, upload-time = "2025-09-04T08:31:38.196Z" }, + { url = 
"https://files.pythonhosted.org/packages/a8/2f/7b57e29836ea8714f81e9898409196f47d772d5ddedddf1592eadb8ab743/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6c44f692dce8fd5ffd3e3df134b6cdb9c2f72d99cf40b62c32dde45eea9ddad3", size = 214085489, upload-time = "2025-09-04T08:31:56.044Z" }, ] [[package]] -name = "nvidia-cufile-cu12" -version = "1.13.1.3" +name = "nvidia-cufile" +version = "1.15.1.6" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = "https://files.pythonhosted.org/packages/3f/70/4f193de89a48b71714e74602ee14d04e4019ad36a5a9f20c425776e72cd6/nvidia_cufile-1.15.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08a3ecefae5a01c7f5117351c64f17c7c62efa5fffdbe24fc7d298da19cd0b44", size = 1223672, upload-time = "2025-09-04T08:32:22.779Z" }, + { url = "https://files.pythonhosted.org/packages/ab/73/cc4a14c9813a8a0d509417cf5f4bdaba76e924d58beb9864f5a7baceefbf/nvidia_cufile-1.15.1.6-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:bdc0deedc61f548bddf7733bdc216456c2fdb101d020e1ab4b88d232d5e2f6d1", size = 1136992, upload-time = "2025-09-04T08:32:14.119Z" }, ] [[package]] -name = "nvidia-curand-cu12" -version = "10.3.9.90" +name = "nvidia-curand" +version = "10.4.0.35" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = 
"2025-03-07T01:46:23.323Z" }, + { url = "https://files.pythonhosted.org/packages/1e/72/7c2ae24fb6b63a32e6ae5d241cc65263ea18d08802aaae087d9f013335a2/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:133df5a7509c3e292aaa2b477afd0194f06ce4ea24d714d616ff36439cee349a", size = 61962106, upload-time = "2025-08-04T10:21:41.128Z" }, + { url = "https://files.pythonhosted.org/packages/a5/9f/be0a41ca4a4917abf5cb9ae0daff1a6060cc5de950aec0396de9f3b52bc5/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:1aee33a5da6e1db083fe2b90082def8915f30f3248d5896bcec36a579d941bfc", size = 59544258, upload-time = "2025-08-04T10:22:03.992Z" }, ] [[package]] -name = "nvidia-cusolver-cu12" -version = "11.7.3.90" +name = "nvidia-cusolver" +version = "12.0.4.66" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas" }, + { name = "nvidia-cusparse" }, + { name = "nvidia-nvjitlink" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, + { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760, upload-time = "2025-09-04T08:33:04.222Z" }, + { url = "https://files.pythonhosted.org/packages/5f/67/cba3777620cdacb99102da4042883709c41c709f4b6323c10781a9c3aa34/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:0a759da5dea5c0ea10fd307de75cdeb59e7ea4fcb8add0924859b944babf1112", 
size = 200941980, upload-time = "2025-09-04T08:33:22.767Z" }, ] [[package]] -name = "nvidia-cusparse-cu12" -version = "12.5.8.93" +name = "nvidia-cusparse" +version = "12.6.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, + { url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568, upload-time = "2025-09-04T08:33:42.864Z" }, + { url = "https://files.pythonhosted.org/packages/fa/18/623c77619c31d62efd55302939756966f3ecc8d724a14dab2b75f1508850/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b3c89c88d01ee0e477cb7f82ef60a11a4bcd57b6b87c33f789350b59759360b", size = 145942937, upload-time = "2025-09-04T08:33:58.029Z" }, ] [[package]] -name = "nvidia-cusparselt-cu12" -version = "0.7.1" +name = "nvidia-cusparselt-cu13" +version = "0.8.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/e1/cdc1797eadf82d3a9a575a19b33fdc871a97edbec42c00b5b5e914f4aff4/nvidia_cusparselt_cu13-0.8.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4dca476c50bf4780d46cd0bfbd82e2bc10a08e4fef7950917ce8d7578d22a23f", size = 221051344, upload-time = "2025-09-05T18:49:51.289Z" }, + { url = "https://files.pythonhosted.org/packages/34/7d/2661f2fb3ac4302f3a246f5fc030213ac60c1fe0bce84f9783dbd831dbb7/nvidia_cusparselt_cu13-0.8.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:786ce87568c303fadb5afcc7102d454cd3040d75f6f8626f5db460d1871f4dd0", size = 170148586, upload-time = "2025-09-05T18:50:50.248Z" }, ] [[package]] -name = "nvidia-nccl-cu12" -version = "2.27.5" +name = "nvidia-nccl-cu13" +version = "2.29.7" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, + { url = "https://files.pythonhosted.org/packages/72/0d/daf50d44177ee0cbc7ff0a0c91eb5ff676c82be42f9a970bc7597f440c3a/nvidia_nccl_cu13-2.29.7-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:674a12383e3c38a1bcccae7d4f3633b37852230b6047883cb2f4c2d1b36d9bf5", size = 206014712, upload-time = "2026-03-03T05:34:20.843Z" }, + { url = "https://files.pythonhosted.org/packages/67/f4/58e4e91b6919367c7aafb8e36fce9aad1a3047e536bf7e2fd560927d3a4c/nvidia_nccl_cu13-2.29.7-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:edd81538446786ec3b73972543e53bb43bcaf0bfc8ef76cb679fcc390ffe136d", size = 205976000, upload-time = "2026-03-03T05:36:24.472Z" }, ] [[package]] -name = "nvidia-nvjitlink-cu12" -version = "12.8.93" +name = "nvidia-nvjitlink" +version = "13.0.88" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/56/7a/123e033aaff487c77107195fa5a2b8686795ca537935a24efae476c41f05/nvidia_nvjitlink-13.0.88-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:13a74f429e23b921c1109976abefacc69835f2f433ebd323d3946e11d804e47b", size = 40713933, upload-time = "2025-09-04T08:35:43.553Z" }, + { url = "https://files.pythonhosted.org/packages/ab/2c/93c5250e64df4f894f1cbb397c6fd71f79813f9fd79d7cd61de3f97b3c2d/nvidia_nvjitlink-13.0.88-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e931536ccc7d467a98ba1d8b89ff7fa7f1fa3b13f2b0069118cd7f47bff07d0c", size = 38768748, upload-time = "2025-09-04T08:35:20.008Z" }, ] [[package]] -name = "nvidia-nvshmem-cu12" +name = "nvidia-nvshmem-cu13" version = "3.4.5" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, + { url = "https://files.pythonhosted.org/packages/dc/0f/05cc9c720236dcd2db9c1ab97fff629e96821be2e63103569da0c9b72f19/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dc2a197f38e5d0376ad52cd1a2a3617d3cdc150fd5966f4aee9bcebb1d68fe9", size = 60215947, upload-time = "2025-09-06T00:32:20.022Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/35/a9bf80a609e74e3b000fef598933235c908fcefcef9026042b8e6dfde2a9/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:290f0a2ee94c9f3687a02502f3b9299a9f9fe826e6d0287ee18482e78d495b80", size = 60412546, upload-time = "2025-09-06T00:32:41.564Z" }, ] [[package]] -name = "nvidia-nvtx-cu12" -version = "12.8.90" +name = "nvidia-nvtx" +version = "13.0.85" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f3/d86c845465a2723ad7e1e5c36dcd75ddb82898b3f53be47ebd429fb2fa5d/nvidia_nvtx-13.0.85-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4936d1d6780fbe68db454f5e72a42ff64d1fd6397df9f363ae786930fd5c1cd4", size = 148047, upload-time = "2025-09-04T08:29:01.761Z" }, + { url = "https://files.pythonhosted.org/packages/a8/64/3708a90d1ebe202ffdeb7185f878a3c84d15c2b2c31858da2ce0583e2def/nvidia_nvtx-13.0.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb7780edb6b14107373c835bf8b72e7a178bac7367e23da7acb108f973f157a6", size = 148878, upload-time = "2025-09-04T08:28:53.627Z" }, ] [[package]] name = "openai" -version = "2.15.0" +version = "2.29.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2277,9 +2560,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/94/f4/4690ecb5d70023ce6bfcfeabfe717020f654bde59a775058ec6ac4692463/openai-2.15.0.tar.gz", hash = "sha256:42eb8cbb407d84770633f31bf727d4ffb4138711c670565a41663d9439174fba", size = 
627383, upload-time = "2026-01-09T22:10:08.603Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/15/203d537e58986b5673e7f232453a2a2f110f22757b15921cbdeea392e520/openai-2.29.0.tar.gz", hash = "sha256:32d09eb2f661b38d3edd7d7e1a2943d1633f572596febe64c0cd370c86d52bec", size = 671128, upload-time = "2026-03-17T17:53:49.599Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/df/c306f7375d42bafb379934c2df4c2fa3964656c8c782bac75ee10c102818/openai-2.15.0-py3-none-any.whl", hash = "sha256:6ae23b932cd7230f7244e52954daa6602716d6b9bf235401a107af731baea6c3", size = 1067879, upload-time = "2026-01-09T22:10:06.446Z" }, + { url = "https://files.pythonhosted.org/packages/d0/b1/35b6f9c8cf9318e3dbb7146cc82dab4cf61182a8d5406fc9b50864362895/openai-2.29.0-py3-none-any.whl", hash = "sha256:b7c5de513c3286d17c5e29b92c4c98ceaf0d775244ac8159aeb1bddf840eb42a", size = 1141533, upload-time = "2026-03-17T17:53:47.348Z" }, ] [[package]] @@ -2337,20 +2620,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/f1/b27d3e2e003cd9a3592c43d099d2ed8d0a947c15281bf8463a256db0b46c/opentelemetry_exporter_otlp_proto_http-1.39.1-py3-none-any.whl", hash = "sha256:d9f5207183dd752a412c4cd564ca8875ececba13be6e9c6c370ffb752fd59985", size = 19641, upload-time = "2025-12-11T13:32:22.248Z" }, ] -[[package]] -name = "opentelemetry-exporter-prometheus" -version = "0.60b1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "opentelemetry-api" }, - { name = "opentelemetry-sdk" }, - { name = "prometheus-client" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/14/39/7dafa6fff210737267bed35a8855b6ac7399b9e582b8cf1f25f842517012/opentelemetry_exporter_prometheus-0.60b1.tar.gz", hash = "sha256:a4011b46906323f71724649d301b4dc188aaa068852e814f4df38cc76eac616b", size = 14976, upload-time = "2025-12-11T13:32:42.944Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/9b/0d/4be6bf5477a3eb3d917d2f17d3c0b6720cd6cb97898444a61d43cc983f5c/opentelemetry_exporter_prometheus-0.60b1-py3-none-any.whl", hash = "sha256:49f59178de4f4590e3cef0b8b95cf6e071aae70e1f060566df5546fad773b8fd", size = 13019, upload-time = "2025-12-11T13:32:23.974Z" }, -] - [[package]] name = "opentelemetry-instrumentation" version = "0.60b1" @@ -2439,6 +2708,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "paginate" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, +] + [[package]] name = "pathable" version = "0.4.4" @@ -2449,12 +2727,12 @@ wheels = [ ] [[package]] -name = "pathvalidate" -version = "3.3.1" +name = "pathspec" +version = "1.0.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fa/2a/52a8da6fe965dea6192eb716b357558e103aea0a1e9a8352ad575a8406ca/pathvalidate-3.3.1.tar.gz", hash = "sha256:b18c07212bfead624345bb8e1d6141cdcf15a39736994ea0b94035ad2b1ba177", size = 63262, upload-time = "2025-06-15T09:07:20.736Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200, upload-time = "2026-01-27T03:59:46.938Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/70/875f4a23bfc4731703a5835487d0d2fb999031bd415e7d17c0ae615c18b7/pathvalidate-3.3.1-py3-none-any.whl", hash = "sha256:5263baab691f8e1af96092fa5137ee17df5bdfbd6cff1fcac4d6ef4bc2e1735f", size = 24305, upload-time = "2025-06-15T09:07:19.117Z" }, + { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206, upload-time = "2026-01-27T03:59:45.137Z" }, ] [[package]] @@ -2521,15 +2799,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/19/fd3ef348460c80af7bb4669ea7926651d1f95c23ff2df18b9d24bab4f3fa/pre_commit-4.5.1-py2.py3-none-any.whl", hash = "sha256:3b3afd891e97337708c1674210f8eba659b52a38ea5f822ff142d10786221f77", size = 226437, upload-time = "2025-12-16T21:14:32.409Z" }, ] -[[package]] -name = "prometheus-client" -version = "0.24.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f0/58/a794d23feb6b00fc0c72787d7e87d872a6730dd9ed7c7b3e954637d8f280/prometheus_client-0.24.1.tar.gz", hash = "sha256:7e0ced7fbbd40f7b84962d5d2ab6f17ef88a72504dcf7c0b40737b43b2a461f9", size = 85616, upload-time = "2026-01-14T15:26:26.965Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/74/c3/24a2f845e3917201628ecaba4f18bab4d18a337834c1df2a159ee9d22a42/prometheus_client-0.24.1-py3-none-any.whl", hash = "sha256:150db128af71a5c2482b36e588fc8a6b95e498750da4b17065947c16070f4055", size = 64057, upload-time = "2026-01-14T15:26:24.42Z" }, -] - [[package]] name = "prompt-toolkit" version = 
"3.0.52" @@ -2643,21 +2912,21 @@ wheels = [ [[package]] name = "py-key-value-aio" -version = "0.3.0" +version = "0.4.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "beartype" }, - { name = "py-key-value-shared" }, + { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/93/ce/3136b771dddf5ac905cc193b461eb67967cf3979688c6696e1f2cdcde7ea/py_key_value_aio-0.3.0.tar.gz", hash = "sha256:858e852fcf6d696d231266da66042d3355a7f9871650415feef9fca7a6cd4155", size = 50801, upload-time = "2025-11-17T16:50:04.711Z" } +sdist = { url = "https://files.pythonhosted.org/packages/04/3c/0397c072a38d4bc580994b42e0c90c5f44f679303489e4376289534735e5/py_key_value_aio-0.4.4.tar.gz", hash = "sha256:e3012e6243ed7cc09bb05457bd4d03b1ba5c2b1ca8700096b3927db79ffbbe55", size = 92300, upload-time = "2026-02-16T21:21:43.245Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/99/10/72f6f213b8f0bce36eff21fda0a13271834e9eeff7f9609b01afdc253c79/py_key_value_aio-0.3.0-py3-none-any.whl", hash = "sha256:1c781915766078bfd608daa769fefb97e65d1d73746a3dfb640460e322071b64", size = 96342, upload-time = "2025-11-17T16:50:03.801Z" }, + { url = "https://files.pythonhosted.org/packages/32/69/f1b537ee70b7def42d63124a539ed3026a11a3ffc3086947a1ca6e861868/py_key_value_aio-0.4.4-py3-none-any.whl", hash = "sha256:18e17564ecae61b987f909fc2cd41ee2012c84b4b1dcb8c055cf8b4bc1bf3f5d", size = 152291, upload-time = "2026-02-16T21:21:44.241Z" }, ] [package.optional-dependencies] -disk = [ - { name = "diskcache" }, - { name = "pathvalidate" }, +filetree = [ + { name = "aiofile" }, + { name = "anyio" }, ] keyring = [ { name = "keyring" }, @@ -2665,30 +2934,14 @@ keyring = [ memory = [ { name = "cachetools" }, ] -redis = [ - { name = "redis" }, -] - -[[package]] -name = "py-key-value-shared" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "beartype" }, - { name = "typing-extensions" }, -] -sdist = { 
url = "https://files.pythonhosted.org/packages/7b/e4/1971dfc4620a3a15b4579fe99e024f5edd6e0967a71154771a059daff4db/py_key_value_shared-0.3.0.tar.gz", hash = "sha256:8fdd786cf96c3e900102945f92aa1473138ebe960ef49da1c833790160c28a4b", size = 11666, upload-time = "2025-11-17T16:50:06.849Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/e4/b8b0a03ece72f47dce2307d36e1c34725b7223d209fc679315ffe6a4e2c3/py_key_value_shared-0.3.0-py3-none-any.whl", hash = "sha256:5b0efba7ebca08bb158b1e93afc2f07d30b8f40c2fc12ce24a4c0d84f42f9298", size = 19560, upload-time = "2025-11-17T16:50:05.954Z" }, -] [[package]] name = "pyasn1" -version = "0.6.2" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fe/b6/6e630dff89739fcd427e3f72b3d905ce0acb85a45d4ec3e2678718a3487f/pyasn1-0.6.2.tar.gz", hash = "sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b", size = 146586, upload-time = "2026-01-16T18:04:18.534Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/44/b5/a96872e5184f354da9c84ae119971a0a4c221fe9b27a4d94bd43f2596727/pyasn1-0.6.2-py3-none-any.whl", hash = "sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf", size = 83371, upload-time = "2026-01-16T18:04:17.174Z" }, + { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" }, ] [[package]] @@ -2734,32 +2987,32 @@ email = [ [[package]] name = "pydantic-ai" -version = "1.46.0" +version = 
"1.102.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pydantic-ai-slim", extra = ["ag-ui", "anthropic", "bedrock", "cli", "cohere", "evals", "fastmcp", "google", "groq", "huggingface", "logfire", "mcp", "mistral", "openai", "retries", "temporal", "ui", "vertexai", "xai"] }, + { name = "pydantic-ai-slim", extra = ["ag-ui", "anthropic", "bedrock", "cli", "cohere", "evals", "fastmcp", "google", "groq", "huggingface", "logfire", "mcp", "mistral", "openai", "retries", "spec", "temporal", "ui", "vertexai", "xai"] }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7d/e9/2917eabd9a8f408748e1e91b8d0a1bf695ca7d785f6b88efc3e4bba2fa94/pydantic_ai-1.46.0.tar.gz", hash = "sha256:e71c7d7c905da6f34b8759ad9f6914c31035fed5623ca5ac35096f9d738019cf", size = 11795, upload-time = "2026-01-23T00:07:15.786Z" } +sdist = { url = "https://files.pythonhosted.org/packages/61/a8/c6cecf03aea4ae75126069c6b0f988263d1cb18b97d6d0a6634f5e397b56/pydantic_ai-1.102.0.tar.gz", hash = "sha256:5def631d6e1c68b5e992c88da21b78377fe9262aeaf7f9ca09f67c100a9d3878", size = 17795, upload-time = "2026-05-23T01:14:30.493Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/9e/ff49bae2eeeb7f0afe0b8bfb49868f4e4e0f2d986be5f2f9883e09c3e09b/pydantic_ai-1.46.0-py3-none-any.whl", hash = "sha256:a9ac9413ae1e57d5f9ce563f6e46aceaaf9602540366e98363d08482e4ddc651", size = 7220, upload-time = "2026-01-23T00:07:08.263Z" }, + { url = "https://files.pythonhosted.org/packages/d1/57/de1ab45c2084cb2db886a09d93b005959134655f6ec348cf8a821a177b2f/pydantic_ai-1.102.0-py3-none-any.whl", hash = "sha256:bc38cf4936cf08fa3aaf9d34abf908fd73b47147768cdeb34ec3eaf43909aca8", size = 7587, upload-time = "2026-05-23T01:14:19.813Z" }, ] [[package]] name = "pydantic-ai-slim" -version = "1.46.0" +version = "1.102.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "genai-prices" }, - { name = "griffe" }, + { name = "griffelib" }, { name = "httpx" }, { name = 
"opentelemetry-api" }, { name = "pydantic" }, { name = "pydantic-graph" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c6/f3/c053fef7e4d55b7b28fea5d3a738e5e6fa15f227668faed53c76226ae79a/pydantic_ai_slim-1.46.0.tar.gz", hash = "sha256:8925bc2c54b6c1f5168142d703ecfdba65162d08dae9908bf583932fdf631d09", size = 393260, upload-time = "2026-01-23T00:07:18.831Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e2/3e/14980440e8f0532535e1fbe936fec5f8d8e7bc6cafa81f6f3c51b1884fe5/pydantic_ai_slim-1.102.0.tar.gz", hash = "sha256:0b8f2b70fa2b40efcbd09d341a346934fc4e46622ae281f858c6bfd3d0d3152b", size = 739988, upload-time = "2026-05-23T01:14:32.808Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/d8/640ccbd4d63021a7bd724571dfe92c5868e3890a1172e159b828c84c30dc/pydantic_ai_slim-1.46.0-py3-none-any.whl", hash = "sha256:2494ca9be6009a5e27db09fecb1ab49f0b569a6e7fcd2eda067262bcbd497856", size = 515335, upload-time = "2026-01-23T00:07:10.751Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2e/089df86adaf904dd97a1b139d29fe728af0e41430d747f5b6315df3b0c1e/pydantic_ai_slim-1.102.0-py3-none-any.whl", hash = "sha256:f9fa9c3fb58a76f85522f78d1037d201b424de46d532263ed780b3730060449f", size = 919311, upload-time = "2026-05-23T01:14:23.464Z" }, ] [package.optional-dependencies] @@ -2777,6 +3030,7 @@ cli = [ { name = "argcomplete" }, { name = "prompt-toolkit" }, { name = "pyperclip" }, + { name = "pyyaml" }, { name = "rich" }, ] cohere = [ @@ -2795,13 +3049,13 @@ groq = [ { name = "groq" }, ] huggingface = [ - { name = "huggingface-hub", extra = ["inference"] }, + { name = "huggingface-hub" }, ] logfire = [ { name = "logfire", extra = ["httpx"] }, ] mcp = [ - { name = "mcp" }, + { name = "fastmcp-slim", extra = ["client"] }, ] mistral = [ { name = "mistralai" }, @@ -2813,6 +3067,10 @@ openai = [ retries = [ { name = "tenacity" }, ] +spec = [ + { name = "pydantic-handlebars" }, + { name = "pyyaml" }, +] 
temporal = [ { name = "temporalio" }, ] @@ -2900,7 +3158,7 @@ wheels = [ [[package]] name = "pydantic-evals" -version = "1.46.0" +version = "1.102.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2910,14 +3168,14 @@ dependencies = [ { name = "pyyaml" }, { name = "rich" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/da/ce/044bde6ba4f0da335d7f7955c58b86e45ba275b009b46cd61d5b53b62f06/pydantic_evals-1.46.0.tar.gz", hash = "sha256:66c52ad006d6fa7d05f563d667d20377a46edb54ef638c2b83c7660215560f76", size = 47173, upload-time = "2026-01-23T00:07:20.254Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/2a/2f0a18e170dc1db4b32120bea9e1162ef196c1f453db823878f5eaf7b8bb/pydantic_evals-1.102.0.tar.gz", hash = "sha256:711a6335d24a11c324e5a5c7758b12dfd77209f885ab2501d7eedb9dd5b75b18", size = 78557, upload-time = "2026-05-23T01:14:34.447Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/02/23cbcb3843b51bad4ecda57e2047fbbf82743e4bd29e694a17d366648470/pydantic_evals-1.46.0-py3-none-any.whl", hash = "sha256:6a7cdfd3bf5e5d99c76fb77e3d41897b9ef90c4ee300f937509cdbeaec8e16f9", size = 56346, upload-time = "2026-01-23T00:07:12.216Z" }, + { url = "https://files.pythonhosted.org/packages/e2/fd/2281c166b2c5cedab003b12bf8a630656cb5a9bbd552e4981ee190570d15/pydantic_evals-1.102.0-py3-none-any.whl", hash = "sha256:579edd6f7056d0fe52e03c7004377a0b9c42264c60a370258235fb0750fe20a2", size = 93529, upload-time = "2026-05-23T01:14:25.559Z" }, ] [[package]] name = "pydantic-graph" -version = "1.46.0" +version = "1.102.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, @@ -2925,46 +3183,35 @@ dependencies = [ { name = "pydantic" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b9/43/09cc322c1e7cf69e8f01fc6f09f7cd952b1fb49818cf2bee556f3b5fba07/pydantic_graph-1.46.0.tar.gz", hash = 
"sha256:ef0d316c95bdc37af20bdf3c343fb1caee2c8b536245d712c3ed46af0734319e", size = 58455, upload-time = "2026-01-23T00:07:21.125Z" } +sdist = { url = "https://files.pythonhosted.org/packages/51/37/4265a1a63eddf35a5aa621c9b2355525bdeae3eb59c3954b165fbfe31404/pydantic_graph-1.102.0.tar.gz", hash = "sha256:e285bd7115e4e92676eaf0a5e7e6faa64cda8c4819f67923a118c50666b909ab", size = 62584, upload-time = "2026-05-23T01:14:36.056Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/e9/058fd0001c2aed3675bc80d404c6171a753a4ff08bb570ec252848d6146d/pydantic_graph-1.46.0-py3-none-any.whl", hash = "sha256:cdbc609df49e2eeb9d0d4e43f87288b79ed9d021157ba639e71d862da4b71443", size = 72325, upload-time = "2026-01-23T00:07:13.807Z" }, + { url = "https://files.pythonhosted.org/packages/a4/49/5597c52d50114440047dd4ce4f6505e32ee336f43267639907d1a17648ee/pydantic_graph-1.102.0-py3-none-any.whl", hash = "sha256:b1a28314adc4abca4db02cf095d064782ec5712e0847ce7a6b79a3c84bf1fc01", size = 80100, upload-time = "2026-05-23T01:14:27.583Z" }, ] [[package]] -name = "pydantic-settings" -version = "2.12.0" +name = "pydantic-handlebars" +version = "0.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/43/4b/ac7e0aae12027748076d72a8764ff1c9d82ca75a7a52622e67ed3f765c54/pydantic_settings-2.12.0.tar.gz", hash = "sha256:005538ef951e3c2a68e1c08b292b5f2e71490def8589d4221b95dab00dafcfd0", size = 194184, upload-time = "2025-11-10T14:25:47.013Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/a3/13b1f17648605d1872bbc6cc56f24d9a2f4151bbf0623b9f731282a061be/pydantic_handlebars-0.2.0.tar.gz", hash = "sha256:11ee67abddefcb624ede8c690bc0210248ac235a150d9423908a89630c9a4e98", size = 175652, upload-time = "2026-05-22T06:06:38.476Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, + { url = "https://files.pythonhosted.org/packages/4d/f1/a27154170818efe3cb38af1eb54e0f7fc155873bd3b54f39a672a918e6cb/pydantic_handlebars-0.2.0-py3-none-any.whl", hash = "sha256:e5accc8ed0dc1bd953daa2eea2c0ee1eab7a6a27029da2439abacdf4ed46a4ae", size = 49954, upload-time = "2026-05-22T06:06:37.034Z" }, ] [[package]] -name = "pydocket" -version = "0.16.6" +name = "pydantic-settings" +version = "2.14.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cloudpickle" }, - { name = "fakeredis", extra = ["lua"] }, - { name = "opentelemetry-api" }, - { name = "opentelemetry-exporter-prometheus" }, - { name = "opentelemetry-instrumentation" }, - { name = "prometheus-client" }, - { name = "py-key-value-aio", extra = ["memory", "redis"] }, - { name = "python-json-logger" }, - { name = "redis" }, - { name = "rich" }, - { name = "typer" }, - { name = "typing-extensions" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/72/00/26befe5f58df7cd1aeda4a8d10bc7d1908ffd86b80fd995e57a2a7b3f7bd/pydocket-0.16.6.tar.gz", hash = "sha256:b96c96ad7692827214ed4ff25fcf941ec38371314db5dcc1ae792b3e9d3a0294", size = 299054, upload-time = "2026-01-09T22:09:15.405Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/b5/8f48e906c3e0205276e8bd8cb7512217a87b2685304d64be27cad5b3019f/pydantic_settings-2.14.2.tar.gz", hash = "sha256:c19dd64b19097f1de80184f0cc7b0272a13ae6e170cbf240a3e27e381ed14a5f", size = 237700, upload-time = "2026-06-19T13:44:56.324Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/0a/3f/7483e5a6dc6326b6e0c640619b5c5bd1d6e3c20e54d58f5fb86267cef00e/pydocket-0.16.6-py3-none-any.whl", hash = "sha256:683d21e2e846aa5106274e7d59210331b242d7fb0dce5b08d3b82065663ed183", size = 67697, upload-time = "2026-01-09T22:09:13.436Z" }, + { url = "https://files.pythonhosted.org/packages/77/c1/6e422f34e569cf8e18df68d1939c81c099d2b61e4f7d9621c8a77560799c/pydantic_settings-2.14.2-py3-none-any.whl", hash = "sha256:a20c97b37910b6550d5ea50fbcc2d4187defe58cd57070b73863d069419c9440", size = 61715, upload-time = "2026-06-19T13:44:55.02Z" }, ] [[package]] @@ -3019,11 +3266,11 @@ wheels = [ [[package]] name = "pyjwt" -version = "2.10.1" +version = "2.13.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785, upload-time = "2024-11-28T03:43:29.933Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3b/81/58d0ac84e1ef3a3843791d6954d94c0b33d526c75eeb1efbce9d0a4c4077/pyjwt-2.13.0.tar.gz", hash = "sha256:41571c89ca91598c79e8ef18a2d07367d4810fbbd6f637794879baf1b7703423", size = 107515, upload-time = "2026-05-21T19:54:36.618Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997, upload-time = "2024-11-28T03:43:27.893Z" }, + { url = "https://files.pythonhosted.org/packages/a3/5e/ecf12fdb62546d64385c158514e9b2b671f7832108ef2ecd2020ce0af2d1/pyjwt-2.13.0-py3-none-any.whl", hash = "sha256:66adcc2aff09b3f1bbd95fc1e1577df8ac8723c978552fd43304c8a290ac5728", size = 31274, upload-time = "2026-05-21T19:54:35.362Z" }, ] [package.optional-dependencies] @@ -3049,6 +3296,19 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/a6/92/d40f5d937517cc489ad848fc4414ecccc7592e4686b9071e09e64f5e378e/pylint-4.0.4-py3-none-any.whl", hash = "sha256:63e06a37d5922555ee2c20963eb42559918c20bd2b21244e4ef426e7c43b92e0", size = 536425, upload-time = "2025-11-30T13:29:02.53Z" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.21.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/26/d1015444da4d952a1ca487a236b522eb979766f0295a0bd0c5fc089989a9/pymdown_extensions-10.21.3.tar.gz", hash = "sha256:72cfcf55f07aea0d4af2c4f11dd4e52466ddfb1bb819673146398e0bd3a77354", size = 854140, upload-time = "2026-05-13T12:57:32.267Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/85/545a951eecc270fcd688288c600017e2050a1aacb56c711d208586d3e470/pymdown_extensions-10.21.3-py3-none-any.whl", hash = "sha256:d7a5d08014fc571e80ca21dd6f854e31f94c489800350564d55d15b3c41e76b6", size = 269002, upload-time = "2026-05-13T12:57:30.296Z" }, +] + [[package]] name = "pymgclient" version = "1.5.1" @@ -3072,15 +3332,15 @@ wheels = [ [[package]] name = "pyopenssl" -version = "25.3.0" +version = "26.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cryptography" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/80/be/97b83a464498a79103036bc74d1038df4a7ef0e402cfaf4d5e113fb14759/pyopenssl-25.3.0.tar.gz", hash = "sha256:c981cb0a3fd84e8602d7afc209522773b94c1c2446a3c710a75b06fe1beae329", size = 184073, upload-time = "2025-09-17T00:32:21.037Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/51/27a5ad5f939d08f690a326ef9582cda7140555180db71695f6fb747d6a36/pyopenssl-26.2.0.tar.gz", hash = "sha256:8c6fcecd1183a7fc897548dfe388b0cdb7f37e018200d8409cf33959dbe35387", size = 182195, upload-time = "2026-05-04T23:06:09.72Z" } 
wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/81/ef2b1dfd1862567d573a4fdbc9f969067621764fbb74338496840a1d2977/pyopenssl-25.3.0-py3-none-any.whl", hash = "sha256:1fda6fc034d5e3d179d39e59c1895c9faeaf40a79de5fc4cbbfbe0d36f4a77b6", size = 57268, upload-time = "2025-09-17T00:32:19.474Z" }, + { url = "https://files.pythonhosted.org/packages/73/b8/a0e2790ae249d6f38c9f66de7a211621a7ab2650217bcd04e1262f578a56/pyopenssl-26.2.0-py3-none-any.whl", hash = "sha256:4f9d971bc5298b8bc1fab282803da04bf000c755d4ad9d99b52de2569ca19a70", size = 55823, upload-time = "2026-05-04T23:06:08.395Z" }, ] [[package]] @@ -3094,7 +3354,7 @@ wheels = [ [[package]] name = "pytest" -version = "9.0.2" +version = "9.0.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -3103,9 +3363,9 @@ dependencies = [ { name = "pluggy" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, + { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = 
"sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, ] [[package]] @@ -3162,29 +3422,20 @@ wheels = [ [[package]] name = "python-dotenv" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, -] - -[[package]] -name = "python-json-logger" -version = "4.0.0" +version = "1.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/29/bf/eca6a3d43db1dae7070f70e160ab20b807627ba953663ba07928cdd3dc58/python_json_logger-4.0.0.tar.gz", hash = "sha256:f58e68eb46e1faed27e0f574a55a0455eecd7b8a5b88b85a784519ba3cff047f", size = 17683, upload-time = "2025-10-06T04:15:18.984Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/51/e5/fecf13f06e5e5f67e8837d777d1bc43fac0ed2b77a676804df5c34744727/python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2", size = 15548, upload-time = "2025-10-06T04:15:17.553Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, ] [[package]] name = "python-multipart" -version = "0.0.22" +version = "0.0.31" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } +sdist = { url = "https://files.pythonhosted.org/packages/64/7e/9b35ad8f3d9ca680f7c87a88f19612fdd8da9796c4d3b46e560ac79dcc4a/python_multipart-0.0.31.tar.gz", hash = "sha256:fc631183bb13e56db3158a4909908dfb2e23565286744e798241e63750e5d680", size = 46689, upload-time = "2026-06-04T08:27:49.014Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, + { url = "https://files.pythonhosted.org/packages/5e/1e/7f7f299527a5a8ad90acd5f2f78dfa6c8495c6301a3205106ea68a84de96/python_multipart-0.0.31-py3-none-any.whl", hash = "sha256:8408153d68a9773291fc1da39a8b85a50044bddbabd2dd72e9229776b7b15e28", size = 29996, upload-time = "2026-06-04T08:27:47.804Z" }, ] [[package]] @@ -3258,9 +3509,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = 
"pyyaml-env-tag" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/2e/79c822141bfd05a853236b504869ebc6b70159afc570e1d5a20641782eaa/pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff", size = 5737, upload-time = "2025-05-13T15:24:01.64Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" }, +] + [[package]] name = "qdrant-client" -version = "1.16.2" +version = "1.18.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio" }, @@ -3271,9 +3534,9 @@ dependencies = [ { name = "pydantic" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ca/7d/3cd10e26ae97b35cf856ca1dc67576e42414ae39502c51165bb36bb1dff8/qdrant_client-1.16.2.tar.gz", hash = "sha256:ca4ef5f9be7b5eadeec89a085d96d5c723585a391eb8b2be8192919ab63185f0", size = 331112, upload-time = "2025-12-12T10:58:30.866Z" } +sdist = { url = "https://files.pythonhosted.org/packages/65/45/5b1bdd15a3c7730eefb9c113600829e20d689b82b5a23f9e07d107094004/qdrant_client-1.18.0.tar.gz", hash = "sha256:52e8ece1a7d40519801bf0b70713bfa0f6b7ae28c7275bbe0b0286fbed7f6db4", size = 352580, upload-time = "2026-05-11T14:12:38.702Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/08/13/8ce16f808297e16968269de44a14f4fef19b64d9766be1d6ba5ba78b579d/qdrant_client-1.16.2-py3-none-any.whl", hash = "sha256:442c7ef32ae0f005e88b5d3c0783c63d4912b97ae756eb5e052523be682f17d3", size = 377186, upload-time = "2025-12-12T10:58:29.282Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/10/c437bd2ac41ef30d3019063e6ce537dc111e9214473b337ee88f7fa6359a/qdrant_client-1.18.0-py3-none-any.whl", hash = "sha256:093aa8cf8a420ee3ad2a68b007e1378d7992b2600e0b53c193fc172674f659cd", size = 398126, upload-time = "2026-05-11T14:12:36.998Z" }, ] [[package]] @@ -3289,15 +3552,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/93/f7/d00d9b4a0313a6be3a3e0818e6375e15da6d7076f4ae47d1324e7ca986a1/radon-6.0.1-py2.py3-none-any.whl", hash = "sha256:632cc032364a6f8bb1010a2f6a12d0f14bc7e5ede76585ef29dc0cecf4cd8859", size = 52784, upload-time = "2023-03-26T06:24:33.949Z" }, ] -[[package]] -name = "redis" -version = "7.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/c8/983d5c6579a411d8a99bc5823cc5712768859b5ce2c8afe1a65b37832c81/redis-7.1.0.tar.gz", hash = "sha256:b1cc3cfa5a2cb9c2ab3ba700864fb0ad75617b41f01352ce5779dabf6d5f9c3c", size = 4796669, upload-time = "2025-11-19T15:54:39.961Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/f0/8956f8a86b20d7bb9d6ac0187cf4cd54d8065bc9a1a09eb8011d4d326596/redis-7.1.0-py3-none-any.whl", hash = "sha256:23c52b208f92b56103e17c5d06bdc1a6c2c0b3106583985a76a18f83b265de2b", size = 354159, upload-time = "2025-11-19T15:54:38.064Z" }, -] - [[package]] name = "referencing" version = "0.36.2" @@ -3402,7 +3656,7 @@ wheels = [ [[package]] name = "requests" -version = "2.32.5" +version = "2.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -3410,9 +3664,9 @@ dependencies = [ { name = "idna" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, + { url = "https://files.pythonhosted.org/packages/56/5d/c814546c2333ceea4ba42262d8c4d55763003e767fa169adc693bd524478/requests-2.33.0-py3-none-any.whl", hash = "sha256:3324635456fa185245e24865e810cecec7b4caf933d7eb133dcde67d48cee69b", size = 65017, upload-time = "2026-03-25T15:10:40.382Z" }, ] [[package]] @@ -3522,18 +3776,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" }, ] -[[package]] -name = "rsa" -version = "4.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, -] - [[package]] name = "ruamel-yaml" version = 
"0.17.40" @@ -3612,14 +3854,14 @@ wheels = [ [[package]] name = "s3transfer" -version = "0.16.0" +version = "0.17.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "botocore" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9b/ec/7c692cde9125b77e84b307354d4fb705f98b8ccad59a036d5957ca75bfc3/s3transfer-0.17.0.tar.gz", hash = "sha256:9edeb6d1c3c2f89d6050348548834ad8289610d886e5bf7b7207728bd43ce33a", size = 155337, upload-time = "2026-04-29T22:07:36.33Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" }, + { url = "https://files.pythonhosted.org/packages/87/72/c6c32d2b657fa3dad1de340254e14390b1e334ce38268b7ad51abda3c8c2/s3transfer-0.17.0-py3-none-any.whl", hash = "sha256:ce3801712acf4ad3e89fb9990df97b4972e93f4b3b0004d214be5bce12814c20", size = 86811, upload-time = "2026-04-29T22:07:34.966Z" }, ] [[package]] @@ -3725,15 +3967,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "sortedcontainers" -version = "2.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", 
hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, -] - [[package]] name = "sse-starlette" version = "3.2.0" @@ -3749,15 +3982,15 @@ wheels = [ [[package]] name = "starlette" -version = "0.52.1" +version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c4/68/79977123bb7be889ad680d79a40f339082c1978b5cfcf62c2d8d196873ac/starlette-0.52.1.tar.gz", hash = "sha256:834edd1b0a23167694292e94f597773bc3f89f362be6effee198165a35d62933", size = 2653702, upload-time = "2026-01-18T13:34:11.062Z" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/e3/7c1dc7381d9f8ab7d854328ebfa884e62cb3f3d8549ddfd37c7814f42afa/starlette-1.3.1.tar.gz", hash = "sha256:05d0213193f2fbaae60e2ecb593b4add4262ad4e46536b54abe36f11a71724e0", size = 2703240, upload-time = "2026-06-12T09:23:11.602Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" }, + { url = "https://files.pythonhosted.org/packages/ec/bb/2799cc2ede3ed41131f8975621e7213dfc7ef4acbbaadfa440f32500c370/starlette-1.3.1-py3-none-any.whl", hash = "sha256:c7372aae11c3c3f26a42df7bd626cec2f47d03483d261d369516a615a53714c6", size = 73632, upload-time = "2026-06-12T09:23:10.017Z" }, ] [[package]] @@ -3783,7 +4016,7 @@ 
wheels = [ [[package]] name = "temporalio" -version = "1.20.0" +version = "1.27.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nexus-rpc" }, @@ -3791,13 +4024,13 @@ dependencies = [ { name = "types-protobuf" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/db/7d5118d28b0918888e1ec98f56f659fdb006351e06d95f30f4274962a76f/temporalio-1.20.0.tar.gz", hash = "sha256:5a6a85b7d298b7359bffa30025f7deac83c74ac095a4c6952fbf06c249a2a67c", size = 1850498, upload-time = "2025-11-25T21:25:20.225Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/62/2bc1a9ad29382a3a99f088907ef2024a94420cfef340be1b33026c632828/temporalio-1.27.2.tar.gz", hash = "sha256:633bf2379492f3db1e887d1e64fdac00d9c2ddc3e9382b831d5af68256912e92", size = 2503041, upload-time = "2026-05-14T02:17:57.565Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/1b/e69052aa6003eafe595529485d9c62d1382dd5e671108f1bddf544fb6032/temporalio-1.20.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:fba70314b4068f8b1994bddfa0e2ad742483f0ae714d2ef52e63013ccfd7042e", size = 12061638, upload-time = "2025-11-25T21:24:57.918Z" }, - { url = "https://files.pythonhosted.org/packages/ae/3b/3e8c67ed7f23bedfa231c6ac29a7a9c12b89881da7694732270f3ecd6b0c/temporalio-1.20.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffc5bb6cabc6ae67f0bfba44de6a9c121603134ae18784a2ff3a7f230ad99080", size = 11562603, upload-time = "2025-11-25T21:25:01.721Z" }, - { url = "https://files.pythonhosted.org/packages/6d/be/ed0cc11702210522a79e09703267ebeca06eb45832b873a58de3ca76b9d0/temporalio-1.20.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e80c1e4cdf88fa8277177f563edc91466fe4dc13c0322f26e55c76b6a219e6", size = 11824016, upload-time = "2025-11-25T21:25:06.771Z" }, - { url = 
"https://files.pythonhosted.org/packages/9d/97/09c5cafabc80139d97338a2bdd8ec22e08817dfd2949ab3e5b73565006eb/temporalio-1.20.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba92d909188930860c9d89ca6d7a753bc5a67e4e9eac6cea351477c967355eed", size = 12189521, upload-time = "2025-11-25T21:25:12.091Z" }, - { url = "https://files.pythonhosted.org/packages/11/23/5689c014a76aff3b744b3ee0d80815f63b1362637814f5fbb105244df09b/temporalio-1.20.0-cp310-abi3-win_amd64.whl", hash = "sha256:eacfd571b653e0a0f4aa6593f4d06fc628797898f0900d400e833a1f40cad03a", size = 12745027, upload-time = "2025-11-25T21:25:16.827Z" }, + { url = "https://files.pythonhosted.org/packages/64/85/9da14f9fbdfae95435d29353bb1c55891581ad6b23c86ca56e72d83035ed/temporalio-1.27.2-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:860f706380faafec8f183f9194d0883c8033a4211c5d19c2c962c45b06cf99e9", size = 14602829, upload-time = "2026-05-14T02:17:45.624Z" }, + { url = "https://files.pythonhosted.org/packages/24/51/b7437991e71eea082dc53222da11f064974917cd59063ba57e13e5895fbc/temporalio-1.27.2-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a8dc0c680e351f3132809861888d8326dbd5030dd4e570663597e7d4768d9502", size = 13997680, upload-time = "2026-05-14T02:17:53.968Z" }, + { url = "https://files.pythonhosted.org/packages/8c/5d/358065040e6f0cedbf669acd333622999eec737ff868ca7829d727b77746/temporalio-1.27.2-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:805f3de4d193dec52e040e41dbfc9ab44be0206d2e81142ceefaf7b7208058d1", size = 14252199, upload-time = "2026-05-14T02:17:36.972Z" }, + { url = "https://files.pythonhosted.org/packages/72/8a/85d2eab07c3e23fc1124203e76857c69ab9b22d8ccebad0835e294edb754/temporalio-1.27.2-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bc996cb501b8a918f50037ccee6facb05bb70984acada4c2a3e01f5e7957a38", size = 14779945, upload-time = "2026-05-14T02:18:05.513Z" }, + { url = 
"https://files.pythonhosted.org/packages/67/81/c9b08609e2a92ecf62c97c59cabfa0608337c8d5cc9941eed5d9a7778840/temporalio-1.27.2-cp310-abi3-win_amd64.whl", hash = "sha256:62a84ae9a60c17932971e4ca3b0f3cd6f32f173b8183e759989376503fb95af6", size = 14981897, upload-time = "2026-05-14T02:17:27.333Z" }, ] [[package]] @@ -3927,57 +4160,42 @@ wheels = [ [[package]] name = "torch" -version = "2.10.0" +version = "2.12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cuda-bindings", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cuda-bindings", marker = "sys_platform == 'linux'" }, + { name = "cuda-toolkit", extra = ["cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" }, { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, { name = "networkx" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = 
"nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" }, { name = "setuptools" }, { name = "sympy" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/c9/2f/0b295dd8d199ef71e6f176f576473d645d41357b7b8aa978cc6b042575df/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6abb224c2b6e9e27b592a1c0015c33a504b00a0e0938f1499f7f514e9b7bfb5c", size = 79498197, upload-time = "2026-02-06T17:37:27.627Z" }, - { url = "https://files.pythonhosted.org/packages/a4/1b/af5fccb50c341bd69dc016769503cb0857c1423fbe9343410dfeb65240f2/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7350f6652dfd761f11f9ecb590bfe95b573e2961f7a242eccb3c8e78348d26fe", size = 79498248, upload-time = "2026-02-06T17:37:31.982Z" }, - { url = "https://files.pythonhosted.org/packages/cc/af/758e242e9102e9988969b5e621d41f36b8f258bb4a099109b7a4b4b50ea4/torch-2.10.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5fd4117d89ffd47e3dcc71e71a22efac24828ad781c7e46aaaf56bf7f2796acf", size = 145996088, 
upload-time = "2026-01-21T16:24:44.171Z" }, - { url = "https://files.pythonhosted.org/packages/23/8e/3c74db5e53bff7ed9e34c8123e6a8bfef718b2450c35eefab85bb4a7e270/torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:787124e7db3b379d4f1ed54dd12ae7c741c16a4d29b49c0226a89bea50923ffb", size = 915711952, upload-time = "2026-01-21T16:23:53.503Z" }, - { url = "https://files.pythonhosted.org/packages/6e/01/624c4324ca01f66ae4c7cd1b74eb16fb52596dce66dbe51eff95ef9e7a4c/torch-2.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:2c66c61f44c5f903046cc696d088e21062644cbe541c7f1c4eaae88b2ad23547", size = 113757972, upload-time = "2026-01-21T16:24:39.516Z" }, - { url = "https://files.pythonhosted.org/packages/c9/5c/dee910b87c4d5c0fcb41b50839ae04df87c1cfc663cf1b5fca7ea565eeaa/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6d3707a61863d1c4d6ebba7be4ca320f42b869ee657e9b2c21c736bf17000294", size = 79498198, upload-time = "2026-01-21T16:24:34.704Z" }, - { url = "https://files.pythonhosted.org/packages/c9/6f/f2e91e34e3fcba2e3fc8d8f74e7d6c22e74e480bbd1db7bc8900fdf3e95c/torch-2.10.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5c4d217b14741e40776dd7074d9006fd28b8a97ef5654db959d8635b2fe5f29b", size = 146004247, upload-time = "2026-01-21T16:24:29.335Z" }, - { url = "https://files.pythonhosted.org/packages/98/fb/5160261aeb5e1ee12ee95fe599d0541f7c976c3701d607d8fc29e623229f/torch-2.10.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6b71486353fce0f9714ca0c9ef1c850a2ae766b409808acd58e9678a3edb7738", size = 915716445, upload-time = "2026-01-21T16:22:45.353Z" }, - { url = "https://files.pythonhosted.org/packages/6a/16/502fb1b41e6d868e8deb5b0e3ae926bbb36dab8ceb0d1b769b266ad7b0c3/torch-2.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:c2ee399c644dc92ef7bc0d4f7e74b5360c37cdbe7c5ba11318dda49ffac2bc57", size = 113757050, upload-time = "2026-01-21T16:24:19.204Z" }, - { url = 
"https://files.pythonhosted.org/packages/1a/0b/39929b148f4824bc3ad6f9f72a29d4ad865bcf7ebfc2fa67584773e083d2/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:3202429f58309b9fa96a614885eace4b7995729f44beb54d3e4a47773649d382", size = 79851305, upload-time = "2026-01-21T16:24:09.209Z" }, - { url = "https://files.pythonhosted.org/packages/d8/14/21fbce63bc452381ba5f74a2c0a959fdf5ad5803ccc0c654e752e0dbe91a/torch-2.10.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:aae1b29cd68e50a9397f5ee897b9c24742e9e306f88a807a27d617f07adb3bd8", size = 146005472, upload-time = "2026-01-21T16:22:29.022Z" }, - { url = "https://files.pythonhosted.org/packages/54/fd/b207d1c525cb570ef47f3e9f836b154685011fce11a2f444ba8a4084d042/torch-2.10.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6021db85958db2f07ec94e1bc77212721ba4920c12a18dc552d2ae36a3eb163f", size = 915612644, upload-time = "2026-01-21T16:21:47.019Z" }, - { url = "https://files.pythonhosted.org/packages/36/53/0197f868c75f1050b199fe58f9bf3bf3aecac9b4e85cc9c964383d745403/torch-2.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff43db38af76fda183156153983c9a096fc4c78d0cd1e07b14a2314c7f01c2c8", size = 113997015, upload-time = "2026-01-21T16:23:00.767Z" }, - { url = "https://files.pythonhosted.org/packages/0e/13/e76b4d9c160e89fff48bf16b449ea324bda84745d2ab30294c37c2434c0d/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:cdf2a523d699b70d613243211ecaac14fe9c5df8a0b0a9c02add60fb2a413e0f", size = 79498248, upload-time = "2026-01-21T16:23:09.315Z" }, - { url = "https://files.pythonhosted.org/packages/4f/93/716b5ac0155f1be70ed81bacc21269c3ece8dba0c249b9994094110bfc51/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:bf0d9ff448b0218e0433aeb198805192346c4fd659c852370d5cc245f602a06a", size = 79464992, upload-time = "2026-01-21T16:23:05.162Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/2b/51e663ff190c9d16d4a8271203b71bc73a16aa7619b9f271a69b9d4a936b/torch-2.10.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:233aed0659a2503b831d8a67e9da66a62c996204c0bba4f4c442ccc0c68a3f60", size = 146018567, upload-time = "2026-01-21T16:22:23.393Z" }, - { url = "https://files.pythonhosted.org/packages/5e/cd/4b95ef7f293b927c283db0b136c42be91c8ec6845c44de0238c8c23bdc80/torch-2.10.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:682497e16bdfa6efeec8cde66531bc8d1fbbbb4d8788ec6173c089ed3cc2bfe5", size = 915721646, upload-time = "2026-01-21T16:21:16.983Z" }, - { url = "https://files.pythonhosted.org/packages/56/97/078a007208f8056d88ae43198833469e61a0a355abc0b070edd2c085eb9a/torch-2.10.0-cp314-cp314-win_amd64.whl", hash = "sha256:6528f13d2a8593a1a412ea07a99812495bec07e9224c28b2a25c0a30c7da025c", size = 113752373, upload-time = "2026-01-21T16:22:13.471Z" }, - { url = "https://files.pythonhosted.org/packages/d8/94/71994e7d0d5238393df9732fdab607e37e2b56d26a746cb59fdb415f8966/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f5ab4ba32383061be0fb74bda772d470140a12c1c3b58a0cfbf3dae94d164c28", size = 79850324, upload-time = "2026-01-21T16:22:09.494Z" }, - { url = "https://files.pythonhosted.org/packages/e2/65/1a05346b418ea8ccd10360eef4b3e0ce688fba544e76edec26913a8d0ee0/torch-2.10.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:716b01a176c2a5659c98f6b01bf868244abdd896526f1c692712ab36dbaf9b63", size = 146006482, upload-time = "2026-01-21T16:22:18.42Z" }, - { url = "https://files.pythonhosted.org/packages/1d/b9/5f6f9d9e859fc3235f60578fa64f52c9c6e9b4327f0fe0defb6de5c0de31/torch-2.10.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:d8f5912ba938233f86361e891789595ff35ca4b4e2ac8fe3670895e5976731d6", size = 915613050, upload-time = "2026-01-21T16:20:49.035Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/4d/35352043ee0eaffdeff154fad67cd4a31dbed7ff8e3be1cc4549717d6d51/torch-2.10.0-cp314-cp314t-win_amd64.whl", hash = "sha256:71283a373f0ee2c89e0f0d5f446039bdabe8dbc3c9ccf35f0f784908b0acd185", size = 113995816, upload-time = "2026-01-21T16:22:05.312Z" }, + { url = "https://files.pythonhosted.org/packages/f0/54/efb7ebca77970012b0cc21687a55d70eb2ba514b2c2b8e18d9fb1222f3be/torch-2.12.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d2dd0f2c5f7ccbddaf34cade0deaf476808368f902b9cdb7f36a2ab42301bc0e", size = 87991951, upload-time = "2026-06-17T21:07:49.309Z" }, + { url = "https://files.pythonhosted.org/packages/1e/00/4210d76ca7424981f04033ebe7e48816ab83287a62538747a58825db770c/torch-2.12.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:2de4e19b88a481482c6c75291f2d6a52eda3ce51f311b29aa9b68499c830c07c", size = 426382721, upload-time = "2026-06-17T21:06:41.842Z" }, + { url = "https://files.pythonhosted.org/packages/76/1f/bc9f5a5aa569307076365f25afcebacb22e9c754b1bcfbaaa146627c7fda/torch-2.12.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:649e4ced014ba646f76f8cb9c9726735a6323eb321b7919f942790a923f90921", size = 532261322, upload-time = "2026-06-17T21:06:06.673Z" }, + { url = "https://files.pythonhosted.org/packages/9e/49/c549461daa008159d006a76a991fbc2f26fa8bac27a4030c858463dcb20f/torch-2.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:e86550597877fb272ddc52db2f85b82cb601ea7bd932576a0340152cae2200b3", size = 122988095, upload-time = "2026-06-17T21:07:44.9Z" }, + { url = "https://files.pythonhosted.org/packages/ff/4a/0300261818e1560d72cc160ac826005507e8b7ca0a35788b591436d05b4a/torch-2.12.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:c75e93173c700bccd6bfcc4a9d19ce242ab6dacd1f1781483027a16239b9e650", size = 87992358, upload-time = "2026-06-17T21:07:40.299Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/a7/874a5ca05e8f159211dca7921060f7057acc1adb26431e119fd150623efc/torch-2.12.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:fcb61ccd20784b62bdd78ec84238a5cfb383b4994902e03bac95505ab360884c", size = 426386134, upload-time = "2026-06-17T21:07:31.481Z" }, + { url = "https://files.pythonhosted.org/packages/e1/75/20bb8fe9c1ad6538cce8cd0391b51927ae5af0b17ed1eab44b8824465dc1/torch-2.12.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f4afc8083dff08719edbea346644476e3cec0cf40ebe256be0ee5d5b7c7e8c0d", size = 532268019, upload-time = "2026-06-17T21:05:37.925Z" }, + { url = "https://files.pythonhosted.org/packages/d1/fa/824ddb662af55b2eabc0dbb7b57c7c0b1bcd93693754a2b8509ec4d16490/torch-2.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:f92609e3b3ce72f25e2eb780d043ced2480c1a86c47c852604fc7a9108648386", size = 122987777, upload-time = "2026-06-17T21:07:09.49Z" }, + { url = "https://files.pythonhosted.org/packages/63/b7/1b49fe7086ea36839cc80abc43174c43d0ab6f676c0891c871c162f44fe3/torch-2.12.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:e9b6f7d2dd66ea87a3ae620069d31335d594c06effb1a383bdd21cfe61e44ece", size = 88010025, upload-time = "2026-06-17T21:07:03.934Z" }, + { url = "https://files.pythonhosted.org/packages/d7/06/5b44063a6545036dcc680d2d303b137d9176cfb2cc1e1863e3ef94abeb52/torch-2.12.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:7973ccd3d2cd35c74449213f7bded199bec6c6247e705cbeda7407af79703d91", size = 426392891, upload-time = "2026-06-17T21:05:52.261Z" }, + { url = "https://files.pythonhosted.org/packages/f8/dd/c9ce9a4b0eb3c5bb92d9ea56766e2c22559f0b45171149188494edcce80f/torch-2.12.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:c64ac4aac16be5e296dcd912305605804b203333c690bf98c55bc09494ee92ad", size = 532272494, upload-time = "2026-06-17T21:06:22.72Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/7c/f3a601fc1b1f663ff269bfe553654e638651939aa6563e8daa7167c33098/torch-2.12.1-cp314-cp314-win_amd64.whl", hash = "sha256:f6dc4caf7eb4adb38a2d9f536b51db56310fdd1254e69a2d96767e1367c892b3", size = 122987254, upload-time = "2026-06-17T21:06:33.199Z" }, + { url = "https://files.pythonhosted.org/packages/e6/8c/b8087556cf81ddd808dbeb34afb8396d7ae7a1694ab489f08b1a0004e7d0/torch-2.12.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:2afbb2bdaa8a95040e733f05492ddf133c3967c9b7ce0abd218d704b6cab437d", size = 88303173, upload-time = "2026-06-17T21:05:06.603Z" }, + { url = "https://files.pythonhosted.org/packages/4a/07/fe09d1699fbed2afa10ebc692ff2b99d113f2605b6748cea633989e2789a/torch-2.12.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:97eba061fcb042fed191400b15568990073d67eaacaa6ee9b7ca01dd8b790fe9", size = 426404009, upload-time = "2026-06-17T21:04:57.557Z" }, + { url = "https://files.pythonhosted.org/packages/2e/f7/0ce4f6c1962c60ded7270e0a9eb560fb615c92b89d332cf9e3dff36d5ecc/torch-2.12.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:3867b861391701012adb2df93360efb88494dca245a185e3bb7624495cfe3f33", size = 532184292, upload-time = "2026-06-17T21:05:17.526Z" }, + { url = "https://files.pythonhosted.org/packages/70/db/e384c12aba30320ca92aaaf557456cbcb26f04b4df307728bb8f019f5000/torch-2.12.1-cp314-cp314t-win_amd64.whl", hash = "sha256:dd15595f8fc764cffde8c6361a3beb6ef69a028c851b1b3e70e077f615980d4e", size = 123231142, upload-time = "2026-06-17T21:05:27.061Z" }, ] [[package]] @@ -3994,45 +4212,66 @@ wheels = [ [[package]] name = "transformers" -version = "4.57.6" +version = "5.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, - { name = "requests" }, { name = "safetensors" }, { name = "tokenizers" }, { name = "tqdm" }, + { name = "typer" }, ] -sdist = { 
url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, + { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" }, ] [[package]] name = "tree-sitter" -version = "0.25.0" +version = "0.25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, + { url = "https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, + { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, + { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, + { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, + { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/67/67492014ce32729b63d7ef318a19f9cfedd855d677de5773476caf771e96/tree_sitter-0.25.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0628671f0de69bb279558ef6b640bcfc97864fe0026d840f872728a86cd6b6cd", size = 146926, upload-time = "2025-09-25T17:37:43.041Z" }, + { url = "https://files.pythonhosted.org/packages/4e/9c/a278b15e6b263e86c5e301c82a60923fa7c59d44f78d7a110a89a413e640/tree_sitter-0.25.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f5ddcd3e291a749b62521f71fc953f66f5fd9743973fd6dd962b092773569601", size = 137712, upload-time = "2025-09-25T17:37:44.039Z" }, + { url = "https://files.pythonhosted.org/packages/54/9a/423bba15d2bf6473ba67846ba5244b988cd97a4b1ea2b146822162256794/tree_sitter-0.25.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd88fbb0f6c3a0f28f0a68d72df88e9755cf5215bae146f5a1bdc8362b772053", size = 607873, upload-time = "2025-09-25T17:37:45.477Z" }, + { url = "https://files.pythonhosted.org/packages/ed/4c/b430d2cb43f8badfb3a3fa9d6cd7c8247698187b5674008c9d67b2a90c8e/tree_sitter-0.25.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b878e296e63661c8e124177cc3084b041ba3f5936b43076d57c487822426f614", size = 636313, upload-time = "2025-09-25T17:37:46.68Z" }, + { url = "https://files.pythonhosted.org/packages/9d/27/5f97098dbba807331d666a0997662e82d066e84b17d92efab575d283822f/tree_sitter-0.25.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d77605e0d353ba3fe5627e5490f0fbfe44141bafa4478d88ef7954a61a848dae", size = 631370, upload-time = "2025-09-25T17:37:47.993Z" }, + { url = "https://files.pythonhosted.org/packages/d4/3c/87caaed663fabc35e18dc704cd0e9800a0ee2f22bd18b9cbe7c10799895d/tree_sitter-0.25.2-cp313-cp313-win_amd64.whl", hash = "sha256:463c032bd02052d934daa5f45d183e0521ceb783c2548501cf034b0beba92c9b", size = 127157, upload-time = "2025-09-25T17:37:48.967Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/23/f8467b408b7988aff4ea40946a4bd1a2c1a73d17156a9d039bbaff1e2ceb/tree_sitter-0.25.2-cp313-cp313-win_arm64.whl", hash = "sha256:b3f63a1796886249bd22c559a5944d64d05d43f2be72961624278eff0dcc5cb8", size = 113975, upload-time = "2025-09-25T17:37:49.922Z" }, + { url = "https://files.pythonhosted.org/packages/07/e3/d9526ba71dfbbe4eba5e51d89432b4b333a49a1e70712aa5590cd22fc74f/tree_sitter-0.25.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:65d3c931013ea798b502782acab986bbf47ba2c452610ab0776cf4a8ef150fc0", size = 146776, upload-time = "2025-09-25T17:37:50.898Z" }, + { url = "https://files.pythonhosted.org/packages/42/97/4bd4ad97f85a23011dd8a535534bb1035c4e0bac1234d58f438e15cff51f/tree_sitter-0.25.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bda059af9d621918efb813b22fb06b3fe00c3e94079c6143fcb2c565eb44cb87", size = 137732, upload-time = "2025-09-25T17:37:51.877Z" }, + { url = "https://files.pythonhosted.org/packages/b6/19/1e968aa0b1b567988ed522f836498a6a9529a74aab15f09dd9ac1e41f505/tree_sitter-0.25.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eac4e8e4c7060c75f395feec46421eb61212cb73998dbe004b7384724f3682ab", size = 609456, upload-time = "2025-09-25T17:37:52.925Z" }, + { url = "https://files.pythonhosted.org/packages/48/b6/cf08f4f20f4c9094006ef8828555484e842fc468827ad6e56011ab668dbd/tree_sitter-0.25.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:260586381b23be33b6191a07cea3d44ecbd6c01aa4c6b027a0439145fcbc3358", size = 636772, upload-time = "2025-09-25T17:37:54.647Z" }, + { url = "https://files.pythonhosted.org/packages/57/e2/d42d55bf56360987c32bc7b16adb06744e425670b823fb8a5786a1cea991/tree_sitter-0.25.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7d2ee1acbacebe50ba0f85fff1bc05e65d877958f00880f49f9b2af38dce1af0", size = 631522, upload-time = "2025-09-25T17:37:55.833Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/87/af9604ebe275a9345d88c3ace0cf2a1341aa3f8ef49dd9fc11662132df8a/tree_sitter-0.25.2-cp314-cp314-win_amd64.whl", hash = "sha256:4973b718fcadfb04e59e746abfbb0288694159c6aeecd2add59320c03368c721", size = 130864, upload-time = "2025-09-25T17:37:57.453Z" }, + { url = "https://files.pythonhosted.org/packages/a6/6e/e64621037357acb83d912276ffd30a859ef117f9c680f2e3cb955f47c680/tree_sitter-0.25.2-cp314-cp314-win_arm64.whl", hash = "sha256:b8d4429954a3beb3e844e2872610d2a4800ba4eb42bb1990c6a4b1949b18459f", size = 117470, upload-time = "2025-09-25T17:37:58.431Z" }, +] + +[[package]] +name = "tree-sitter-c" +version = "0.24.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/98/21/e952c3180f0fd83d09cee9e0bc29f67827c659cee45077ae06eb7d813cfc/tree-sitter-0.25.0.tar.gz", hash = "sha256:15c88775cf24db06677bafe62df058a6457d8a6dde67baa48dd3723b905e79a6", size = 177740, upload-time = "2025-07-20T13:17:48.886Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size = 228014, upload-time = "2025-05-24T17:32:58.384Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/75/36a4726a09aeb0477ca4a45aba4abf9705642b871539005ca91ddd68faa3/tree_sitter-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d9efacce0140ad74f97e027fb4ae693debff05f6246f3e024937f9500a0e874a", size = 147016, upload-time = "2025-07-20T13:17:33.921Z" }, - { url = "https://files.pythonhosted.org/packages/ff/5e/a549a21e459de94056cf48ca5e10e3774bc9b0460ffb3aec469a5f6001c0/tree_sitter-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82b4a5535107d2b8feee085edcafa89858faa4e1a98e94cfe1740c0ca8c28d84", size = 140832, upload-time = "2025-07-20T13:17:34.82Z" }, - { url = 
"https://files.pythonhosted.org/packages/d7/ed/7cc29a309e5f5cc209902c93589d29a4faeb656c7eecc1abd86842633b8f/tree_sitter-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c613372545490dfba3b3e7d934fda1156e3d16b27c0335c65a92f2b4fa6af5da", size = 617875, upload-time = "2025-07-20T13:17:35.693Z" }, - { url = "https://files.pythonhosted.org/packages/76/fc/43a61a35f021429d905ce272be9a9ea6dad6fe2c849782c53bd083a935cf/tree_sitter-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a90c815a354594d3147012ce470cfc797695ab768e29198815e147ef3c165", size = 635857, upload-time = "2025-07-20T13:17:36.676Z" }, - { url = "https://files.pythonhosted.org/packages/9b/28/c9236c505e35b3aedb3c941a359a708c173cbedab8d843fec729bab81ed9/tree_sitter-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f0b01b5068f1888af223021ba461480df28c76f39893c8113aae2154a2b81fd", size = 632649, upload-time = "2025-07-20T13:17:37.56Z" }, - { url = "https://files.pythonhosted.org/packages/13/d3/5dff82a02646619545c4e7c9b9ec87bc126f1937760228fcf2e91f5079c7/tree_sitter-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:1807bd1dae1f50721d65b270e6ffa85de84234ae39f98f4da702db56c2627e23", size = 126785, upload-time = "2025-07-20T13:17:38.488Z" }, - { url = "https://files.pythonhosted.org/packages/71/61/4fffd405569d9c1551906766825da75a2d8f1c075be8994542d5d7ba7768/tree_sitter-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:7848be6aeab5c1d62d649506d80d0e463727cb1bb55f423e88bf317db0be8d67", size = 113615, upload-time = "2025-07-20T13:17:39.965Z" }, - { url = "https://files.pythonhosted.org/packages/7a/fd/7578088dddec9b89b60d8dfea1901f3a5dff61b66d3c637c309b6209c8db/tree_sitter-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:689a19d51103f727a545ec9ba9cd377267445859838c38ec55d159dc57e82e8a", size = 147009, upload-time = "2025-07-20T13:17:41.038Z" }, - { url = 
"https://files.pythonhosted.org/packages/7a/3e/6e3dac18c119acf738174a19ce91d89b34f6ad1ca1c5dd57b245ae15c935/tree_sitter-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:86288b218ef958dcafe40030d6d70c99baffaf808bd81b49de160f9724fc0ba4", size = 140828, upload-time = "2025-07-20T13:17:42.023Z" }, - { url = "https://files.pythonhosted.org/packages/fa/21/94d26f5d488d85bf5201280f82ce7de374ce30ed5d5469e57623d64ead9a/tree_sitter-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5241610319177ee2f68b8e719bf1e1b309155e126d9cd567ff84f20878d7e5d0", size = 618600, upload-time = "2025-07-20T13:17:43.203Z" }, - { url = "https://files.pythonhosted.org/packages/67/74/e852445871c0a82bfa5e3d16541e0ce6775ef458d3a8f03ab3737c661832/tree_sitter-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ae1553d652a54926f80dc0a42fba07db110bb1a3ebaf47d1c4c64f8d44dd8207", size = 636691, upload-time = "2025-07-20T13:17:44.382Z" }, - { url = "https://files.pythonhosted.org/packages/87/67/759afe10e0018aa3ca3269df0257228b2df120e3956171a3667b133f3100/tree_sitter-0.25.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccac581551407a73a519b872553973598b69d3d237ffaf32408fb38ecb775484", size = 632730, upload-time = "2025-07-20T13:17:45.687Z" }, - { url = "https://files.pythonhosted.org/packages/8d/42/24a80dafdb32f1f7d16e3236f2ba8a2bc7b0e5c2a19c7b45f874f0980e90/tree_sitter-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:d58e912869514ebb441b15c22a13a9c78f1b69be15f6a42b1d18e3f790e5d6ba", size = 126779, upload-time = "2025-07-20T13:17:46.943Z" }, - { url = "https://files.pythonhosted.org/packages/6f/2e/6af369e9d6deab9baaa60e2fa91acf82a68c63d835a2fe4f4265674ecc53/tree_sitter-0.25.0-cp313-cp313-win_arm64.whl", hash = "sha256:a1b8302161fa8da52cfafcd7575fa7d5806a9608a0b51c7a1fe45bfe70b62d46", size = 113623, upload-time = "2025-07-20T13:17:47.718Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, + { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, + { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, upload-time = "2025-05-24T17:32:56.084Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, ] [[package]] @@ -4113,6 +4352,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/ac/2615b858c9fc6c2f5458c6375c501392ef45c486e576985393521ca50971/tree_sitter_lua-0.4.1-cp310-abi3-win_arm64.whl", hash = "sha256:081577e4ca58f3b4f1856794f3e2f5a0955476b68a2a50baf85c9bb05b932738", size = 22752, upload-time = "2025-12-31T12:50:38.117Z" }, ] +[[package]] +name = "tree-sitter-php" +version = "0.24.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/c8/1a499038cb4036bea1d560ffbc807a6fb940261aa22296bd49a62ed8bcba/tree_sitter_php-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:d56e2dcf025450f84a2cdbf4b18a09e6cb88b92e9e6858e63de3d4133ab2e43e", size = 219550, upload-time = "2025-08-16T22:14:30.212Z" }, + { url = "https://files.pythonhosted.org/packages/ab/5e/b52f2599acb29f6899470f7137d3d491c752b88df3950fb7408aea57ddca/tree_sitter_php-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:29759c67d4c27a68c227ed82c0b7e4699617b1bd23757d50c081f81a12b4f80d", size = 229632, upload-time = "2025-08-16T22:14:31.85Z" }, + { url = "https://files.pythonhosted.org/packages/6b/58/ca290da45380bd6ba7c6b0b98cc5fc30325c32c7f14f0c93196a451b19c4/tree_sitter_php-0.24.1-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94b89832ac09f078eed2acd88598838bc51012224cbcebb916dbb6a37e74357e", size = 325351, upload-time = "2025-08-16T22:14:33Z" }, + { url = "https://files.pythonhosted.org/packages/9a/c6/fd863a7a779d0ab67688939eba0e08bff7b1ffe731288d3d3610df21217b/tree_sitter_php-0.24.1-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:7a1404a30f2972498ace040b0029738b8dac45d0a12932ccb8b605eb94bafbe4", size = 313021, upload-time = "2025-08-16T22:14:34.394Z" }, + { url = "https://files.pythonhosted.org/packages/48/ed/aace12f30c4f5474a9ad0e9da85c060174e3764342c9860974bb0feb02fc/tree_sitter_php-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3e96f61462a960c78e5389c7ba6c16c25e66b465c763b8e63ad66423326c2fa7", size = 305905, upload-time = "2025-08-16T22:14:35.846Z" }, + { url = "https://files.pythonhosted.org/packages/4e/c4/6c690c33b1ae9cae9505c0a2896f046fda174d72c46bdafce6aab3b2f2e7/tree_sitter_php-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:1a1b65b72a8410d421f914ee13d38fd546a94d01cb834f69b27c78ba7589a5b5", size = 208014, upload-time = "2025-08-16T22:14:37.206Z" }, + { url = "https://files.pythonhosted.org/packages/7b/69/54c670d725c092b89e76ca6984582b6a768b128ac1859ed48141b124da1d/tree_sitter_php-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:56a70c5ef1bddb15f220a479b2f2edf3042c764b6c443921fbd7ca9174d664e3", size = 206033, upload-time = "2025-08-16T22:14:38.632Z" }, +] + [[package]] name = "tree-sitter-python" version = "0.25.0" @@ -4176,14 +4429,17 @@ wheels = [ [[package]] name = "triton" -version = "3.6.0" +version = "3.7.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, - { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" }, - { url = 
"https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" }, - { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" }, - { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" }, + { url = "https://files.pythonhosted.org/packages/94/fa/f856e24deb462d5f18bd4b5a746957862ab9b6ee5834bda60605ec348366/triton-3.7.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9497f2e696ee368862a181a90b2dcc03ca978cc4f602abd67c7d81022a6988e1", size = 184692359, upload-time = "2026-06-17T20:03:48.288Z" }, + { url = "https://files.pythonhosted.org/packages/c4/6f/fb96d15db6f36d6eae4cafb998c2e0353bf59d7c4ea1662d7497f269134a/triton-3.7.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7e40869937a68206ec70d7f25bb7ec6433cb083f9135e1f36dbd318dc449a728", size = 197719725, upload-time = "2026-06-17T19:53:20.419Z" }, + { url = "https://files.pythonhosted.org/packages/00/42/c5089d4d9327fcd1e862c599cc2927f39418f84dd11a84cb2ccff9d4787a/triton-3.7.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cdbfc09d9ec58bc5e68321525653220de7515c199e7a8097a97c85e62b52cd0a", size = 184694629, upload-time = 
"2026-06-17T20:03:53.444Z" }, + { url = "https://files.pythonhosted.org/packages/07/42/2c3ac59253ae8892b6f307875263dd23dc875cdf732d3aea40d6d41fb7cb/triton-3.7.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:58c0e131da05134a2a4788ccbcc0c1105cf0f54c8e98f19e34cd465396dc15eb", size = 197729241, upload-time = "2026-06-17T19:53:27.801Z" }, + { url = "https://files.pythonhosted.org/packages/40/71/e01aa7ad573883ed9456f130226babdec70b005e098c4d6226a6238e761b/triton-3.7.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe4ea396a06171f1f1f58cbd39c70b09294398f7dd7c620939bab54ad6f934fa", size = 184705764, upload-time = "2026-06-17T20:03:59.064Z" }, + { url = "https://files.pythonhosted.org/packages/a4/09/5683146fda6a2b569deb78ccfd8fbfea8bfe55f726b081c0a6bb18dd6f28/triton-3.7.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2020153b08280415ec0da6607834e79166442147e78e144df06b508c75b186d2", size = 197729537, upload-time = "2026-06-17T19:53:35.516Z" }, + { url = "https://files.pythonhosted.org/packages/e9/f8/448220c3092019f9fdfab39ec47985968181d67da34b44f6a7f6280a5cbb/triton-3.7.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c58e4c61f0c73b5dba3b5d19b4a7093c32f90dc18b2a7f121a7c16ccd31107b7", size = 184814760, upload-time = "2026-06-17T20:04:04.984Z" }, + { url = "https://files.pythonhosted.org/packages/f0/ac/229b7d4589d2e5937310e72c6d46e89599d16a4a12b479ffa1499fee8eb8/triton-3.7.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10ba85fa2cca4a2fbdeb36bf1cb082f2c252bda55bf9fccd74f65ec5bc647e68", size = 197824404, upload-time = "2026-06-17T19:53:42.772Z" }, ] [[package]] @@ -4285,13 +4541,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = 
"sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, ] +[[package]] +name = "uncalled-for" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/82/345cc927f7fbdae6065e7768759932fcc827fc20b29b45dfbafa2f1f7da4/uncalled_for-0.3.2.tar.gz", hash = "sha256:89f5dbcd71e2b8f47c030b1fa302e6cce2ec795d1ac565eeb6525c5fe55cb8a2", size = 50032, upload-time = "2026-05-06T13:38:25.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/25/2c87754f3a9e692315f7b811244090e68f362979fc8886b3fbd2985a1d8c/uncalled_for-0.3.2-py3-none-any.whl", hash = "sha256:0ff60b142c7d1f8070bde9d42afaa70aedc77dcc10998c227687e9c15713418e", size = 11444, upload-time = "2026-05-06T13:38:24.025Z" }, +] + [[package]] name = "urllib3" -version = "2.6.3" +version = "2.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash 
= "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, ] [[package]] @@ -4354,6 +4619,92 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, ] +[[package]] +name = "watchfiles" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/41/5e1a4bb12aac5f1493fa1bdc11154eca3b258ca4eba65d39c473fe19d8e9/watchfiles-1.2.0.tar.gz", hash = "sha256:c995fba777f1ea992f090f9236e9284cf7a5d1a0130dd5a3d82c598cacd76838", size = 108252, upload-time = "2026-05-18T04:32:04.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/2f/e42c992d2afda3108ea1c02acecc991b9f31d05c14adc2a7cee9ee211fc4/watchfiles-1.2.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:bc13eb17538be00c874699dc0abe4ee2bc8d50bb1166a6b9e175ef3fd7eb8f26", size = 400115, upload-time = "2026-05-18T04:32:02.06Z" }, + { url = "https://files.pythonhosted.org/packages/5f/8f/6af2ea19065c91d8b0ea3516fdfc8c0d349f407e8e9fbf4e5a17360de8ad/watchfiles-1.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2d95ddc1eb6914154253d239089900813f6a767e174b8e6a50e7fdacb7e4236c", size = 393659, upload-time = "2026-05-18T04:30:50.951Z" }, + { url = "https://files.pythonhosted.org/packages/13/01/b32a967c56fb3e3e5be3db52c3d3b87fa4513aa367d8ed1ad96d42952e5f/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f70d8b291ef6e88d19b1f297a6905ddb978888d9272b0d05e6f53309856bcfc", size = 453207, upload-time = "2026-05-18T04:31:04.231Z" }, + { url = 
"https://files.pythonhosted.org/packages/04/98/97557a812180338cb1abd32e1cffcc4588f59b5f23e0cb006b2ba95ba64a/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:56d8641cf834c2836922899105bd3ce3d0dfc69291d52edf0b4d0436829b34c0", size = 459273, upload-time = "2026-05-18T04:31:50.377Z" }, + { url = "https://files.pythonhosted.org/packages/e8/a8/b4b08dcb7653b8087c6586f7ce649505900e866bbcfe40dc9587af02e686/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2581a94056e55d7d0a31a823ea92bf73749c489ca2285bfdc0fbe6b2bb49d50c", size = 489927, upload-time = "2026-05-18T04:31:42.485Z" }, + { url = "https://files.pythonhosted.org/packages/50/94/3dceea03545d2e5ddfd839f0ddd5e1cecbf1697b5a428d5ba11cef6af95d/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:41bc1199f7523b3f82843c88cbb979180c949caef0342cf90968f178e5d49b01", size = 570476, upload-time = "2026-05-18T04:31:03.071Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f2/d39a5450c3532092b91f81d274360e613c2371bc874a89c7a1a3c5e8d138/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7571e4464cb6e434958f867f7f730b8ab0b75e3f8e5eac0499168486ab3c33a8", size = 465650, upload-time = "2026-05-18T04:30:12.701Z" }, + { url = "https://files.pythonhosted.org/packages/22/24/ed72f68cbc1333ca9b9f2200aa048bb6658ae41709bc1caad4310f4bdffd/watchfiles-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53a384f76b631c3ae5334ce6a52f0baa3a911eb94a4eac7f160079868b716d5", size = 456398, upload-time = "2026-05-18T04:30:13.784Z" }, + { url = "https://files.pythonhosted.org/packages/0d/64/982ef4a4e5bab5b6e5b6becc8cd5e732f6130a78b855f0abec6439a9a135/watchfiles-1.2.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:d20029a60a71a052a24c4db7673bc4de39ab89adbaccbfb5d67987c5d73f424d", size = 465140, upload-time = "2026-05-18T04:31:52.111Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/0c/95282abf4ed680b6096010bcfc30c5fa7a041fc5aa5a2ad17a2cc6c75bba/watchfiles-1.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2cb93af48550faf1cea04c303107c8b75833de7013e57ce27d3b8d21d8d0f58c", size = 630259, upload-time = "2026-05-18T04:31:25.676Z" }, + { url = "https://files.pythonhosted.org/packages/30/45/607c1de1530c4bdcf2cf1d1ecc2505ddba5d96bd43ba9f2b0e79876f850f/watchfiles-1.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2995c176de7692b86a2e4c58d9ec718f753150a979cb4a754e2b4ffa38e70906", size = 659859, upload-time = "2026-05-18T04:30:24.333Z" }, + { url = "https://files.pythonhosted.org/packages/fa/08/d9e2e0f9e8e6791d33aefc694ad7eefa7f901f63caff84a81ded38692f9c/watchfiles-1.2.0-cp312-cp312-win32.whl", hash = "sha256:7a2cffd17d27d2ecbb310c2b1d8174f222a5495b1a721894afa88ec11e25b898", size = 275480, upload-time = "2026-05-18T04:30:31.307Z" }, + { url = "https://files.pythonhosted.org/packages/1c/e6/9d42569c0102645cc8cea5d8c7d8a1e9d4ada2cb7f05f75e554b8aa2202a/watchfiles-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:f155b3a1b2a5fc89cdc70d47ee5d54e3b75e88efa34982028a35daef9ba00379", size = 288718, upload-time = "2026-05-18T04:32:10.745Z" }, + { url = "https://files.pythonhosted.org/packages/0a/26/88e0dc6ee3898169d7fa22bb6a69cabf2502d2ee25cb8c876d1262d204f8/watchfiles-1.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:8fa585ede612ee9f9e91b18bebf9ba11b9ae29a4e3a0d0cf6fca3e382133f0d5", size = 281026, upload-time = "2026-05-18T04:30:22.23Z" }, + { url = "https://files.pythonhosted.org/packages/d1/4d/70a7feced9f87e2ff26dba42667290f41694fc64646c67261fbb8cab5d5c/watchfiles-1.2.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:01ea8d66f0693b9b60a6541c8d10263091ca9a9060d242f3c1f3143f9aad2c98", size = 399730, upload-time = "2026-05-18T04:31:38.162Z" }, + { url = 
"https://files.pythonhosted.org/packages/31/3a/0da302f2307aee316922806ebd5726c542cbd787c938271cf14a074c7daf/watchfiles-1.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7ba0480b9a74af058f43b337e937a451e109295c420916d68ad24e3dc02f5e44", size = 392842, upload-time = "2026-05-18T04:30:27.051Z" }, + { url = "https://files.pythonhosted.org/packages/db/ef/d5bdb705c224dbc256aa0c1ec47bf4e61ec52558f2afb44a71a1fe4d7015/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f34e26a19f91f710c08e0183429f0d1d15df734e6bc78c31e77b9ea9c433658", size = 452989, upload-time = "2026-05-18T04:31:11.945Z" }, + { url = "https://files.pythonhosted.org/packages/71/29/5495f2c1661949ef7a35e4d71111d129cfe7606414a26887a919d0a55406/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b4e77f6a55f858504069abd35d336a637555c09bca453dde1ee1e5ada8a6a1fb", size = 458978, upload-time = "2026-05-18T04:30:52.606Z" }, + { url = "https://files.pythonhosted.org/packages/d5/8c/7f9c07c433811c2fffd93e13fdfb7135de9aab5f2ae41be08960fa0047dc/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0cb4d80e212f116474a545c21c912b445f16bb0cef9e6a73a498164223e14e2f", size = 490248, upload-time = "2026-05-18T04:31:36.003Z" }, + { url = "https://files.pythonhosted.org/packages/3c/11/d93632febc52fbc21be90231bb7c17fd5387f46c9076fd40a5f9c2ae6910/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b974946a10af379d425e2eef5b62f5c6ebeaccf91d45eaad6f5b27ecd4f91aa0", size = 571847, upload-time = "2026-05-18T04:31:10.862Z" }, + { url = "https://files.pythonhosted.org/packages/55/b4/383173e73aabb07ad1d9c7aa859d95437ac46a6d6a1e11005facda0c9d19/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86bc13c25a8d1fcd70b51d0ce7c9b65e90de5666fcbfd3e34957cc73ee19aeb5", size = 465974, upload-time = "2026-05-18T04:30:17.006Z" }, + { url = 
"https://files.pythonhosted.org/packages/a7/6c/89b1a230a78f57c52dd8893adb1f92f94411721b6ec12596c56d98c74356/watchfiles-1.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca148d73dea36c9763aaa351e4d7a51780ec1584217c45276f4fe8239c768b71", size = 454782, upload-time = "2026-05-18T04:30:35.656Z" }, + { url = "https://files.pythonhosted.org/packages/24/62/1732118367cfff0a9fce3bf62ff4bfded09ef5df21d9d446b858b3f70a96/watchfiles-1.2.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:c525543d91961c6955b2636b308569e84a1d1c5f5f2932041ab9ef46422f43e3", size = 465182, upload-time = "2026-05-18T04:30:20.846Z" }, + { url = "https://files.pythonhosted.org/packages/28/96/716f7e5f51339bf22963f3345f9f27d7f3b30e2eadc597e257c881dd3c53/watchfiles-1.2.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:a204794696ffb8f9b10fba6f7cb5216d42f3b2b71860ccac6b6e42f5f10973b0", size = 629841, upload-time = "2026-05-18T04:31:05.397Z" }, + { url = "https://files.pythonhosted.org/packages/4c/fe/c40783950fd771ccf66ab3ec2722d188a9af1c7f96c6e811f36e40c6e03f/watchfiles-1.2.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:10d86db20695afe7997ac9e1717637d6714a8d0220458c33f3d2061f54cec427", size = 658028, upload-time = "2026-05-18T04:31:48.22Z" }, + { url = "https://files.pythonhosted.org/packages/71/72/4508db1856d1d87fcbb3b63f4839bab1b5682cb0e8d224d122263c09654a/watchfiles-1.2.0-cp313-cp313-win32.whl", hash = "sha256:eb283ee99e21ad6443c8cdb06ac5b34b1308c329cbdf03fa02b445363714c799", size = 275183, upload-time = "2026-05-18T04:30:59.57Z" }, + { url = "https://files.pythonhosted.org/packages/f9/36/14b76ca57652e5cc5fd1c11f32a261292c08a0d19a00351013c2549cbfb2/watchfiles-1.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:a0f27f01bee51861392bb6b7c4fdb290b27d1eb194e9e28788d68102a0e898d9", size = 288059, upload-time = "2026-05-18T04:32:07.937Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/8d/0a85e395398d8d20fadfe5c5d32c726eee17a519e78fb356f2cf7531bffe/watchfiles-1.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:3651aa7058595e9cfb75d35dd5ada2bf9f48a5b8a0f3562821d3e210c507e077", size = 280186, upload-time = "2026-05-18T04:31:54.484Z" }, + { url = "https://files.pythonhosted.org/packages/37/68/36db056f1fdcc5f07302f56e631774d6835bcd6fa3ace402304621d5f9e5/watchfiles-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:faea288b6f0ab1902ef08f4ca6de005dccf856c4e0c4f21b8c5fce02d90a1b08", size = 399031, upload-time = "2026-05-18T04:30:44.576Z" }, + { url = "https://files.pythonhosted.org/packages/c1/64/01a9d6f66a82a5c101ce939274106cc72759d62427e153f01edd2b9f87c2/watchfiles-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:01859b11fd9fbca670f4d5da00fbac282cfea9bd67a2125d8b2833a3b5617ea9", size = 391205, upload-time = "2026-05-18T04:30:25.413Z" }, + { url = "https://files.pythonhosted.org/packages/84/2c/0a44fe058cb4bb7b8ede6b6670698bbb7c0400740e378d00022189b7b31d/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fff610d7bb2256a317bb1e96f0d7862c7aa8076733ee5df0fd41bbe76a24a4f4", size = 451892, upload-time = "2026-05-18T04:32:14.005Z" }, + { url = "https://files.pythonhosted.org/packages/67/a1/351e0d56cd35e6488b5c8b4fb11a809a5bc923e8fe8fed9faf8920be0c89/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b141a4891c995a039cd89e9a49e62df1dc8a559a5d1a6e4c7106d16c12777a55", size = 458867, upload-time = "2026-05-18T04:31:22.279Z" }, + { url = "https://files.pythonhosted.org/packages/d5/7d/9d09605187f1b838998624049fcf8bf47b73c1a3b76901fcac1782f62277/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f22943b7770483f6ea0721c6b11d022947a98eb0acae14694de034f4d0d38925", size = 490217, upload-time = "2026-05-18T04:31:43.657Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/5d/a17a16eccb182f04188cd308ec24b1a71a9b5c4e7098269cf35d9fa56d02/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1bc6195825b7dcd217968bb1f801a60fd4c16e8eeab5bedc7fe917d7d5995ab4", size = 571458, upload-time = "2026-05-18T04:32:11.875Z" }, + { url = "https://files.pythonhosted.org/packages/d3/3d/4dd457062083ab1938e5dfd45032eb425cee2ac817287ca8ff4356183e5d/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4a4b147f5dca2a5d325a06a832fb43f345751adfbc63204aec30e0d9ca965a2", size = 464707, upload-time = "2026-05-18T04:30:43.492Z" }, + { url = "https://files.pythonhosted.org/packages/c6/71/ea8c57b128f5383de74d0c7d2d9c57ad7c9a65a930c451bd25d524b295b7/watchfiles-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4543579a9bdb0c9560039b4ffddbdb39545707659fbc430ce4c10f3f68d557f9", size = 454663, upload-time = "2026-05-18T04:30:16.061Z" }, + { url = "https://files.pythonhosted.org/packages/53/fd/2e812bf938406d7db351f0703ddd3fc6c061cf30d96153a77bc79a943a44/watchfiles-1.2.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:20aa0e708b920bde876a4aa82dc7dd6ebea228a63a67cda6632c2fc87b787efa", size = 463537, upload-time = "2026-05-18T04:31:44.9Z" }, + { url = "https://files.pythonhosted.org/packages/86/56/d17a7f1dd1bc3035f1072694a551301272f1739c2d8e319c927cb9e29b38/watchfiles-1.2.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:d413349d565dab74297f2a63e84a097936be69bf8f3b3801f27f380e32040f44", size = 629194, upload-time = "2026-05-18T04:31:14.141Z" }, + { url = "https://files.pythonhosted.org/packages/be/06/f1ff66bf5cae50aa4062779a0ecd0bbaf15e466195719074078947d9a17d/watchfiles-1.2.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:f28b2725eb8cce327b9b3ab02415c853011dc55c95832fe90de6bc56f5315f72", size = 656194, upload-time = "2026-05-18T04:31:47.14Z" }, + { url = 
"https://files.pythonhosted.org/packages/e7/54/a9c7ea9a82a4ac65e7004c0a03920b5cdd2f9c3b678757d9cd425aa51d53/watchfiles-1.2.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:b8c8358484d5fa12ef34f05b7f4168eaf1932f408725ff6d023c33ec17bd79d4", size = 400205, upload-time = "2026-05-18T04:32:05.153Z" }, + { url = "https://files.pythonhosted.org/packages/aa/5d/c9ab3534374a4a67450696905d6ef16a04405448b8dc52bd752ae50423d4/watchfiles-1.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9f04b092229ad2c50126dd3c922c8822e51e605993764a33058d4a791ab42281", size = 392508, upload-time = "2026-05-18T04:30:54.849Z" }, + { url = "https://files.pythonhosted.org/packages/26/ca/1ad30103535cf0cecd7b993e8d50edc5351b1820e38f2d22e3df58962feb/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a7ce236284f002a156f70add88efe5c70879cccbb658be0822c54b1306fc09d", size = 452448, upload-time = "2026-05-18T04:30:53.727Z" }, + { url = "https://files.pythonhosted.org/packages/37/a1/ceee2cdf2afbd715fa07758d39c9859513eae411b23196f7fd039e5feedd/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b9909cc2b48468b575eefa944919e1fe8a36c5849d5c7c168f80a8c1db69398e", size = 459605, upload-time = "2026-05-18T04:30:23.312Z" }, + { url = "https://files.pythonhosted.org/packages/e8/f6/421e30fd1cb3907a84ed92ab3f1983e37ba2dca015e9a894a048418417a2/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a37faaed405c67e28e6be45a1fa4f206ef5a2860f27c237db9fa30704c38242", size = 490757, upload-time = "2026-05-18T04:30:47.358Z" }, + { url = "https://files.pythonhosted.org/packages/41/b0/55ed1b97ed08be7bba6f9a541cac15f2a858e1d74d2b07b6da70a82aab00/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9649193aa27bd9ff2e80ff29bfaa93085496c7a3a377592823cc58b77ee88add", size = 568672, upload-time = "2026-05-18T04:30:38.915Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/cf/d8ae8a80dd7bafab395ea7681c10237311bbf34d37704a8c744e7cf31fc7/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e4ff8e37f99cf1da89e255e07c9c4b37c214038c4283707bdec308cb1b0ea1f", size = 464197, upload-time = "2026-05-18T04:30:09.914Z" }, + { url = "https://files.pythonhosted.org/packages/7c/8a/3076c496ca8dafe0e8cd03fcebdfc47be4b1174b4e5b24ff6e396e6b3af2/watchfiles-1.2.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:054dc20fd2e3132b4c3883b4a00d72fd6e1f56fdaf89fccd12e8057d74cd74d7", size = 453181, upload-time = "2026-05-18T04:30:14.829Z" }, + { url = "https://files.pythonhosted.org/packages/e5/10/9745e17c98e7b8a86454df0a3c7b5686bd650383f1e9f26e4ebcbd6cc0c0/watchfiles-1.2.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:e140ed30ebde76796b686e67c182cff10ea2fbab186fafd1560f74bb5a473a6e", size = 465109, upload-time = "2026-05-18T04:30:28.123Z" }, + { url = "https://files.pythonhosted.org/packages/8f/95/8ef4a95481d3e0cb52d62a06fa6e972e81424be2d9698b91a2fecca9904c/watchfiles-1.2.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:bb7e52ecf68ba46d22df23467b87cffeb2146908aa523ebfe803019618cfda06", size = 630653, upload-time = "2026-05-18T04:31:49.304Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e4/3b3bf36b0f829b50c6ebcb8d031583863c59f923d6a6af3d485e470d0fac/watchfiles-1.2.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:23282a321c8baf9b3a3c4afff673f9fe65eb7fdc2338d765ccad9d3d1916a5ba", size = 657838, upload-time = "2026-05-18T04:31:06.497Z" }, + { url = "https://files.pythonhosted.org/packages/21/b1/6cbbb50c1f3002ab568777d44aa21206dfb8807a840990c4037523b51812/watchfiles-1.2.0-cp314-cp314-win32.whl", hash = "sha256:c0db965c5f79aa49fe672d297cf1febc5ad149b658594944f49a54a2b96270a7", size = 275108, upload-time = "2026-05-18T04:30:06.891Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/45/190ce6db8dcb4536682cf75d3889ff1a27182a58cb519d343cb6d9ea63d8/watchfiles-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:71283b39fd17e5408eb123bd37aeecfd9d54c81fc184421943208aadb879d103", size = 288441, upload-time = "2026-05-18T04:32:12.901Z" }, + { url = "https://files.pythonhosted.org/packages/74/0d/3eae1c2313ab08378431d907c3f8095ecca00f3eda33111cf4f0f2591799/watchfiles-1.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:c5c19526f4e54a00f2666a6c0e9e40d582c09e865055ea7378bf0009aab857b3", size = 280684, upload-time = "2026-05-18T04:31:26.902Z" }, + { url = "https://files.pythonhosted.org/packages/b1/75/fb64e6c25d6b5ca636d03df34ffb1c6e9873303e76d27967e045f8df088f/watchfiles-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:d73a585accffa5ae39c17264c36ec3166d2fad7000c780f5ef83b2722afb9dd2", size = 398857, upload-time = "2026-05-18T04:32:17.108Z" }, + { url = "https://files.pythonhosted.org/packages/73/4e/9f7adf01754cbf81843722ccfec169d8f26c69778281a302855cecd2ee08/watchfiles-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ae99b14c5f21e026e0e9d96f40e07d8570ebee6cafd9d8fc318354606daa7a28", size = 392413, upload-time = "2026-05-18T04:31:07.911Z" }, + { url = "https://files.pythonhosted.org/packages/47/c8/bec626bcc2d69f44b9acb24ce7d60ed7b16b73628eea747fcbd169d8edda/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4429f3b105524a10b72c3a819b091c495d2811d419c1e1e8df773a5a5974f831", size = 452409, upload-time = "2026-05-18T04:31:20.142Z" }, + { url = "https://files.pythonhosted.org/packages/00/b7/b6362068e81e7c556d155a34c35d40ac3ef42d747b06d7f6e5bf58e359c2/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:43d818978d06062d9b22c4fab2ebe44cf5213d42dc8e62bda8c2760cfa2eeb33", size = 458827, upload-time = "2026-05-18T04:32:06.219Z" }, + { url = 
"https://files.pythonhosted.org/packages/67/f8/9a813fa42afb1e0b4625e75f0479826644d3ee8dc287e093799bc01f390c/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b9f732dc58b2dbe69e464ccf8fff7a03b0dd0be439da4c0720d3558527d3d6b4", size = 490104, upload-time = "2026-05-18T04:31:56.034Z" }, + { url = "https://files.pythonhosted.org/packages/2f/bf/27dfb6094ca4c9aad21298b5525b6c53cb36121ee454331d05161e58d130/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f200104103feb097de4cab8fe4f5dd18a2026934c7dea98c55a2f5fd6d5a33b", size = 571360, upload-time = "2026-05-18T04:31:57.133Z" }, + { url = "https://files.pythonhosted.org/packages/fb/39/44a096d67270ea93df91d33877dbe91fbda3aa4f8ec2edf799d93eda8736/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ac26eefbf4af1741247d6fb68b11c49a25b2f7413fbd318a83a12aaa9cf666", size = 464644, upload-time = "2026-05-18T04:30:57.33Z" }, + { url = "https://files.pythonhosted.org/packages/0e/80/c7472203bad6268e3ef1ad260739704847898938ad7ea8b63a5131f46b50/watchfiles-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c4997d4e4a55f0d02b6cde327322daf3a0400e5df6c6b15948994bf72497925", size = 454771, upload-time = "2026-05-18T04:30:48.736Z" }, + { url = "https://files.pythonhosted.org/packages/51/cf/3b10b268b4b7f0fc26e9debb5eef1998b515887840f444cd3ec80c688755/watchfiles-1.2.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4c887eba18b7945ac73067a8b4a66f21cd46c2539b2bc68588f7be6c7eb6d26b", size = 463494, upload-time = "2026-05-18T04:31:33.826Z" }, + { url = "https://files.pythonhosted.org/packages/3d/3e/a4302545cd589262a0dc7d140e86f7688eba3f9c72776c27f7e23b8864c4/watchfiles-1.2.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:3416ff151bb6b5a8d8d11664974fbef4d9305b9b2957839ab5a270468fd8df30", size = 629383, upload-time = "2026-05-18T04:31:15.596Z" }, + { url = 
"https://files.pythonhosted.org/packages/db/99/d5649df0a9a410d45b7c882304d0b790903ac9b6e8f2cfd12114e0c6b9f2/watchfiles-1.2.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:0e831a271c035d89789cffc386b6aa1375f39f1cd25eb7ca0997e4970d152fc5", size = 656093, upload-time = "2026-05-18T04:31:58.707Z" }, + { url = "https://files.pythonhosted.org/packages/92/b9/362702539275019a54dd2e94511b31a9b89c5f9e6a21966de7eb692549fc/watchfiles-1.2.0-cp315-cp315-macosx_10_12_x86_64.whl", hash = "sha256:37a6721cdf3f65dbb13aa9503510ccb4451603ac837e44d265d7992a597e1374", size = 400109, upload-time = "2026-05-18T04:31:16.879Z" }, + { url = "https://files.pythonhosted.org/packages/8f/75/71d5ba62db781e5587bded1d944c675374bc4aa37ff33d5018d98e8b6538/watchfiles-1.2.0-cp315-cp315-macosx_11_0_arm64.whl", hash = "sha256:2b37d10b5a63bd4d87e18472d80fa525bd670586fae62e5dd580452764879b65", size = 392167, upload-time = "2026-05-18T04:31:28.058Z" }, + { url = "https://files.pythonhosted.org/packages/3c/01/c66dd95d0423fe30d31820e2d1d5bda773764131bbb6ac0cb1cf303ac328/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a105bc2283f67e8fbec74253ec2d94925de92ed72c0393f1206bf326b7b7b69", size = 452372, upload-time = "2026-05-18T04:31:00.836Z" }, + { url = "https://files.pythonhosted.org/packages/91/15/2fe99557e72f85627c6a8eed50d889e8d101623e060a22ad75b875cb932d/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5327989a465505f05cfe06f04fa9d0c2fd5432bb243e10e6f012b1bdca3c8579", size = 459596, upload-time = "2026-05-18T04:31:34.96Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/d4acfa0023367428ed48351b3b9b267893037b6cadae55620c61c24bcfd4/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ecb47f183a8025b2aa18b546725c3657e542112ae9c0613a2af79b4fa8d04ad7", size = 490869, upload-time = "2026-05-18T04:31:59.923Z" }, + { url = 
"https://files.pythonhosted.org/packages/a4/5f/3164cbdce06c9fb95c4f7b9e2f9760b5e2797af43a9ecc317ef42a23a278/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8520a4ab0e37f770afc34459c4f8f7019e153f9124dc101c15538365875d1ab2", size = 571641, upload-time = "2026-05-18T04:32:00.948Z" }, + { url = "https://files.pythonhosted.org/packages/41/e6/85d3731c55e65cd7690f3f803d24c139588aaf863e4bf2148fe7a7fa1a19/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71cd71740ed2c15211ebb237ced4e39a1cdf6f80566e5fe95428da1626f4fde6", size = 464444, upload-time = "2026-05-18T04:30:34.298Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7d/562641012b8b09872742c3b8adf9629ec479fd78f8d68ae4a0c13da8add6/watchfiles-1.2.0-cp315-cp315-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f88af53d6ddaf72179ef613ddc905e6f4785f712b49b80b3bef9f3525e6194b4", size = 453593, upload-time = "2026-05-18T04:31:23.464Z" }, + { url = "https://files.pythonhosted.org/packages/56/fe/cb8ef3d6f929d14158fdaaad9925985b7310abc9384dcd4d82dd0016fb59/watchfiles-1.2.0-cp315-cp315-manylinux_2_31_riscv64.whl", hash = "sha256:cee9d5efd929efdac5f7e58f72b3376f676b64050a91c5b99a7094c5b2317488", size = 465096, upload-time = "2026-05-18T04:31:30.384Z" }, + { url = "https://files.pythonhosted.org/packages/25/91/80908e835e100527a9267147b08c0eee1fa6ab0ffec15edc04d1d44885f7/watchfiles-1.2.0-cp315-cp315-musllinux_1_1_aarch64.whl", hash = "sha256:b718bf356bbc15e559bd8ef41782b573b8ae0e3f177ab244b440568d7ea02cfb", size = 630638, upload-time = "2026-05-18T04:30:49.89Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/95ab2f256bb4af3cb2eb23b9317bda984ee6e0f11733a5c004a6c95b06e3/watchfiles-1.2.0-cp315-cp315-musllinux_1_1_x86_64.whl", hash = "sha256:922c0e019fe68b3ae392965a766b02a71ba1168c932cebc3733cd52c5fe5b377", size = 657684, upload-time = "2026-05-18T04:31:32.027Z" }, +] + [[package]] name = "wcmatch" version = "8.5.2" 
@@ -4466,10 +4817,11 @@ wheels = [ [[package]] name = "xai-sdk" -version = "1.5.0" +version = "1.12.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, + { name = "googleapis-common-protos" }, { name = "grpcio" }, { name = "opentelemetry-sdk" }, { name = "packaging" }, @@ -4477,9 +4829,9 @@ dependencies = [ { name = "pydantic" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b9/54/378c681c2c4512de78b49b65af1b7aaea0e0740dfa4a3389535e65422f70/xai_sdk-1.5.0.tar.gz", hash = "sha256:f88529d844f962fbb24464351a5962cc21a7d080e088bf656709ca7856270c8c", size = 349692, upload-time = "2025-12-05T03:27:36.93Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/21/b6683eeb797bac6dd46e55e9fbdb15c598b34fadd862120da4c09d1d01d0/xai_sdk-1.12.2.tar.gz", hash = "sha256:917d1887e6afdb49fff9f0dc6ae1bceede43a747365a406a3486af3e23509be4", size = 414440, upload-time = "2026-05-07T00:07:01.244Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d6/34/cd3681e5f786e37fb2dbb195fa3d5eb2a5e2be9b20d3abf01b40c9aba839/xai_sdk-1.5.0-py3-none-any.whl", hash = "sha256:4dc56bec2d67811c67030a50b42c4a1bc60f43947d4baaa840acf0aef246e816", size = 204314, upload-time = "2025-12-05T03:27:35.67Z" }, + { url = "https://files.pythonhosted.org/packages/99/b1/76da151f71a2dc9a65ef725ad4bac597a8d02da6618fb0474468a3355a34/xai_sdk-1.12.2-py3-none-any.whl", hash = "sha256:a3b4079f0629637009c5e3d58388f8c88591658dde31f202d5f5e8560fe6e120", size = 256654, upload-time = "2026-05-07T00:06:59.56Z" }, ] [[package]]