Skip to content

[BUG] bftree-spherical quantization shows recall 0 #1189

Description

@snash4

Actual Behavior

I am using graph-index-build-bftree-spherical-quantization, based on the benchmark example, to build an index on the Ada dataset. I tried different configurations, but the reported recall is always 0.

Ls,   KNN,    Avg cmps,   Avg hops,   QPS - mean(max),             Avg Latency,             p99 Latency,   Recall,   Threads
=============================================================================================================================
 50,    50,   1821.4697,    60.7184,   1361.5 (1399.9),   36914.7us (37347.6us),     82491.0us (83761us),        0,        56
100,    50,   2886.2495,   110.5933,     810.8 (816.8),   61367.2us (62271.0us),   133940.4us (136886us),        0,        56

Example Code

This is the configuration file:

{
    "search_directories": [
        "https://github.com/ssd2/nash/anndata/ada/"
    ],
    "jobs": [
        {
            "type": "graph-index-build-bftree-spherical-quantization",
            "content": {
                "build": {
                    "data_type": "float32",
                    "data": "ada_002_1000000_base_vectors.bin",
                    "distance": "squared_l2",
                    "max_degree": 64,
                    "l_build": 50,
                    "insert_retry": null,
                    "start_point_strategy": "medoid",
                    "alpha": 1.2,
                    "backedge_ratio": 1.0,
                    "num_threads": 56,
                    "multi_insert": null
                },
                "search_phase": {
                    "search-type": "topk",
                    "queries": "ada_002_1000000_query_vectors_10000.bin",
                    "groundtruth": "https://github.com/ssd2/nash/anndata/ada/ada_new_gt10",
                    "reps": 5,
                    "num_threads": [
                        56
                    ],
                    "runs": [
                        {
                            "search_n": 50,
                            "search_l": [
                                50,
                                100 
                            ],
                            "recall_k": 10
                        }
                    ]
                },
                "seed": 42,
                "num_bits": 2,
                "pre_scale": "reciprocal_mean_norm",
                "transform_kind": "null",
                "vector_store_config": {
                    "cb_size_byte": 67108864,
                    "leaf_page_size": 4096,
                    "cb_max_record_size": null,
                    "cb_min_record_size": null,
                    "read_promotion_rate": null,
                    "scan_promotion_rate": null,
                    "cb_copy_on_access_ratio": null,
                    "read_record_cache": null,
                    "cache_only": null
                },
                "neighbor_store_config": {
                    "cb_size_byte": 67108864,
                    "leaf_page_size": 4096,
                    "cb_max_record_size": null,
                    "cb_min_record_size": null,
                    "read_promotion_rate": null,
                    "scan_promotion_rate": null,
                    "cb_copy_on_access_ratio": null,
                    "read_record_cache": null,
                    "cache_only": null
                },
                "quant_store_config": {
                    "cb_size_byte": 67108864,
                    "leaf_page_size": 4096,
                    "cb_max_record_size": null,
                    "cb_min_record_size": null,
                    "read_promotion_rate": null,
                    "scan_promotion_rate": null,
                    "cb_copy_on_access_ratio": null,
                    "read_record_cache": null,
                    "cache_only": null
                }
            }
        }
    ]
}

Dataset Description

Please tell us about the shape and datatype of your data (e.g., 128 dimensions, 12.3 billion points, floats).

  • Dimensions: 1536
  • Number of Points: 1M
  • Data type: float32

Your Environment

  • DiskANN version (or commit built from): Commit 3aa44ac

Additional Details

A similar configuration works on the SIFT 100k dataset.
However, on the wikipedia-cohere dataset (768 dimensions), it again shows a recall of 0.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    Status
    No status

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions