feat: New Fabric sample datasets (#2219)

* add two new fabric sample datasets.

* Update Fabric Home with two sample datasets. One regular and one for vector search.

* Update specs for sample data container

* Add telemetry instead of console log

* Add sampleDataFile to telemetry when importing sample data

---------

Co-authored-by: Mark Brown <mjbrown@microsoft.com>
Co-authored-by: Laurent Nguyen <languye@microsoft.com>
This commit is contained in:
Laurent Nguyen
2025-10-03 17:31:05 +02:00
committed by GitHub
parent 909957a9a1
commit cd27814fad
10 changed files with 388404 additions and 26 deletions

View File

@@ -1,9 +1,13 @@
import { JSONObject } from "@azure/cosmos";
import { BackendDefaults } from "Common/Constants";
import { createCollection } from "Common/dataAccess/createCollection";
import Explorer from "Explorer/Explorer";
import { useDatabases } from "Explorer/useDatabases";
import { DEFAULT_FABRIC_NATIVE_CONTAINER_THROUGHPUT, isFabricNative } from "Platform/Fabric/FabricUtil";
import { Action, ActionModifiers } from "Shared/Telemetry/TelemetryConstants";
import * as DataModels from "../../Contracts/DataModels";
import * as ViewModels from "../../Contracts/ViewModels";
import * as TelemetryProcessor from "../../Shared/Telemetry/TelemetryProcessor";
/**
* Public for unit tests
@@ -26,12 +30,20 @@ const hasContainer = (
export const checkContainerExists = (databaseName: string, containerName: string) =>
hasContainer(databaseName, containerName, useDatabases.getState().databases);
export enum SampleDataFile {
COPILOT = "Copilot",
FABRIC_SAMPLE_DATA = "FabricSampleData",
FABRIC_SAMPLE_VECTOR_DATA = "FabricSampleVectorData",
}
export const createContainer = async (
databaseName: string,
containerName: string,
explorer: Explorer,
sampleDataFile: SampleDataFile,
): Promise<ViewModels.Collection> => {
const createRequest: DataModels.CreateCollectionParams = {
autoPilotMaxThroughput: isFabricNative() ? DEFAULT_FABRIC_NATIVE_CONTAINER_THROUGHPUT : undefined,
createNewDatabase: false,
collectionId: containerName,
databaseId: databaseName,
@@ -41,6 +53,44 @@ export const createContainer = async (
kind: "Hash",
version: BackendDefaults.partitionKeyVersion,
},
vectorEmbeddingPolicy:
sampleDataFile === SampleDataFile.FABRIC_SAMPLE_VECTOR_DATA
? {
vectorEmbeddings: [
{
path: "/descriptionVector",
dataType: "float32",
distanceFunction: "cosine",
dimensions: 512,
},
],
}
: undefined,
indexingPolicy:
sampleDataFile === SampleDataFile.FABRIC_SAMPLE_VECTOR_DATA
? {
automatic: true,
indexingMode: "consistent",
includedPaths: [
{
path: "/*",
},
],
excludedPaths: [
{
path: '/"_etag"/?',
},
],
fullTextIndexes: [],
vectorIndexes: [
{
path: "/descriptionVector",
type: "quantizedFlat",
quantizationByteSize: 64,
},
],
}
: undefined,
};
await createCollection(createRequest);
await explorer.refreshAllDatabases();
@@ -55,10 +105,39 @@ export const createContainer = async (
const SAMPLE_DATA_PARTITION_KEY = "category"; // This pkey is specifically set for queryCopilotSampleData.json below
export const importData = async (collection: ViewModels.Collection): Promise<void> => {
// TODO: keep same chunk as ContainerSampleGenerator
const dataFileContent = await import(
/* webpackChunkName: "queryCopilotSampleData" */ "../../../sampleData/queryCopilotSampleData.json"
);
await collection.bulkInsertDocuments(dataFileContent.data);
export const importData = async (sampleDataFile: SampleDataFile, collection: ViewModels.Collection): Promise<void> => {
let documents: JSONObject[] = undefined;
switch (sampleDataFile) {
case SampleDataFile.COPILOT:
documents = (
await import(/* webpackChunkName: "queryCopilotSampleData" */ "../../../sampleData/queryCopilotSampleData.json")
).data;
break;
case SampleDataFile.FABRIC_SAMPLE_DATA:
documents = (await import(/* webpackChunkName: "fabricSampleData" */ "../../../sampleData/fabricSampleData.json"))
.default;
break;
case SampleDataFile.FABRIC_SAMPLE_VECTOR_DATA:
documents = (
await import(
/* webpackChunkName: "fabricSampleDataVectors" */ "../../../sampleData/fabricSampleDataVectors.json"
)
).default;
break;
default:
throw new Error(`Unknown sample data file: ${sampleDataFile}`);
}
if (!documents) {
throw new Error(`Failed to load sample data file: ${sampleDataFile}`);
}
// Time it
const start = performance.now();
await collection.bulkInsertDocuments(documents);
const end = performance.now();
TelemetryProcessor.trace(Action.ImportSampleData, ActionModifiers.Success, {
documentsCount: documents.length,
durationMs: end - start,
sampleDataFile,
});
};