mirror of
https://github.com/NekoMonci12/RakunNakun-AI.git
synced 2025-12-19 14:59:15 +00:00
95 lines
2.7 KiB
JavaScript
95 lines
2.7 KiB
JavaScript
require('dotenv').config();
|
|
const crypto = require('crypto');
|
|
const { MongoClient } = require('mongodb');
|
|
const { getVoyageEmbeddings } = require('./embedding');
|
|
|
|
const MONGO_URL = process.env.MONGO_URL;
|
|
const DB_NAME = process.env.MONGO_DB_NAME;
|
|
const COLLECTION_NAME = process.env.MONGO_COLLECTION_NAME;
|
|
const BATCH_SIZE = 500;
|
|
|
|
// Toggle: set true to overwrite all embeddings, false to update only missing ones
|
|
const OVERWRITE_EMBEDDINGS = true;
|
|
|
|
function computeHash(input) {
|
|
return crypto.createHash('sha256').update(input.trim().toLowerCase()).digest('hex');
|
|
}
|
|
|
|
(async () => {
|
|
const client = new MongoClient(MONGO_URL);
|
|
await client.connect();
|
|
const db = client.db(DB_NAME);
|
|
const collection = db.collection(COLLECTION_NAME);
|
|
|
|
// Select documents based on the toggle
|
|
const query = OVERWRITE_EMBEDDINGS
|
|
? {} // all documents
|
|
: {
|
|
$or: [
|
|
{ embedding: { $exists: false } },
|
|
{ hash: { $exists: false } }
|
|
]
|
|
};
|
|
|
|
const cursor = collection.find(query);
|
|
|
|
let updatedCount = 0;
|
|
let batchDocs = [];
|
|
|
|
while (await cursor.hasNext()) {
|
|
const doc = await cursor.next();
|
|
|
|
if (!OVERWRITE_EMBEDDINGS) {
|
|
// Only push docs missing embedding or hash
|
|
const needsEmbedding = !doc.embedding;
|
|
const needsHash = !doc.hash;
|
|
if (!needsEmbedding && !needsHash) continue;
|
|
}
|
|
|
|
batchDocs.push(doc);
|
|
|
|
if (batchDocs.length === BATCH_SIZE) {
|
|
const texts = batchDocs.map(d => d.key);
|
|
try {
|
|
const embeddings = await getVoyageEmbeddings(texts);
|
|
|
|
for (let i = 0; i < batchDocs.length; i++) {
|
|
const hash = computeHash(batchDocs[i].key);
|
|
await collection.updateOne(
|
|
{ _id: batchDocs[i]._id },
|
|
{ $set: { embedding: embeddings[i], hash } }
|
|
);
|
|
console.log(`✅ Updated embedding & hash for: ${batchDocs[i].key}`);
|
|
updatedCount++;
|
|
}
|
|
} catch (err) {
|
|
console.warn(`⚠️ Failed batch: ${err.message}`);
|
|
}
|
|
|
|
batchDocs = [];
|
|
await new Promise(r => setTimeout(r, 1000));
|
|
}
|
|
}
|
|
|
|
if (batchDocs.length > 0) {
|
|
const texts = batchDocs.map(d => d.key);
|
|
try {
|
|
const embeddings = await getVoyageEmbeddings(texts);
|
|
for (let i = 0; i < batchDocs.length; i++) {
|
|
const hash = computeHash(batchDocs[i].key);
|
|
await collection.updateOne(
|
|
{ _id: batchDocs[i]._id },
|
|
{ $set: { embedding: embeddings[i], hash } }
|
|
);
|
|
console.log(`✅ Updated embedding & hash for: ${batchDocs[i].key}`);
|
|
updatedCount++;
|
|
}
|
|
} catch (err) {
|
|
console.warn(`⚠️ Failed batch: ${err.message}`);
|
|
}
|
|
}
|
|
|
|
console.log(`🎉 Migration complete. ${updatedCount} entries updated.`);
|
|
await client.close();
|
|
})();
|