RakunNakun-AI/migration.js
require('dotenv').config();
const crypto = require('crypto');
const { MongoClient } = require('mongodb');
const { getVoyageEmbeddings } = require('./embedding');
const MONGO_URL = process.env.MONGO_URL;
const DB_NAME = process.env.MONGO_DB_NAME;
const COLLECTION_NAME = process.env.MONGO_COLLECTION_NAME;
const BATCH_SIZE = 500;
// Toggle: set true to overwrite all embeddings, false to update only missing ones
const OVERWRITE_EMBEDDINGS = true;
// SHA-256 of the normalised (trimmed, lower-cased) text, stored alongside each embedding
function computeHash(input) {
  return crypto.createHash('sha256').update(input.trim().toLowerCase()).digest('hex');
}
(async () => {
  const client = new MongoClient(MONGO_URL);
  await client.connect();

  const db = client.db(DB_NAME);
  const collection = db.collection(COLLECTION_NAME);

  // Select documents based on the toggle
  const query = OVERWRITE_EMBEDDINGS
    ? {} // all documents
    : {
        $or: [
          { embedding: { $exists: false } },
          { hash: { $exists: false } }
        ]
      };

  const cursor = collection.find(query);
  let updatedCount = 0;
  let batchDocs = [];

  while (await cursor.hasNext()) {
    const doc = await cursor.next();

    if (!OVERWRITE_EMBEDDINGS) {
      // Only push docs missing embedding or hash
      const needsEmbedding = !doc.embedding;
      const needsHash = !doc.hash;
      if (!needsEmbedding && !needsHash) continue;
    }

    batchDocs.push(doc);

    // Embed and persist a full batch
    if (batchDocs.length === BATCH_SIZE) {
      const texts = batchDocs.map(d => d.key);
      try {
        const embeddings = await getVoyageEmbeddings(texts);
        for (let i = 0; i < batchDocs.length; i++) {
          const hash = computeHash(batchDocs[i].key);
          await collection.updateOne(
            { _id: batchDocs[i]._id },
            { $set: { embedding: embeddings[i], hash } }
          );
          console.log(`✅ Updated embedding & hash for: ${batchDocs[i].key}`);
          updatedCount++;
        }
      } catch (err) {
        console.warn(`⚠️ Failed batch: ${err.message}`);
      }
      batchDocs = [];
      // Brief pause between batches to be gentle on the embedding API
      await new Promise(r => setTimeout(r, 1000));
    }
  }

  // Flush the final partial batch, if any
  if (batchDocs.length > 0) {
    const texts = batchDocs.map(d => d.key);
    try {
      const embeddings = await getVoyageEmbeddings(texts);
      for (let i = 0; i < batchDocs.length; i++) {
        const hash = computeHash(batchDocs[i].key);
        await collection.updateOne(
          { _id: batchDocs[i]._id },
          { $set: { embedding: embeddings[i], hash } }
        );
        console.log(`✅ Updated embedding & hash for: ${batchDocs[i].key}`);
        updatedCount++;
      }
    } catch (err) {
      console.warn(`⚠️ Failed batch: ${err.message}`);
    }
  }

  console.log(`🎉 Migration complete. ${updatedCount} entries updated.`);
  await client.close();
})();
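
The script is run with node migration.js once MONGO_URL, MONGO_DB_NAME and MONGO_COLLECTION_NAME are set in .env, and it imports getVoyageEmbeddings from a sibling ./embedding module that is not shown here. Below is a minimal sketch of what that helper could look like, assuming Voyage AI's REST embeddings endpoint, a VOYAGE_API_KEY environment variable, and Node 18+ for the global fetch; the model name 'voyage-2' is likewise an assumption, not taken from this repository.

// embedding.js — hypothetical sketch, not the repository's actual module
require('dotenv').config();

async function getVoyageEmbeddings(texts) {
  // POST the batch of texts to the Voyage embeddings endpoint (assumed URL and auth scheme)
  const res = await fetch('https://api.voyageai.com/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`
    },
    body: JSON.stringify({ input: texts, model: 'voyage-2' }) // assumed model name
  });
  if (!res.ok) throw new Error(`Voyage API error: ${res.status}`);
  const json = await res.json();

  // The response lists one embedding per input, in order; return just the vectors
  return json.data.map(d => d.embedding);
}

module.exports = { getVoyageEmbeddings };

Note that the migration sends 500 keys per call; if the embeddings API caps the number of inputs per request, the real helper would need to split each batch into smaller chunks before calling the endpoint.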