Moderation Kit
Multi-category content safety classification with configurable severity thresholds and actions.
Configuration
Severity threshold: 0.7, adjustable on a scale from Lenient to Strict.
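The docs above do not spell out how the threshold is applied. The sketch below assumes each category receives a score in [0, 1] and is flagged once its score meets or exceeds the configured threshold, so a lower threshold behaves more strictly; the score values are purely illustrative.

// Assumed threshold semantics (not specified above): each category score is in
// [0, 1]; a category is flagged once its score meets or exceeds the threshold.
const threshold = 0.7
const scores: Record<string, number> = { hate: 0.12, harassment: 0.81, violence: 0.05 } // illustrative values
const flaggedCategories = Object.entries(scores)
  .filter(([, score]) => score >= threshold)
  .map(([category]) => category)
// => ['harassment']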
Integration Code
import { createModerator } from 'agent-tools-kit/safety'

const moderator = createModerator({
  categories: ['hate', 'harassment', 'violence', 'self-harm', 'sexual', 'spam'],
  language: 'en',
  threshold: 0.7,
  actions: {
    onFlag: 'block',
    onPass: 'allow',
    webhookUrl: process.env.MODERATION_WEBHOOK, // optional
  },
})

// Middleware: validate user input before sending it to the LLM.
// `app` is assumed to be an existing Express instance.
app.use('/api/chat', async (req, res, next) => {
  const result = await moderator.classify(req.body.message)
  if (result.flagged) {
    // Log for compliance
    await moderator.logViolation({
      userId: req.user.id, // assumes upstream auth middleware populated req.user
      content: req.body.message,
      categories: result.flaggedCategories,
      scores: result.scores,
    })
    return res.status(422).json({
      error: 'Content policy violation',
      categories: result.flaggedCategories,
    })
  }

  // Also moderate the LLM output once the response has been sent;
  // flagged responses are quarantined after the fact.
  res.on('finish', async () => {
    const outputCheck = await moderator.classify(res.locals.aiResponse)
    if (outputCheck.flagged) {
      await moderator.quarantine(res.locals.responseId)
    }
  })

  next()
})
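For reference, the result object consumed above implies a shape roughly like the following. The field names (flagged, flaggedCategories, scores) come from the integration code; the exact types and the standalone call pattern are assumptions.

// Sketch of the result shape implied by the integration code (types are assumptions).
interface ModerationResult {
  flagged: boolean                 // true when at least one category crossed the threshold
  flaggedCategories: string[]      // e.g. ['harassment', 'violence']
  scores: Record<string, number>   // per-category scores keyed by category name
}

// Minimal standalone check outside of the Express middleware:
const check: ModerationResult = await moderator.classify('user-submitted text to screen')
if (check.flagged) {
  console.warn('flagged:', check.flaggedCategories, check.scores)
}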