Moderation Kit

Multi-category content safety classification with configurable severity thresholds and actions.
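
The middleware in the integration code reads three fields off each classification result. A sketch of the shape that implies (the field names come from that code; the exact types are assumptions):

// Inferred from the fields used in the integration code below;
// the types are assumptions, not documented API.
interface ModerationResult {
  flagged: boolean               // true when any category crosses the threshold
  flaggedCategories: string[]    // e.g. ['harassment', 'violence']
  scores: Record<string, number> // per-category severity, assumed to be 0..1
}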

Configuration

Severity threshold: 0.7, adjustable on a Lenient-to-Strict scale.
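
Assuming per-category scores fall in [0, 1], the threshold is the flag point, and the Lenient/Strict control presumably just moves it. A sketch of that comparison (the mapping and the logic are assumptions, not taken from the kit):

// Hypothetical slider-to-threshold mapping: Lenient flags less, Strict flags more.
const thresholds = { lenient: 0.9, default: 0.7, strict: 0.4 }

// A message is flagged when any category's score meets the active threshold.
const isFlagged = (scores: Record<string, number>, threshold: number) =>
  Object.values(scores).some((score) => score >= threshold)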

Sample Inputs
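
A hedged sketch of exercising the classifier on two illustrative inputs (the strings and the commented results are made up; only the classify call matches the integration code below):

// Illustrative only: real scores depend on the model behind the kit.
const benign = await moderator.classify('What time does the store open?')
console.log(benign.flagged) // expected: false

const hostile = await moderator.classify('I will make you regret this.')
console.log(hostile.flaggedCategories) // plausibly: ['harassment', 'violence']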

Integration Code

import { createModerator } from 'agent-tools-kit/safety'

const moderator = createModerator({
  categories: ['hate', 'harassment', 'violence', 'self-harm', 'sexual', 'spam'],
  language: 'en',
  threshold: 0.7,
  actions: {
    onFlag: 'block',
    onPass: 'allow',
    webhookUrl: process.env.MODERATION_WEBHOOK, // optional
  }
})

// Middleware: validate user input before it reaches the LLM
app.use('/api/chat', async (req, res, next) => {
  try {
    const result = await moderator.classify(req.body.message)

    if (result.flagged) {
      // Log for compliance (req.user assumes an upstream auth middleware)
      await moderator.logViolation({
        userId: req.user.id,
        content: req.body.message,
        categories: result.flaggedCategories,
        scores: result.scores,
      })
      return res.status(422).json({
        error: 'Content policy violation',
        categories: result.flaggedCategories,
      })
    }

    // Also moderate the LLM output once the response has been sent;
    // the route handler must stash it on res.locals (see the sketch below)
    res.on('finish', async () => {
      if (!res.locals.aiResponse) return // nothing to check
      try {
        const outputCheck = await moderator.classify(res.locals.aiResponse)
        if (outputCheck.flagged) {
          await moderator.quarantine(res.locals.responseId)
        }
      } catch (err) {
        // The response is already sent; log rather than throw
        console.error('output moderation failed:', err)
      }
    })

    next()
  } catch (err) {
    // Express 4 does not forward rejected async middleware on its own
    next(err)
  }
})
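
For the finish-time output check to run, something downstream has to put the model's reply on res.locals. A sketch of a route handler doing that (generateReply and the response ID scheme are placeholders, not part of the kit):

app.post('/api/chat', async (req, res) => {
  const reply = await generateReply(req.body.message) // placeholder LLM call
  res.locals.aiResponse = reply.text
  res.locals.responseId = reply.id // whatever ID quarantine() expects
  res.json({ reply: reply.text })
})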