Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,62 @@
|
||||
package parse
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
// ErrScanPDF is the sentinel error ExtractPDF returns when the extracted text
// is empty or non-textual (isGibberish: fewer than 50 runes, or under 2%
// letters and digits). That outcome is treated as a sign of a scan-based
// (image-only) PDF that cannot be parsed as text.
|
||||
|
||||
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
|
||||
|
||||
// ExtractPDF reads a PDF and returns its concatenated plain text.
|
||||
// Returns ErrScanPDF if the content appears to be empty or non-textual.
|
||||
func ExtractPDF(r io.Reader) (string, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read pdf: %w", err)
|
||||
}
|
||||
reader, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse pdf: %w", err)
|
||||
}
|
||||
var sb strings.Builder
|
||||
for i := 1; i <= reader.NumPage(); i++ {
|
||||
page := reader.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sb.WriteString(text)
|
||||
sb.WriteByte('\n')
|
||||
}
|
||||
text := sb.String()
|
||||
if isGibberish(text) {
|
||||
return "", ErrScanPDF
|
||||
}
|
||||
return text, nil
|
||||
}
|
||||
|
||||
// isGibberish reports whether text is too short or too non-textual to be
// treated as real extracted content. It returns true when text has fewer than
// 50 runes, or when fewer than 2% of its runes are letters or digits.
//
// Runes are counted while ranging over the string, so no []rune copy of the
// whole document is allocated.
func isGibberish(text string) bool {
	const (
		minRunes      = 50   // anything shorter is considered empty
		minAlnumRatio = 0.02 // minimum share of letters and digits
	)

	var total, alnum int
	for _, c := range text {
		total++
		if unicode.IsLetter(c) || unicode.IsDigit(c) {
			alnum++
		}
	}
	if total < minRunes {
		return true
	}
	return float64(alnum)/float64(total) < minAlnumRatio
}
|
||||
Reference in New Issue
Block a user