e53e7662e9
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
63 lines
1.4 KiB
Go
63 lines
1.4 KiB
Go
package parse
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/ledongthuc/pdf"
|
|
)
|
|
|
|
// ErrScanPDF is the sentinel error returned by ExtractPDF when the extracted
// text is empty or non-textual, indicating a scan-based (image-only) PDF that
// cannot be parsed. Callers should compare against it with errors.Is.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")
|
|
|
|
// ExtractPDF reads a PDF and returns its concatenated plain text.
|
|
// Returns ErrScanPDF if the content appears to be empty or non-textual.
|
|
func ExtractPDF(r io.Reader) (string, error) {
|
|
data, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return "", fmt.Errorf("read pdf: %w", err)
|
|
}
|
|
reader, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
|
|
if err != nil {
|
|
return "", fmt.Errorf("parse pdf: %w", err)
|
|
}
|
|
var sb strings.Builder
|
|
for i := 1; i <= reader.NumPage(); i++ {
|
|
page := reader.Page(i)
|
|
if page.V.IsNull() {
|
|
continue
|
|
}
|
|
text, err := page.GetPlainText(nil)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sb.WriteString(text)
|
|
sb.WriteByte('\n')
|
|
}
|
|
text := sb.String()
|
|
if isGibberish(text) {
|
|
return "", ErrScanPDF
|
|
}
|
|
return text, nil
|
|
}
|
|
|
|
// isGibberish reports whether text is unlikely to be usable extracted text:
// it returns true when text has fewer than 50 runes, or when fewer than 2% of
// its runes are letters or digits.
//
// Length is measured in runes, not bytes, so multi-byte characters count once.
// The string is scanned in a single pass without first copying it into a
// []rune slice.
func isGibberish(text string) bool {
	const (
		minRunes      = 50   // anything shorter is treated as empty output
		minAlnumRatio = 0.02 // minimum share of letters/digits
	)

	var total, alnum int
	for _, c := range text {
		total++
		if unicode.IsLetter(c) || unicode.IsDigit(c) {
			alnum++
		}
	}

	if total < minRunes {
		return true
	}
	return float64(alnum)/float64(total) < minAlnumRatio
}
|