package parse

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/ledongthuc/pdf"
)

// ErrScanPDF is returned when extracted text is empty or non-textual,
// indicating a scan-based (image-only) PDF that cannot be parsed.
var ErrScanPDF = errors.New("scan-based PDF: please convert to text first")

// ExtractPDF reads a PDF from r and returns the plain text of all its pages,
// concatenated in page order with a newline written after each page.
//
// The whole input is buffered in memory first, because pdf.NewReader is handed
// a bytes.Reader over the data together with its total length.
//
// Errors:
//   - a wrapped read error ("read pdf: ...") if r cannot be read;
//   - a wrapped parse error ("parse pdf: ...") if the PDF reader cannot be built;
//   - ErrScanPDF if the extracted text is judged empty or non-textual
//     by isGibberish.
//
// Pages that are null, or whose text extraction fails, are skipped rather than
// aborting the whole document (best-effort extraction).
func ExtractPDF(r io.Reader) (string, error) {
	data, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read pdf: %w", err)
	}

	reader, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", fmt.Errorf("parse pdf: %w", err)
	}

	var sb strings.Builder
	// Page numbers are 1-based; query the page count once instead of on
	// every loop iteration.
	numPages := reader.NumPage()
	for i := 1; i <= numPages; i++ {
		page := reader.Page(i)
		if page.V.IsNull() {
			continue
		}
		pageText, err := page.GetPlainText(nil)
		if err != nil {
			// Deliberate best-effort: one unreadable page should not
			// discard the text of the remaining pages.
			continue
		}
		sb.WriteString(pageText)
		sb.WriteByte('\n')
	}

	text := sb.String()
	if isGibberish(text) {
		return "", ErrScanPDF
	}
	return text, nil
}

// isGibberish reports whether text is too short (fewer than 50 runes) or has
// less than 2% alphanumeric content (letters and digits, counted per rune over
// all runes including whitespace).
func isGibberish(text string) bool {
	const (
		minRunes      = 50   // anything shorter is treated as "no real text"
		minAlnumRatio = 0.02 // minimum fraction of letter/digit runes
	)

	// Count runes without materialising a []rune copy of the whole document.
	total := utf8.RuneCountInString(text)
	if total < minRunes {
		return true
	}

	var alnum int
	for _, c := range text {
		if unicode.IsLetter(c) || unicode.IsDigit(c) {
			alnum++
		}
	}
	return float64(alnum)/float64(total) < minAlnumRatio
}