Phase 3: PDF/DOCX extraction, chunking, LLM client with mock interface
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
package parse
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ExtractDOCX reads a .docx file and returns its plain text.
|
||||
// DOCX is a ZIP archive; we unzip word/document.xml, walk <w:t> nodes
|
||||
// for text, and emit a newline at each <w:p> boundary.
|
||||
func ExtractDOCX(r io.Reader) (string, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read docx: %w", err)
|
||||
}
|
||||
zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open docx zip: %w", err)
|
||||
}
|
||||
var docFile *zip.File
|
||||
for _, f := range zr.File {
|
||||
if f.Name == "word/document.xml" {
|
||||
docFile = f
|
||||
break
|
||||
}
|
||||
}
|
||||
if docFile == nil {
|
||||
return "", errors.New("word/document.xml not found in docx")
|
||||
}
|
||||
rc, err := docFile.Open()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("open document.xml: %w", err)
|
||||
}
|
||||
defer rc.Close()
|
||||
return parseDocXML(rc)
|
||||
}
|
||||
|
||||
// parseDocXML extracts plain text from the WordprocessingML stream r.
//
// Elements are matched on their local name only (the namespace is ignored):
//   - character data inside <t> is copied to the output;
//   - the end of each <p> emits a newline;
//   - <tab> emits a tab, and <br> and <cr> emit a newline, so the text on
//     either side of an in-run tab or line break is not fused together.
//
// <tab> elements nested inside <tabs> are tab-stop definitions (paragraph
// formatting), not tab characters, and produce no output.
//
// Leading and trailing whitespace is trimmed from the result. If the XML
// cannot be tokenized, an error wrapping the decoder's error is returned.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)
	var (
		sb        strings.Builder
		inText    bool // true while between <t> and </t>
		tabsDepth int  // >0 while inside a <tabs> tab-stop list
	)
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}
		switch t := tok.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "t":
				inText = true
			case "tabs":
				tabsDepth++
			case "tab":
				// Only a tab outside a <tabs> list is a tab character.
				if tabsDepth == 0 {
					sb.WriteByte('\t')
				}
			case "br", "cr":
				sb.WriteByte('\n')
			}
		case xml.EndElement:
			switch t.Name.Local {
			case "t":
				inText = false
			case "tabs":
				tabsDepth--
			case "p":
				sb.WriteByte('\n')
			}
		case xml.CharData:
			if inText {
				sb.Write(t)
			}
		}
	}
	return strings.TrimSpace(sb.String()), nil
}
|
||||
Reference in New Issue
Block a user