package parse

import (
	"archive/zip"
	"bytes"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"strings"
)

// ExtractDOCX reads a .docx file and returns its plain text.
//
// DOCX is a ZIP archive; we unzip word/document.xml, walk <w:t> nodes
// for text, and emit a newline at each <w:p> boundary. Run-level tabs
// (<w:tab/>) and line breaks (<w:br/>, <w:cr/>) are rendered as '\t' and
// '\n' so that adjacent words are not glued together.
//
// The whole input is buffered in memory because zip.NewReader needs random
// access (io.ReaderAt) plus the total size. An error is returned when the
// input cannot be read, is not a valid ZIP archive, does not contain
// word/document.xml, or when that part is not well-formed XML.
func ExtractDOCX(r io.Reader) (string, error) {
	data, err := io.ReadAll(r)
	if err != nil {
		return "", fmt.Errorf("read docx: %w", err)
	}

	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", fmt.Errorf("open docx zip: %w", err)
	}

	// Locate the main document part by its exact archive name.
	var docFile *zip.File
	for _, f := range zr.File {
		if f.Name == "word/document.xml" {
			docFile = f
			break
		}
	}
	if docFile == nil {
		return "", errors.New("word/document.xml not found in docx")
	}

	rc, err := docFile.Open()
	if err != nil {
		return "", fmt.Errorf("open document.xml: %w", err)
	}
	defer rc.Close()

	return parseDocXML(rc)
}

// parseDocXML streams the XML of a word/document.xml part from r and returns
// the text it contains, with leading and trailing whitespace trimmed.
//
// Elements are matched by local name only (the namespace is ignored):
//   - character data inside a "t" element is copied to the output;
//   - a "tab" element that is a direct child of a run ("r") becomes '\t';
//   - a "br" or "cr" element that is a direct child of a run becomes '\n';
//   - the end of a "p" element becomes '\n'.
//
// The parent check matters for "tab": paragraph properties describe tab
// stops with <w:tabs><w:tab .../></w:tabs>, which are formatting metadata,
// not tab characters, and must not produce output.
//
// A non-nil error is returned if the XML is not well formed.
func parseDocXML(r io.Reader) (string, error) {
	dec := xml.NewDecoder(r)

	var (
		sb     strings.Builder
		inText bool
		open   []string // local names of the currently open elements, outermost first
	)

	for {
		tok, err := dec.Token()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return "", fmt.Errorf("parse document.xml: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			// Evaluate the parent before pushing the element itself.
			inRun := len(open) > 0 && open[len(open)-1] == "r"
			open = append(open, t.Name.Local)

			switch t.Name.Local {
			case "t":
				inText = true
			case "tab":
				if inRun {
					sb.WriteByte('\t')
				}
			case "br", "cr":
				if inRun {
					sb.WriteByte('\n')
				}
			}

		case xml.EndElement:
			// The decoder reports a matching end for every start (including
			// self-closing tags); the length guard is purely defensive.
			if len(open) > 0 {
				open = open[:len(open)-1]
			}

			switch t.Name.Local {
			case "t":
				inText = false
			case "p":
				sb.WriteByte('\n')
			}

		case xml.CharData:
			if inText {
				sb.Write(t)
			}
		}
	}

	return strings.TrimSpace(sb.String()), nil
}