PoshCode Archive  Artifact Content

Artifact ed8cb84c5f1034824b1b2b95da90b6c6ade14fde97b25f014108d2724e017efc:

  • File read-doc-without-word.ps1 — part of check-in [a218d9c27b] at 2018-06-10 14:16:49 on branch trunk — A suggestion how to read Word document without MS Office (user: qwerty size: 4865)

# encoding: ascii
# api: powershell
# title: read doc without word
# description: A suggestion how to read Word document without MS Office
# version: 0.1
# type: function
# author: qwerty
# license: CC0
# function: ConvertFrom-Docx
# x-poshcode-id: 6486
# x-archived: 2017-03-24T16:05:49
# x-published: 2017-08-21T20:27:00
#
#
<#
Original https://github.com/gregzakh/alt-ps/blob/master/ConvertFrom-Docx.ps1
#>
function ConvertFrom-Docx {
  <#
    .SYNOPSIS
        Converts a Word document (.docx) to plain text.
    .DESCRIPTION
        Walks the ZIP local file headers from the beginning of the file until
        the "word/document.xml" entry is found, unpacks that entry, parses it
        as XML and writes the InnerText of its "document" element to the
        pipeline. MS Office is not required.

        This function is provided AS IS since it does not provide specific
        checks such as file format validation. Known limitations:
          * the ZIP central directory is never consulted, only local headers;
          * only the "stored" (0) and "deflate" (8) compression methods are
            handled for "word/document.xml";
          * if an entry located before "word/document.xml" keeps its sizes in
            a trailing data descriptor (general purpose flag bit 3) it cannot
            be skipped; a warning is written and nothing is returned;
          * ZIP64 archives are not supported.
    .PARAMETER Path
        Path to the .docx file. It must satisfy Test-Path and is resolved
        with Convert-Path before being opened.
    .OUTPUTS
        System.String. Nothing is written when "word/document.xml" is not
        found.
    .EXAMPLE
        PS C:\> Get-ArchiveContent \doc\pecoff_v83.docx
        
        DateTime           Attributes    Size Compressed Name
        --------           ----------    ---- ---------- ----
        01.01.1980 0:00:00          0    2751        497 [Content_Types].xml
        01.01.1980 0:00:00          0     590        243 _rels/.rels
        01.01.1980 0:00:00          0    4755        756 word/_rels/document.xml.rels
        01.01.1980 0:00:00          0 1329567     123898 word/document.xml
        01.01.1980 0:00:00          0    1755        610 word/header2.xml
        01.01.1980 0:00:00          0    1231        391 word/header1.xml
        01.01.1980 0:00:00          0    1226        417 word/footer3.xml
        01.01.1980 0:00:00          0    1896        665 word/footer2.xml
        01.01.1980 0:00:00          0    1430        426 word/footer1.xml
        01.01.1980 0:00:00          0    2469        915 word/header3.xml
        01.01.1980 0:00:00          0    1734        467 word/endnotes.xml
        01.01.1980 0:00:00          0    1740        466 word/footnotes.xml
        01.01.1980 0:00:00          0     289        188 word/_rels/header3.xml.rels
        01.01.1980 0:00:00          0    6992       1686 word/theme/theme1.xml
        01.01.1980 0:00:00          0    2616       2616 word/media/image3.png
        01.01.1980 0:00:00          0   25088      14337 word/embeddings/Microsoft_Visio_2003-2010_Drawing22.vsd
        01.01.1980 0:00:00          0   52736      26744 word/embeddings/Microsoft_Visio_2003-2010_Drawing11.vsd
        01.01.1980 0:00:00          0   24232       6385 word/media/image1.emf
        01.01.1980 0:00:00          0    5364       1841 word/media/image2.emf
        01.01.1980 0:00:00          0   27660       6148 word/settings.xml
        01.01.1980 0:00:00          0    3653        844 word/fontTable.xml
        01.01.1980 0:00:00          0    1593        409 word/webSettings.xml
        01.01.1980 0:00:00          0   58024       7397 word/styles.xml
        01.01.1980 0:00:00          0   58777       7534 word/stylesWithEffects.xml
        01.01.1980 0:00:00          0     978        481 docProps/app.xml
        01.01.1980 0:00:00          0     621        325 docProps/core.xml
        01.01.1980 0:00:00          0   87512       6025 word/numbering.xml
        
        #required data is kept in "word/document.xml" file
        PS C:\> ConvertFrom-Docx \doc\pecoff_v83.docx
        ...
    .NOTES
        Get-ArchiveContent (https://github.com/gregzakh/alt-ps/blob/master/Get-ArchiveContent.ps1)
  #>
  param(
    [Parameter(Mandatory=$true)]
    [ValidateNotNullOrEmpty()]
    [ValidateScript({Test-Path $_})]
    [String]$Path
  )
  
  begin {
    # Variables are dynamically scoped: without this reset a $xml defined by
    # the caller would be emitted from the end block when nothing is found.
    $xml = $null
  }
  process {
    # Same reason as above: the finally block must only ever touch objects
    # created by this invocation.
    $fs = $br = $null
    
    try {
      $fs = [IO.File]::OpenRead((Convert-Path $Path))
      $br = New-Object IO.BinaryReader($fs)
      
      while ($true) { # just walk around of local headers
        # anything but a local file header signature (e.g. the central
        # directory) means there are no more entries to look at
        if ($br.ReadUInt32() -ne 0x4034b50) { break }
        
        $fs.Position += 2 # version needed to extract
        # general purpose bit flag and compression method
        $flags, $method = $br.ReadUInt16(), $br.ReadUInt16()
        $fs.Position += 8 # modification time, date and CRC-32
        # compressed size; the uncompressed size is not needed because the
        # payload is read until the unpacked stream ends
        $csz = $br.ReadUInt32()
        $fs.Position += 4
        # name length and extra field size (both are BYTE counts)
        $nsz, $xsz = $br.ReadUInt16(), $br.ReadUInt16()
        
        # Read exactly $nsz bytes and decode them afterwards. ReadChars()
        # counts characters, so a multi-byte name would consume bytes that
        # belong to the extra field or to the entry data.
        $name = [Text.Encoding]::UTF8.GetString($br.ReadBytes($nsz))
        $fs.Position += $xsz # skip extra field
        
        if ($name -eq 'word/document.xml') {
          $ds = $null
          $ms = New-Object IO.MemoryStream
          
          try {
            if ($method -eq 8) { # deflate
              $ds = New-Object IO.Compression.DeflateStream(
                $fs, [IO.Compression.CompressionMode]::Decompress, $true
              )
              # A single Read() call is allowed to return less than it was
              # asked for, so keep reading until the stream reports its end.
              $chunk = New-Object Byte[](8192)
              while (($got = $ds.Read($chunk, 0, $chunk.Length)) -gt 0) {
                $ms.Write($chunk, 0, $got)
              }
            }
            elseif ($method -eq 0) { # stored, the payload is the XML itself
              $raw = $br.ReadBytes([Int32]$csz)
              $ms.Write($raw, 0, $raw.Length)
            }
            else {
              throw (New-Object IO.InvalidDataException(
                "Unsupported compression method $method for word/document.xml."
              ))
            }
            
            # StreamReader strips a byte order mark (which would break the
            # [xml] cast) and falls back to UTF-8 when there is none.
            $ms.Position = 0
            $sr = New-Object IO.StreamReader($ms, [Text.Encoding]::UTF8, $true)
            $xml = [xml]$sr.ReadToEnd()
          }
          finally {
            if ($ds) { $ds.Dispose() }
            $ms.Dispose()
          }
          
          break # that's done!
        }
        
        # Flag bit 3: real sizes live in a data descriptor placed after the
        # data, the header holds zeros, so the next header cannot be located.
        if (($flags -band 8) -and $csz -eq 0) {
          Write-Warning "Entry '$name' uses a data descriptor; cannot locate word/document.xml."
          break
        }
        
        # next entry (the extra field has already been skipped)
        $fs.Position += $csz
      }
    }
    finally {
      if ($br) { $br.Close() }
      if ($fs) { $fs.Dispose() }
    }
  }
  end {
    if ($xml) { $xml.document.InnerText }
  }
}