Source code for word2txt.cs

```
using System;
```
```
using System.Collections.Generic;
```
```
using System.IO;
```
```
using System.IO.Compression;
```
```
using System.Linq;
```
```
using System.Text;
```
```
using System.Text.RegularExpressions;
```
```
using System.Windows.Forms;
```

using Word = Microsoft.Office.Interop.Word;

```
 
```
```
 
```
```
namespace RobvanderWoude
```
```
{
```
```
	internal class Word2Txt
```
```
	{
```
```
		// Program version, printed in the banner line of the help text (see ShowHelp)
		static string progver = "1.05";
```
```
 
```
```
 
```

		// Text extracted from the document; written by the Read...File methods, read and printed by Main
		static string plaintext = string.Empty;

```
 
```
```
 
```
```
		// Program entry point.
		// Parses the command line, extracts the document's text (using Word if it is
		// installed, otherwise one of the built-in fallback readers) and writes the
		// result to standard output.
		// args:    document path, optionally followed by an encoding name/codepage or /D;
		//          or /E or /? on their own
		// returns: 0 if Word extracted the text, 1 if a fallback reader extracted it,
		//          2 on command line errors or if no text could be extracted
		static int Main( string[] args )
		{
			int rc = 0;
			string document = string.Empty;
			bool success = false;
			bool usexmlencoding = false;
			Encoding encoding = null;


			#region Parse Command Line

			if ( args.Length == 0 || args.Length > 2 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				if ( string.IsNullOrEmpty( arg ) )
				{
					// an empty argument ("") would make the arg[0] test below throw
					return ShowHelp( "Empty command line argument" );
				}

				if ( arg[0] == '/' )
				{
					if ( arg == "/?" )
					{
						return ShowHelp( );
					}
					else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
					{
						usexmlencoding = true;
					}
					else if ( arg.Equals( "/E", StringComparison.OrdinalIgnoreCase ) )
					{
						return ListEncodings( );
					}
					else
					{
						return ShowHelp( "Invalid command line switch {0}", arg );
					}
				}
				else if ( string.IsNullOrWhiteSpace( document ) )
				{
					// first non-switch argument is the document
					document = arg;
					if ( !File.Exists( document ) )
					{
						return ShowHelp( "File \"{0}\" not found", document );
					}
				}
				else if ( encoding == null )
				{
					// second non-switch argument is the output encoding
					encoding = GetEncoding( arg );
					if ( encoding == null )
					{
						return ShowHelp( "Invalid encoding \"{0}\"", arg );
					}
				}
				else
				{
					return ShowHelp( "Too many command line arguments" );
				}
			}

			if ( string.IsNullOrWhiteSpace( document ) )
			{
				return ShowHelp( );
			}

			#endregion Parse Command Line


			#region Extract Text

			// First try using Word if possible
			if ( IsWordInstalled( ) )
			{
				// If Word is installed, this program can handle ANY document format that is recognized by Word
				success = ReadWordFile( document );
			}

			// if Word isn't available or could not extract any text, try plan B
			if ( !success || string.IsNullOrWhiteSpace( plaintext ) )
			{
				rc = 1;
				string ext = Path.GetExtension( document ).ToLowerInvariant( );
				switch ( ext )
				{
					case ".doc":
						success = ReadDocFile( document );
						break;
					case ".docx":
					case ".odt":
						success = ReadDocxOrOdtFile( document );
						break;
					case ".rtf":
						success = ReadRTFFile( document );
						break;
					case ".wpd":
						success = ReadWPDFile( document );
						break;
					default:
						return ShowHelp( "If Word is not installed or fails to extract text, this program can only handle .DOC, .DOCX, .ODT, .RTF and .WPD files" );
				}
			}

			#endregion Extract Text


			#region Cleanup Text and Display Result

			if ( success && !string.IsNullOrWhiteSpace( plaintext ) )
			{
				// convert stray carriage returns to newlines and strip surrounding whitespace
				plaintext = ConvertStrayCarriageReturns( plaintext ).Trim( "\n\r\t ".ToCharArray( ) );

				if ( usexmlencoding && rc == 1 )
				{
					// /D only applies when a fallback reader was used; the encoding is taken
					// from the XML declaration inside the .DOCX/.ODT file. If none is found,
					// GetEncoding returns null and the default output encoding is used.
					encoding = GetEncoding( ReadXmlEncoding( document ) );
				}

				if ( encoding == null )
				{
					// send text to console using default output encoding
					Console.WriteLine( plaintext );
				}
				else
				{
					// temporarily change output encoding and send text to console;
					// the finally block restores the encoding even if writing fails
					Encoding oldencoding = Console.OutputEncoding;
					Console.OutputEncoding = encoding;
					try
					{
						Console.WriteLine( plaintext );
					}
					finally
					{
						Console.OutputEncoding = oldencoding;
					}
				}
			}
			else
			{
				rc = 2;
			}

			#endregion Cleanup Text and Display Result


			return rc;
		}


		// Returns the encoding name found in the XML declaration (first 100 characters)
		// of the main content part of a .DOCX or .ODT file, or an empty string if the
		// file has another extension, cannot be opened, or declares no encoding.
		static string ReadXmlEncoding( string docfile )
		{
			string contentfile;
			string ext = Path.GetExtension( docfile ).ToLowerInvariant( );
			if ( ext == ".odt" ) // OpenOffice document
			{
				contentfile = "content.xml";
			}
			else if ( ext == ".docx" ) // MS Office document
			{
				contentfile = "document.xml";
			}
			else
			{
				return string.Empty;
			}

			string found = string.Empty;
			try
			{
				using ( ZipArchive archive = ZipFile.OpenRead( docfile ) )
				{
					foreach ( ZipArchiveEntry entry in archive.Entries )
					{
						if ( entry.Name.Equals( contentfile, StringComparison.OrdinalIgnoreCase ) )
						{
							using ( StreamReader sr = new StreamReader( entry.Open( ) ) )
							{
								char[] buffer = new char[100];
								int count = sr.ReadBlock( buffer, 0, buffer.Length );
								string header = new string( buffer, 0, count );
								// last matching entry wins, like the text extraction does
								found = Regex.Match( header, " encoding=\"([^\"]+)\"" ).Groups[1].Value;
							}
						}
					}
				}
			}
			catch ( IOException )
			{
				// unreadable file: fall back to the default encoding
				return string.Empty;
			}
			catch ( InvalidDataException )
			{
				// not a valid ZIP archive: fall back to the default encoding
				return string.Empty;
			}
			return found;
		}
```
```
 
```
```
 
```

		// Replaces every "stray" carriage return, i.e. a \r that is NOT immediately
		// followed by a linefeed (\n), by Environment.NewLine.
		// Existing \r\n pairs and lone \n characters are left untouched.
		static string ConvertStrayCarriageReturns( string text )
		{
			// negative lookahead: match \r only when the next character is not \n
			const string straycr = "\r(?!\n)";
			return Regex.Replace( text, straycr, Environment.NewLine );
		}
```
 
```
```
 
```

		// Looks up an encoding by name or codepage.
		// Three passes over the available encodings, in order of priority:
		//   1. case-insensitive match on the encoding name;
		//   2. the same match with all dashes removed from both sides;
		//   3. match of the codepage number against the specified text.
		// myencoding: encoding name (e.g. "UTF-8", "utf8") or codepage (e.g. "65001")
		// returns:    the matching Encoding, or null if myencoding is null/empty or
		//             nothing matches
		static Encoding GetEncoding( string myencoding )
		{
			if ( string.IsNullOrEmpty( myencoding ) )
			{
				return null;
			}

			// Get a list of available encodings
			EncodingInfo[] encodings = Encoding.GetEncodings( );

			// Try correctly spelled encodings first; ordinal comparison keeps the match
			// independent of the current culture (e.g. Turkish casing of 'I')
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// No direct match found, try again, ignoring dashes
			string nodashes = myencoding.Replace( "-", "" );
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// Still no match, try codepages
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( encoding.CodePage.ToString( ) == myencoding )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// Still no match, giving up
			return null;
		}
```
 
```
```
 
```
```
		// Reports whether the "Word.Application" ProgID resolves to a type on this system.
		// Source: "How to Check Whether Word is Installed in the System or Not" by Tadit Dash
		// https://www.codeproject.com/Tips/689968/How-to-Check-Whether-Word-is-Installed-in-the-Syst
		static bool IsWordInstalled( )
		{
			Type wordtype = Type.GetTypeFromProgID( "Word.Application" );
			bool installed = ( wordtype != null );
			return installed;
		}
```
```
 
```
```
 
```
```
		// Prints a two-column table (name, codepage) of all available encodings,
		// sorted by name, to standard output.
		// returns: always 0
		static int ListEncodings( )
		{
			try
			{
				Console.Clear( );
			}
			catch
			{
				// Console.Clear( ) throws an IO exception if the output is redirected
			}

			// Sort the EncodingInfo objects themselves, so the codepage can be read
			// directly instead of looking every name up again via GetEncoding
			EncodingInfo[] allencodings = Encoding.GetEncodings( ).OrderBy( enc => enc.Name ).ToArray( );

			// first column is as wide as the longest name, with a minimum of 8 (the header width)
			int columnwidth = 8;
			foreach ( EncodingInfo enc in allencodings )
			{
				columnwidth = Math.Max( columnwidth, enc.Name.Length );
			}

			string format = "{0,-" + columnwidth + "}   {1}";
			Console.WriteLine( format, "Encoding", "CodePage" );
			Console.WriteLine( format, "========", "========" );
			foreach ( EncodingInfo enc in allencodings )
			{
				Console.WriteLine( format, enc.Name, enc.CodePage );
			}
			return 0;
		}
```
```
 
```
```
 
```

		// Fallback reader for .DOC files: reads the file as text and keeps every run
		// of 20 or more characters matched by the pattern below, provided the run
		// encodes to one UTF-8 byte per character and contains no character 4.
		// On success the result is stored in the static field "plaintext".
		// docfile: path of the file to read
		// returns: true if at least one candidate run was found, false if the file
		//          could not be read, is empty, or contains no candidate run
		static bool ReadDocFile( string docfile )
		{
			string doccontent;
			try
			{
				// the using block closes the reader even if reading fails
				using ( StreamReader sr = new StreamReader( docfile, false ) )
				{
					doccontent = sr.ReadToEnd( ).Trim( "\n\t ".ToCharArray( ) );
				}
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}
			catch ( UnauthorizedAccessException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}

			if ( doccontent.Length == 0 )
			{
				return false;
			}

			// Discard everything from "[Content_Types]" onwards; a single ordinal search
			// is used for both the test and the cut position, because a culture-sensitive
			// IndexOf may disagree with Contains on binary data
			int marker = doccontent.IndexOf( "[Content_Types]", StringComparison.Ordinal );
			if ( marker >= 0 )
			{
				doccontent = doccontent.Substring( 0, marker );
			}

			// runs of at least 20 characters, excluding the characters with octal codes 000, 015, 367 and 377
			Regex regex = new Regex( "[^\\000\\015\\367\\377]{20,}" );
			MatchCollection matches = regex.Matches( doccontent );
			if ( matches.Count == 0 )
			{
				return false;
			}

			StringBuilder text = new StringBuilder( );
			foreach ( Match match in matches )
			{
				string matchingtext = match.Value.Trim( "\n\t ".ToCharArray( ) );
				// keep only runs where every character is a single UTF-8 byte and that do not contain character 4
				if ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length && matchingtext.IndexOf( (char)4 ) < 0 )
				{
					text.Append( matchingtext ).Append( "\n" );
				}
			}
			plaintext = text.ToString( );
			return true;
		}
```
 
```
```
 
```

		// Fallback reader for .DOCX and .ODT files: opens the document as a ZIP
		// archive, reads the XML part holding the text (document.xml or content.xml),
		// turns paragraph/heading ends and line breaks into blank lines and strips
		// all remaining XML tags. The result is stored in the static field "plaintext".
		// docfile: path of the .DOCX or .ODT file
		// returns: true if the content part was found and read; false for other
		//          extensions, unreadable files, invalid archives or a missing part
		// NOTE: the encoding named in the XML declaration is not captured here; this
		//       method has no channel to hand it to its caller.
		static bool ReadDocxOrOdtFile( string docfile )
		{
			string contentfile;
			string ext = Path.GetExtension( docfile ).ToLowerInvariant( );
			if ( ext == ".odt" ) // OpenOffice document
			{
				contentfile = "content.xml";
			}
			else if ( ext == ".docx" ) // MS Office document
			{
				contentfile = "document.xml";
			}
			else
			{
				return false;
			}

			string content = null;

			try
			{
				// Open document as ZIP file and read the XML part containing the text content
				// straight from the archive, so no temporary file has to be created or cleaned up
				using ( ZipArchive archive = ZipFile.OpenRead( docfile ) )
				{
					foreach ( ZipArchiveEntry entry in archive.Entries )
					{
						if ( entry.Name.Equals( contentfile, StringComparison.OrdinalIgnoreCase ) )
						{
							// if several entries match, the last one wins
							using ( StreamReader sr = new StreamReader( entry.Open( ) ) )
							{
								content = sr.ReadToEnd( ).Trim( "\n\r\t ".ToCharArray( ) );
							}
						}
					}
				}
			}
			catch ( IOException )
			{
				ShowHelp( "Access to file \"{0}\" denied", docfile );
				return false;
			}
			catch ( InvalidDataException )
			{
				// the file is not a valid ZIP archive, so there is nothing to extract
				return false;
			}

			if ( content == null )
			{
				// the archive does not contain the expected content part
				return false;
			}

			// insert newlines after headers, list items and paragraphs
			string text = Regex.Replace( content, "</(text|w):(h|p)>", "\n\n" );
			text = Regex.Replace( text, "<w:br/>", "\n\n" );
			// remove all XML tags
			plaintext = Regex.Replace( text, "<[^>]+>", "" );
			return true;
		}
```
 
```
```
 
```

		// Fallback reader for .RTF files: loads the file into a RichTextBox that is
		// never shown and reads back its plain text into the static field "plaintext".
		// Technique by Wendy Zang:
		// https://social.msdn.microsoft.com/Forums/vstudio/en-US/6e56af9b-d7d3-49f3-9ec4-80edde3fe54b/reading-modifying-rtf-files?forum=csharpgeneral#a64345e9-cfcb-43be-ab18-c08fae02cb2a
		// rtffile: path of the .RTF file
		// returns: true if the file was read and converted, false if it could not be
		//          read or its content was rejected as RTF
		static bool ReadRTFFile( string rtffile )
		{
			try
			{
				string rtftext = File.ReadAllText( rtffile );
				// the control holds native resources, so dispose it when done
				using ( RichTextBox rtbox = new RichTextBox( ) )
				{
					rtbox.Rtf = rtftext;
					plaintext = rtbox.Text;
				}
			}
			catch ( IOException )
			{
				return false;
			}
			catch ( ArgumentException )
			{
				// the Rtf setter throws ArgumentException for text that is not valid RTF
				return false;
			}
			return true;
		}
```
 
```
```
 
```

		// Primary reader: starts a hidden Word instance, opens the document read-only
		// and copies its text into the static field "plaintext".
		// wordfile: path of the document; it is converted to an absolute path first,
		//           because Word resolves relative paths against its own working
		//           directory, not this program's
		// returns:  true if Word opened the document and returned its text, false if
		//           any step failed
		static bool ReadWordFile( string wordfile )
		{
			Word.Application wordapp = null;
			object savechanges = Word.WdSaveOptions.wdDoNotSaveChanges;
			bool success = false;
			try
			{
				// created inside the try block, so a failure to start Word is reported
				// as "false" instead of terminating the program
				wordapp = new Word.Application( );
				wordapp.Visible = false;
				// ConfirmConversions: false prevents a conversion prompt from blocking the hidden instance
				Word.Document worddoc = wordapp.Documents.Open( Path.GetFullPath( wordfile ), ConfirmConversions: false, ReadOnly: true, AddToRecentFiles: false );
				plaintext = worddoc.Content.Text;
				worddoc.Close( ref savechanges );
				success = true;
			}
			catch ( Exception )
			{
				success = false;
			}
			finally
			{
				if ( wordapp != null )
				{
					try
					{
						wordapp.Quit( ref savechanges );
					}
					catch ( Exception )
					{
						// best effort: a failure to shut Word down must not discard
						// the text that was already extracted
					}
				}
			}
			return success;
		}
```
 
```
```
 
```

		// Fallback reader for .WPD (WordPerfect) files: reads the file as UTF-8 text,
		// strips (most of) the file header with a regular expression and then maps or
		// drops characters one by one. The result is stored in the static field "plaintext".
		// wpfile:  path of the .WPD file
		// returns: true if any non-whitespace text remains, false otherwise or if the
		//          file could not be read
		static bool ReadWPDFile( string wpfile )
		{
			string wpcontent;
			try
			{
				wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
			}
			catch ( IOException )
			{
				// same behaviour as the other fallback readers: report failure instead of crashing
				return false;
			}

			// Remove (most of) the WPD file header - WARNING: regex pattern depends on the Encoding used to read the file!
			Regex regex = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );
			wpcontent = regex.Replace( wpcontent, "" );

			// WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect
			// Modified for spaces, linefeeds and e acute by yours truly
			// More modifications are required for accented characters
			StringBuilder text = new StringBuilder( );
			bool skip = false;
			int resume = -1;
			foreach ( char c in wpcontent )
			{
				int i = (int)c;

				if ( skip )
				{
					// inside a skipped section: it ends at the next occurrence of the code that opened it
					if ( i == resume )
					{
						skip = false;
						resume = -1;
					}
					continue;
				}

				if ( i == 63 || i == 128 || i == 160 || i == 65533 )
				{
					text.Append( ' ' );
				}
				else if ( i >= 169 && i != 172 && i <= 174 )
				{
					text.Append( '-' );
				}
				else if ( i == 10 || i == 13 || i == 208 )
				{
					text.Append( Environment.NewLine );
				}
				else if ( i >= 192 && i <= 236 )
				{
					// start skipping until the same code is seen again
					skip = true;
					resume = i;
				}
				else if ( i == 15 )
				{
					// e acute
					text.Append( (char)233 );
				}
				else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
				{
					// control characters, ignore
				}
				else
				{
					text.Append( c );
				}
			}

			plaintext = text.ToString( );
			return !string.IsNullOrWhiteSpace( plaintext );
		}
```
 
```
```
 
```

		// Writes an optional error message followed by the help text to standard error.
		// errmsg:  optional; errmsg[0] is a composite format string and any further
		//          elements are its format arguments
		// returns: always 2, so callers can "return ShowHelp( ... )"
		static int ShowHelp( params string[] errmsg )
		{
			#region Help Text

			/*
			Word2Txt,  Version 1.05
			Extract plain text from a Word document and send it to the screen

			Usage:   Word2Txt    "wordfile"  [ encoding | /D ]

			or:      Word2Txt    /E

			Where:   wordfile    is the path of the Word document to be read
			                     (no wildcards allowed)
			         encoding    force use of alternative encoding for plain
			                     text, e.g. UTF-8 to preserve accented characters
			                     or IBM437 to convert unicode quotes to ASCII
			         /D          use the encoding specified in the document file
			                     (for .DOCX and .ODT only, if Word isn't available)
			         /E          list all available encodings

			Notes:   If a "regular" (MSI based) Microsoft Word (2007 or later)
			         installation is detected, this program will use Word to read the
			         text from the Word file, which may be ANY file format recognized
			         by Word.
			         If Word was already active when this program is started, any other
			         opened document(s) will be left alone, and only the document opened
			         by this program will be closed.
			         If Word is not available, or if it encounters unreadable content
			         (i.e. the file is corrupted), the text can still be extracted, but
			         only from .DOC, .DOCX, .ODT, .RTF and .WPD files.
			         If the specified encoding does not match any available encoding name,
			         the program will try again, ignoring dashes; if that does not provide
			         a match, the program will try matching the specified encoding with
			         the available encodings' codepages.
			         This program requires .NET 4.5.
			         Return code ("errorlevel") 0 means Word encountered no errors and
			         some text was extracted from the file; 1 means Word is not available
			         or the file was corrupted; 2 means either command line errors or the
			         program failed to extract any text.

			Written by Rob van der Woude
			https://www.robvanderwoude.com
			*/

			#endregion Help Text


			#region Error Message

			if ( errmsg != null && errmsg.Length > 0 )
			{
				// everything after the format string is a format argument
				object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Display Help Text

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Word2Txt,  Version {0}", progver );

			Console.Error.WriteLine( "Extract plain text from a Word document and send it to the screen" );

			Console.Error.WriteLine( );

			Console.Error.Write( "Usage:   " );
			WriteHighlighted( "Word2Txt    \"wordfile\"  [ encoding | /D ]", true );

			Console.Error.WriteLine( );

			Console.Error.Write( "or:      " );
			WriteHighlighted( "Word2Txt    /E", true );

			Console.Error.WriteLine( );

			Console.Error.Write( "Where:   " );
			WriteHighlighted( "wordfile", false );
			Console.Error.WriteLine( "    is the path of the Word document to be read" );

			Console.Error.WriteLine( "                     (no wildcards allowed)" );

			WriteHighlighted( "         encoding", false );
			Console.Error.WriteLine( "    force use of alternative encoding for plain" );

			Console.Error.Write( "                     text, e.g. " );
			WriteHighlighted( "UTF-8", false );
			Console.Error.WriteLine( " to preserve accented characters" );

			Console.Error.Write( "                     or " );
			WriteHighlighted( "IBM437", false );
			Console.Error.WriteLine( " to convert unicode quotes to ASCII" );

			WriteHighlighted( "         /D", false );
			Console.Error.WriteLine( "          use the encoding specified in the document file" );

			Console.Error.WriteLine( "                     (for .DOCX and .ODT only, if Word isn't available)" );

			WriteHighlighted( "         /E", false );
			Console.Error.WriteLine( "          list all available encodings" );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Notes:   If a \"regular\" (MSI based) Microsoft Word (2007 or later)" );

			Console.Error.WriteLine( "         installation is detected, this program will use Word to read the" );

			// wording restored to match the reference help text in the comment above
			Console.Error.WriteLine( "         text from the Word file, which may be ANY file format recognized" );

			Console.Error.WriteLine( "         by Word." );

			Console.Error.WriteLine( "         If Word was already active when this program is started, any other" );

			Console.Error.WriteLine( "         opened document(s) will be left alone, and only the document opened" );

			Console.Error.WriteLine( "         by this program will be closed." );

			Console.Error.WriteLine( "         If Word is not available, or if it encounters unreadable content" );

			Console.Error.WriteLine( "         (i.e. the file is corrupted), the text can still be extracted, but" );

			Console.Error.WriteLine( "         only from .DOC, .DOCX, .ODT, .RTF and .WPD files." );

			Console.Error.WriteLine( "         If the specified encoding does not match any available encoding name," );

			Console.Error.WriteLine( "         the program will try again, ignoring dashes; if that does not provide" );

			Console.Error.WriteLine( "         a match, the program will try matching the specified encoding with" );

			Console.Error.WriteLine( "         the available encodings' codepages." );

			Console.Error.WriteLine( "         This program requires .NET 4.5." );

			Console.Error.WriteLine( "         Return code (\"errorlevel\") 0 means Word encountered no errors and" );

			Console.Error.WriteLine( "         some text was extracted from the file; 1 means Word is not available" );

			Console.Error.WriteLine( "         or the file was corrupted; 2 means either command line errors or the" );

			Console.Error.WriteLine( "         program failed to extract any text." );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Written by Rob van der Woude" );

			Console.Error.WriteLine( "https://www.robvanderwoude.com" );

			#endregion Display Help Text


			return 2;
		}


		// Writes text to standard error in white, then resets the console colors.
		// newline: true to terminate the line after the text
		static void WriteHighlighted( string text, bool newline )
		{
			Console.ForegroundColor = ConsoleColor.White;
			if ( newline )
			{
				Console.Error.WriteLine( text );
			}
			else
			{
				Console.Error.Write( text );
			}
			Console.ResetColor( );
		}
```
	}
```
```
}
```