Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for doc2txt.cs

(view source code of doc2txt.cs as plain text)

  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7.  
  8.  
  9. namespace RobvanderWoude
  10. {
  11. 	internal class Doc2Txt
  12. 	{
  13. 		static string progver = "1.00";
  14.  
  15.  
  16. 		static int Main( string[] args )
  17. 		{
  18. 			if ( args.Length != 1 || args[0] == "/?" )
  19. 			{
  20. 				return ShowHelp( );
  21. 			}
  22. 			string docfile = args[0];
  23. 			if ( !File.Exists( docfile ) )
  24. 			{
  25. 				return ShowHelp( "File not found: \"{0}\"", docfile );
  26. 			}
  27. 			if ( Path.GetExtension( docfile ).ToLower( ) != ".doc" )
  28. 			{
  29. 				return ShowHelp( "This program can extract text from .DOC files only" );
  30. 			}
  31.  
  32. 			string doccontent = string.Empty;
  33. 			try
  34. 			{
  35. 				StreamReader sr = new StreamReader( docfile, false );
  36. 				doccontent = sr.ReadToEnd( ).Trim( "\n\t ".ToCharArray( ) );
  37. 				sr.Close( );
  38. 			}
  39. 			catch ( IOException )
  40. 			{
  41. 				return ShowHelp( "Access to file \"{0}\" denied", docfile );
  42. 			}
  43. 			if ( doccontent.Length == 0 )
  44. 			{
  45. 				return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
  46. 			}
  47. 			if ( doccontent.Contains( "[Content_Types]" ) )
  48. 			{
  49. 				doccontent = doccontent.Substring( 0, doccontent.IndexOf( "[Content_Types]" ) );
  50. 			}
  51. 			string plaintext = string.Empty;
  52. 			Regex regex = new Regex( "[^\\000\\015\\367\\377]{20,}" );
  53. 			MatchCollection matches = regex.Matches( doccontent );
  54. 			if ( matches.Count == 0 )
  55. 			{
  56. 				return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
  57. 			}
  58. 			foreach ( Match match in matches )
  59. 			{
  60. 				string matchingtext = match.Value.Trim( "\n\t ".ToCharArray( ) );
  61. 				if ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length && !matchingtext.Contains( (char)4 ) )
  62. 				{
  63. 					plaintext += matchingtext + "\n";
  64. 				}
  65. 			}
  66.  
  67. 			Console.WriteLine( plaintext );
  68.  
  69. 			return 0;
  70. 		}
  71.  
  72.  
  73. 		static int ShowHelp( params string[] errmsg )
  74.  
  75. 		{
  76. 			#region Error Message
  77.  
  78. 			if ( errmsg.Length > 0 )
  79. 			{
  80. 				List<string> errargs = new List<string>( errmsg );
  81. 				errargs.RemoveAt( 0 );
  82. 				Console.Error.WriteLine( );
  83. 				Console.ForegroundColor = ConsoleColor.Red;
  84. 				Console.Error.Write( "ERROR:\t" );
  85. 				Console.ForegroundColor = ConsoleColor.White;
  86. 				Console.Error.WriteLine( errmsg[0], errargs.ToArray( ) );
  87. 				Console.ResetColor( );
  88. 			}
  89.  
  90. 			#endregion Error Message
  91.  
  92.  
  93. 			#region Help Text
  94.  
  95. 			/*
  96. 			Doc2Txt.exe,  Version 1.00
  97. 			Return the plain text content of a Word .DOC file without requiring Word
  98.  
  99. 			Usage:    Doc2Txt.exe  docxfile
  100.  
  101. 			Where:    docfile      is the path of the Word file to be read
  102. 			                       (no wildcards, only .doc extension allowed)
  103.  
  104. 			Note:     Return code ("errorlevel") 1 in case of errors, 0 on success.
  105.  
  106. 			Written by Rob van der Woude
  107. 			https://www.robvanderwoude.com
  108. 			*/
  109.  
  110. 			#endregion Help Text
  111.  
  112.  
  113. 			#region Display Help Text
  114.  
  115. 			Console.Error.WriteLine( );
  116.  
  117. 			Console.Error.WriteLine( "Doc2Txt.exe,  Version {0}", progver );
  118.  
  119. 			Console.Error.WriteLine( "Return the plain text content of a Word .DOC file without requiring Word" );
  120.  
  121. 			Console.Error.WriteLine( );
  122.  
  123. 			Console.Error.Write( "Usage:    " );
  124. 			Console.ForegroundColor = ConsoleColor.White;
  125. 			Console.Error.WriteLine( "Doc2Txt.exe  docfile" );
  126. 			Console.ResetColor( );
  127.  
  128. 			Console.Error.WriteLine( );
  129.  
  130. 			Console.Error.Write( "Where:    " );
  131. 			Console.ForegroundColor = ConsoleColor.White;
  132. 			Console.Error.Write( "docfile" );
  133. 			Console.ResetColor( );
  134. 			Console.Error.WriteLine( "      is the path of the Word file to be read" );
  135.  
  136. 			Console.Error.WriteLine( "                       (no wildcards, only .doc extension allowed)" );
  137.  
  138. 			Console.Error.WriteLine( );
  139.  
  140. 			Console.Error.WriteLine( "Note:     Return code (\"errorlevel\") 1 in case of errors, 0 on success." );
  141.  
  142. 			Console.Error.WriteLine( );
  143.  
  144. 			Console.Error.WriteLine( "Written by Rob van der Woude" );
  145.  
  146. 			Console.Error.WriteLine( "https://www.robvanderwoude.com" );
  147.  
  148. 			#endregion Display Help Text
  149.  
  150.  
  151. 			return 1;
  152. 		}
  153. 	}
  154. }

page last modified: 2024-04-16; loaded in 0.0064 seconds