// (view source code of doc2txt.cs as plain text)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace RobvanderWoude
{
// Doc2Txt: command line tool that prints the readable text runs found in a
// Word .DOC file to standard output, without using Word itself.
// Exit code is 0 on success and 1 when help is shown or an error occurred.
internal class Doc2Txt
{
	static string progver = "1.00";

	// Characters stripped from both ends of the file content and of every text run
	static readonly char[] trimchars = { '\n', '\t', ' ' };

	// A candidate text run: 20 or more consecutive characters, none of which is
	// one of the four characters excluded by the class (octal 000, 015, 367, 377).
	// NOTE(review): the file is decoded as UTF-8 below, so single bytes 0xF7/0xFF
	// presumably never show up as U+00F7/U+00FF here - confirm the intended encoding.
	static readonly Regex textrun = new Regex( "[^\\000\\015\\367\\377]{20,}" );

	// Everything from this marker onward is discarded before searching for text
	const string contenttypesmarker = "[Content_Types]";


	// Program entry point.
	// args: exactly one argument is expected, the path of an existing .doc file;
	//       "/?" or any other argument count shows the help text.
	// Returns 0 after writing the extracted text to standard output,
	// or 1 (via ShowHelp) for help requests and all error conditions.
	static int Main( string[] args )
	{
		if ( args.Length != 1 || args[0] == "/?" )
		{
			return ShowHelp( );
		}

		string docfile = args[0];
		if ( !File.Exists( docfile ) )
		{
			return ShowHelp( "File not found: \"{0}\"", docfile );
		}
		// Ordinal comparison: the extension is an identifier, not linguistic text
		if ( !string.Equals( Path.GetExtension( docfile ), ".doc", StringComparison.OrdinalIgnoreCase ) )
		{
			return ShowHelp( "This program can extract text from .DOC files only" );
		}

		string doccontent = string.Empty;
		try
		{
			// "using" guarantees the file handle is released even if reading fails
			using ( StreamReader sr = new StreamReader( docfile, false ) )
			{
				doccontent = sr.ReadToEnd( ).Trim( trimchars );
			}
		}
		catch ( IOException )
		{
			return ShowHelp( "Access to file \"{0}\" denied", docfile );
		}
		catch ( UnauthorizedAccessException )
		{
			// Missing permissions are reported with this exception type, not IOException
			return ShowHelp( "Access to file \"{0}\" denied", docfile );
		}

		if ( doccontent.Length == 0 )
		{
			return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
		}

		string plaintext = ExtractPlainText( doccontent );
		if ( plaintext == null )
		{
			return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
		}

		Console.WriteLine( plaintext );
		return 0;
	}


	// Collects the readable text runs from the decoded file content.
	// doccontent: the file content as a string (must not be null).
	// Returns null when the content holds no candidate run at all; otherwise the
	// accepted runs, each followed by "\n" (an empty string if every candidate
	// was rejected by the filter).
	internal static string ExtractPlainText( string doccontent )
	{
		// Single ordinal search; a culture-sensitive IndexOf could disagree with an
		// ordinal Contains and hand -1 to Substring
		int markerpos = doccontent.IndexOf( contenttypesmarker, StringComparison.Ordinal );
		if ( markerpos >= 0 )
		{
			doccontent = doccontent.Substring( 0, markerpos );
		}

		MatchCollection matches = textrun.Matches( doccontent );
		if ( matches.Count == 0 )
		{
			return null;
		}

		StringBuilder plaintext = new StringBuilder( );
		foreach ( Match match in matches )
		{
			string matchingtext = match.Value.Trim( trimchars );
			// Byte count equals character count only if every character takes
			// a single UTF-8 byte, i.e. the run is plain ASCII
			bool singlebytesonly = ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length );
			// Runs containing control character 0x04 are rejected as well
			if ( singlebytesonly && matchingtext.IndexOf( (char)4 ) < 0 )
			{
				plaintext.Append( matchingtext ).Append( '\n' );
			}
		}
		return plaintext.ToString( );
	}


	// Writes an optional error message followed by the help text to standard error.
	// errmsg: optional; the first element is a composite format string, any
	//         further elements are its format arguments.
	// Always returns 1, so callers can "return ShowHelp( ... )" as exit code.
	static int ShowHelp( params string[] errmsg )
	{
		#region Error Message

		if ( errmsg.Length > 0 )
		{
			// Everything after the format string is passed on as format arguments
			object[] errargs = new object[errmsg.Length - 1];
			Array.Copy( errmsg, 1, errargs, 0, errargs.Length );
			Console.Error.WriteLine( );
			Console.ForegroundColor = ConsoleColor.Red;
			Console.Error.Write( "ERROR:\t" );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( errmsg[0], errargs );
			Console.ResetColor( );
		}

		#endregion Error Message


		#region Help Text

		/*
		Doc2Txt.exe,  Version 1.00
		Return the plain text content of a Word .DOC file without requiring Word

		Usage:  Doc2Txt.exe  docfile

		Where:  docfile  is the path of the Word file to be read
		                 (no wildcards, only .doc extension allowed)

		Note:   Return code ("errorlevel") 1 in case of errors, 0 on success.

		Written by Rob van der Woude
		https://www.robvanderwoude.com
		*/

		#endregion Help Text


		#region Display Help Text

		Console.Error.WriteLine( );
		Console.Error.WriteLine( "Doc2Txt.exe, Version {0}", progver );
		Console.Error.WriteLine( "Return the plain text content of a Word .DOC file without requiring Word" );
		Console.Error.WriteLine( );

		Console.Error.Write( "Usage: " );
		Console.ForegroundColor = ConsoleColor.White;
		Console.Error.WriteLine( "Doc2Txt.exe docfile" );
		Console.ResetColor( );
		Console.Error.WriteLine( );

		Console.Error.Write( "Where: " );
		Console.ForegroundColor = ConsoleColor.White;
		Console.Error.Write( "docfile" );
		Console.ResetColor( );
		Console.Error.WriteLine( " is the path of the Word file to be read" );
		Console.Error.WriteLine( " (no wildcards, only .doc extension allowed)" );
		Console.Error.WriteLine( );

		Console.Error.WriteLine( "Note: Return code (\"errorlevel\") 1 in case of errors, 0 on success." );
		Console.Error.WriteLine( );

		Console.Error.WriteLine( "Written by Rob van der Woude" );
		Console.Error.WriteLine( "https://www.robvanderwoude.com" );

		#endregion Display Help Text


		return 1;
	}
}
}
// page last modified: 2024-04-16; loaded in 0.0064 seconds