Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for odt2txt.cs

(view source code of odt2txt.cs as plain text)

  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.IO.Compression;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7.  
  8.  
  9. namespace RobvanderWoude
  10. {
  11. 	internal class ODT2Txt
  12. 	{
  13. 		static string progver = "1.00";
  14.  
  15.  
		/// <summary>
		/// Program entry point: parses the command line, extracts the text of the
		/// document's content.xml, strips the XML markup and writes the remaining
		/// plain text to the console.
		/// </summary>
		/// <param name="args">
		/// Either "/E" to list encodings, or the path of an .odt file optionally
		/// followed by an output encoding (name or codepage) or the /D switch.
		/// </param>
		/// <returns>0 on success, 1 on errors or when help was displayed.</returns>
		static int Main( string[] args )
		{
			string odtfile = string.Empty;
			Encoding encoding = null;
			bool usexmlencoding = false;

			#region Parse Command Line

			if ( args.Length == 0 || args.Length > 2 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				// StartsWith instead of arg[0]: an empty argument ("") must not throw
				if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
				{
					if ( arg == "/?" )
					{
						return ShowHelp( );
					}
					else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
					{
						usexmlencoding = true;
					}
					else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
					{
						return ListEncodings( );
					}
					else
					{
						return ShowHelp( "Invalid command line switch {0}", arg );
					}
				}
				else
				{
					if ( string.IsNullOrWhiteSpace( odtfile ) )
					{
						odtfile = arg;
						if ( !File.Exists( odtfile ) )
						{
							return ShowHelp( "File \"{0}\" not found", odtfile );
						}
						if ( !string.Equals( Path.GetExtension( odtfile ), ".odt", StringComparison.OrdinalIgnoreCase ) )
						{
							return ShowHelp( "This program can extract text from .ODT files only" );
						}
					}
					else if ( encoding == null )
					{
						encoding = GetEncoding( arg );
						if ( encoding == null )
						{
							// report the argument actually being parsed, not a fixed position
							return ShowHelp( "Invalid encoding \"{0}\"", arg );
						}
					}
					else
					{
						return ShowHelp( "Too many command line arguments" );
					}
				}
			}

			if ( string.IsNullOrWhiteSpace( odtfile ) )
			{
				return ShowHelp( );
			}

			#endregion Parse Command Line


			#region Extract Text

			string content;
			try
			{
				content = ReadContentXml( odtfile );
			}
			catch ( Exception e )
			{
				// top-level boundary of a command line tool: a file that is not a valid
				// ZIP archive, is locked or is unreadable ends in an error message and
				// return code 1 instead of an unhandled exception
				return ShowHelp( "An error occurred while trying to read \"{0}\": {1}", odtfile, e.Message );
			}

			if ( content == null )
			{
				// the archive could be opened but holds no content.xml
				return ShowHelp( "An error occurred while trying to read \"{0}\"", odtfile );
			}

			#endregion Extract Text


			// The first 100 characters of the extracted XML usually contain its encoding;
			// this encoding will be used if the /D command line switch was used.
			// The search length is capped so that very short content cannot throw.
			Regex regex = new Regex( " encoding=\"([^\"]+)\"" );
			string xmlencoding = regex.Match( content, 0, Math.Min( 100, content.Length ) ).Groups[1].Value;


			#region Cleanup Text

			// insert newlines after header, paragraph or list-item
			regex = new Regex( "</text:(h|list-item|p)>" );
			string plaintext = regex.Replace( content, "\n\n" );
			// remove all XML tags
			regex = new Regex( "<[^>]+>" );
			plaintext = regex.Replace( plaintext, "" );
			// decode character entities (&amp; &lt; &gt; &quot; &apos; &#nnn;) left in the text;
			// this must happen after the tags are removed, or a decoded "<" could be taken for markup
			plaintext = System.Net.WebUtility.HtmlDecode( plaintext );
			// reduce maximum number of consecutive newlines to two
			regex = new Regex( "\n{3,}" );
			plaintext = regex.Replace( plaintext, "\n\n" );
			// convert stray carriage returns to carriage return/linefeed pairs
			plaintext = ConvertStrayCarriageReturns( plaintext ).Trim( "\n\r\t ".ToCharArray( ) );

			#endregion Cleanup Text


			#region Display Text

			if ( usexmlencoding )
			{
				// falls back to the default output encoding if the document's encoding is unknown
				encoding = GetEncoding( xmlencoding );
			}

			if ( encoding == null )
			{
				// send text to console using default output encoding
				Console.WriteLine( plaintext );
			}
			else
			{
				// temporarily change output encoding and send text to console
				Encoding oldencoding = Console.OutputEncoding;
				try
				{
					Console.OutputEncoding = encoding;
					Console.WriteLine( plaintext );
				}
				finally
				{
					// restore the console's encoding even if writing fails
					Console.OutputEncoding = oldencoding;
				}
			}

			#endregion Display Text


			return 0;
		}


		/// <summary>
		/// Reads the document's main content.xml straight from the ODT (ZIP) archive,
		/// without using a temporary file.
		/// </summary>
		/// <param name="odtfile">Path of the .odt file to open.</param>
		/// <returns>The text of content.xml, or null if the archive has no such entry.</returns>
		static string ReadContentXml( string odtfile )
		{
			using ( ZipArchive archive = ZipFile.OpenRead( odtfile ) )
			{
				foreach ( ZipArchiveEntry entry in archive.Entries )
				{
					// compare the full path, not just the file name: entries in subfolders
					// (e.g. embedded objects, which may carry their own content.xml)
					// must not be mistaken for the main document
					if ( string.Equals( entry.FullName, "content.xml", StringComparison.OrdinalIgnoreCase ) )
					{
						using ( StreamReader reader = new StreamReader( entry.Open( ) ) )
						{
							return reader.ReadToEnd( );
						}
					}
				}
			}
			return null;
		}
  170.  
  171.  
  172.  
		/// <summary>
		/// Replaces every carriage return that is not immediately followed by a
		/// linefeed with the platform's newline sequence.
		/// </summary>
		/// <param name="text">The text to normalize.</param>
		/// <returns>The text with stray carriage returns replaced.</returns>
		static string ConvertStrayCarriageReturns( string text )
		{
			// "\r(?!\n)" matches a \r only when the next character is not \n
			return Regex.Replace( text, "\r(?!\n)", Environment.NewLine );
		}
  182.  
  183.  
		/// <summary>
		/// Looks up an encoding by name or codepage among the encodings returned by
		/// Encoding.GetEncodings( ). Three attempts are made in order: the exact name
		/// (ignoring case), the name with all dashes ignored, and the codepage number.
		/// </summary>
		/// <param name="myencoding">Encoding name (e.g. "UTF-8" or "utf8") or codepage number (e.g. "65001").</param>
		/// <returns>The matching encoding, or null if the input is null/empty or nothing matches.</returns>
		static Encoding GetEncoding( string myencoding )
		{
			if ( string.IsNullOrEmpty( myencoding ) )
			{
				return null;
			}
			// Get a list of available encodings
			EncodingInfo[] encodings = Encoding.GetEncodings( );
			// Try correctly spelled encodings first; the comparison is ordinal because
			// culture-sensitive lowercasing breaks names containing "I" under e.g. Turkish culture
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}
			// No direct match found, try again, ignoring dashes
			string nodashes = myencoding.Replace( "-", "" );
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}
			// Still no match, try codepages
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( encoding.CodePage.ToString( ) == myencoding )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}
			// Still no match, giving up
			return null;
		}
  219.  
  220.  
		/// <summary>
		/// Writes a table of all encodings returned by Encoding.GetEncodings( ),
		/// sorted by name, with their codepages to standard output.
		/// </summary>
		/// <returns>Always 0.</returns>
		static int ListEncodings( )
		{
			try
			{
				Console.Clear( );
			}
			catch ( IOException )
			{
				// Console.Clear( ) throws an IO exception if the output is redirected;
				// clearing the screen is cosmetic only, so the listing continues
			}
			EncodingInfo[] allencodings = Encoding.GetEncodings( );
			// sort by name using the current culture
			Array.Sort( allencodings, ( a, b ) => string.Compare( a.Name, b.Name, StringComparison.CurrentCulture ) );
			// the name column is at least as wide as its "Encoding" header
			int columnwidth = 8;
			foreach ( EncodingInfo enc in allencodings )
			{
				columnwidth = Math.Max( columnwidth, enc.Name.Length );
			}
			string format = "{0,-" + columnwidth + "}   {1}";
			Console.WriteLine( format, "Encoding", "CodePage" );
			Console.WriteLine( format, "========", "========" );
			foreach ( EncodingInfo enc in allencodings )
			{
				// the codepage is read from the EncodingInfo itself; looking each
				// encoding up again by name would rescan the whole list per row
				Console.WriteLine( format, enc.Name, enc.CodePage );
			}
			return 0;
		}
  251.  
  252.  
		/// <summary>
		/// Writes an optional error message followed by the program's help text to
		/// standard error.
		/// </summary>
		/// <param name="errmsg">
		/// Optional: the first element is a composite format string, any further
		/// elements are its format arguments ({0}, {1}, ...).
		/// </param>
		/// <returns>Always 1, so callers can use the result as the program's return code.</returns>
		static int ShowHelp( params string[] errmsg )
		{
			#region Error Message

			if ( errmsg.Length > 0 )
			{
				// everything after the format string is a format argument
				List<string> errargs = new List<string>( errmsg );
				errargs.RemoveAt( 0 );
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs.ToArray( ) );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Help Text

			/*
			ODT2Txt.exe,  Version 1.00
			Return plain text content of an OpenOffice file without requiring OpenOffice

			Usage:    ODT2Txt.exe  odtfile  [ encoding | /D ]

			or:       ODT2Txt.exe  /E

			Where:    odtfile      is the path of the OpenOffice file to be read
			                       (no wildcards, only .odt extension allowed)
			          encoding     is the output encoding, e.g. UTF-8 to preserve
			                       Unicode characters, or IBM437 to convert Unicode
			                       doublequotes to ASCII
			          /D           use the encoding specified in the document file
			          /E           list all available encodings

			Notes:    If the specified encoding does not match any available encoding
			          name, the program will try again, ignoring dashes; if that does
			          not provide a match, the program will try matching the specified
			          encoding with the available encodings' codepages.
			          This program requires .NET 4.5.
			          Return code ("errorlevel") 1 in case of errors, 0 on success.

			Written by Rob van der Woude
			https://www.robvanderwoude.com
			*/

			#endregion Help Text


			#region Display Help Text

			// Layout: labels and argument names start at column 0 resp. 10,
			// descriptions at column 23, matching the reference text above

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "ODT2Txt.exe,  Version {0}", progver );

			Console.Error.WriteLine( "Return plain text content of an OpenOffice file without requiring OpenOffice" );

			Console.Error.WriteLine( );

			Console.Error.Write( "Usage:    " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "ODT2Txt.exe  odtfile  [ encoding | /D ]" );
			Console.ResetColor( );

			Console.Error.WriteLine( );

			Console.Error.Write( "or:       " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "ODT2Txt.exe  /E" );
			Console.ResetColor( );

			Console.Error.WriteLine( );

			Console.Error.Write( "Where:    " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "odtfile" );
			Console.ResetColor( );
			Console.Error.WriteLine( "      is the path of the OpenOffice file to be read" );

			Console.Error.WriteLine( "                       (no wildcards, only .odt extension allowed)" );

			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "          encoding" );
			Console.ResetColor( );
			Console.Error.Write( "     is the output encoding, e.g. " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "UTF-8" );
			Console.ResetColor( );
			Console.Error.WriteLine( " to preserve" );

			Console.Error.Write( "                       Unicode characters, or " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "IBM437" );
			Console.ResetColor( );
			Console.Error.WriteLine( " to convert Unicode" );

			Console.Error.WriteLine( "                       doublequotes to ASCII" );

			// switches are indented 10 columns, like "odtfile" and "encoding"
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "          /D" );
			Console.ResetColor( );
			Console.Error.WriteLine( "           use the encoding specified in the document file" );

			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "          /E" );
			Console.ResetColor( );
			Console.Error.WriteLine( "           list all available encodings" );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Notes:    If the specified encoding does not match any available encoding" );

			Console.Error.WriteLine( "          name, the program will try again, ignoring dashes; if that does" );

			Console.Error.WriteLine( "          not provide a match, the program will try matching the specified" );

			Console.Error.WriteLine( "          encoding with the available encodings' codepages." );

			Console.Error.WriteLine( "          This program requires .NET 4.5." );

			Console.Error.WriteLine( "          Return code (\"errorlevel\") 1 in case of errors, 0 on success." );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Written by Rob van der Woude" );

			Console.Error.WriteLine( "https://www.robvanderwoude.com" );

			#endregion Display Help Text


			return 1;
		}
  388. 	}
  389. }
  390.  

page last modified: 2024-04-16; loaded in 0.0108 seconds