Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for splittextfile.cs

(view source code of splittextfile.cs as plain text)

  1. # define DEBUG
  2. #undef DEBUG
  3.  
  4. using System;
  5. using System.IO;
  6. using System.Text;
  7.  
  8. namespace RobvanderWoude
  9. {
  10. 	class SplitTextFile
  11. 	{
		/// <summary>
		/// Program entry point: parses the command line, then splits the input text
		/// file into numbered chunk files written to the current directory.
		/// </summary>
		/// <param name="args">
		/// bigfilename, chunksize (e.g. 2048, 2KB or 64MB), followed by any of the
		/// optional switches /BREAK, /COUNT:nnn and /ENC:encoding.
		/// </param>
		/// <returns>0 on success; the value returned by WriteError on any failure.</returns>
		static int Main( string[] args )
		{
			#region Command Line Parsing

			bool linebreak = false;
			int maxfiles = 0;
			Encoding enc = null;
			if ( args.Length < 2 || args[0] == "/?" )
			{
				return WriteError( );
			}
			if ( !File.Exists( args[0] ) )
			{
				return WriteError( "File not found" );
			}
			string bigfile = args[0];
			int chunksize;

			for ( int i = 2; i < args.Length; i++ )
			{
				// Invariant upper-casing: culture-sensitive ToUpper( ) breaks names
				// containing 'i' (ASCII, UNICODE) under e.g. a Turkish locale.
				string option = args[i].ToUpperInvariant( );
				if ( option == "/BREAK" )
				{
					linebreak = true;
				}
				else if ( option.StartsWith( "/COUNT:", StringComparison.Ordinal ) )
				{
					// StartsWith instead of Substring( 0, 7 ): short arguments must not throw
					if ( !int.TryParse( option.Substring( 7 ), out maxfiles ) || maxfiles < 0 )
					{
						return WriteError( "Invalid chunk count" );
					}
				}
				else if ( option.StartsWith( "/ENC:", StringComparison.Ordinal ) )
				{
					switch ( option.Substring( 5 ) )
					{
						case "ANSI":
						case "ASCII":
							enc = Encoding.ASCII;
							break;
						case "UNICODE":
							enc = Encoding.Unicode;
							break;
						case "UNICODEBE":
						case "UNICODE-BE":
							enc = Encoding.BigEndianUnicode;
							break;
						case "UTF7":
						case "UTF-7":
							enc = Encoding.UTF7;
							break;
						case "UTF8":
						case "UTF-8":
							enc = Encoding.UTF8;
							break;
						case "UTF32":
						case "UTF-32":
							enc = Encoding.UTF32;
							break;
						default:
							return WriteError( "Invalid encoding" );
					}
				}
				else
				{
					return WriteError( "Invalid command line argument(s)" );
				}
			}

			if ( !TryParseChunkSize( args[1], out chunksize ) )
			{
				return WriteError( "Invalid chunk size" );
			}

			// Try to get proper encoding of bigfile
			if ( enc == null )
			{
				try
				{
					enc = GetEncoding( bigfile );
				}
				catch ( Exception e )
				{
					// e.g. the file is locked or access is denied
					return WriteError( e.Message );
				}
			}

#if DEBUG
			Console.WriteLine( );
			Console.WriteLine( "File name         : {0}", bigfile );
			Console.WriteLine( "Chunk size        : {0} ({1} Bytes)", args[1], chunksize );
			Console.WriteLine( "Break at line end : {0}", linebreak );
			Console.WriteLine( "File encoding     : {0}", enc.BodyName );
			Console.WriteLine( "Maximum # chunks  : {0}", maxfiles );
			Console.ReadKey( );
#endif

			#endregion Command Line Parsing



			try
			{
				using ( FileStream fsi = File.Open( bigfile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite ) )
				using ( BufferedStream bsi = new BufferedStream( fsi ) )
				using ( StreamReader sri = new StreamReader( bsi, enc ) )
				{
					// The buffer holds decoded characters, so the chunk size is
					// counted in characters rather than in bytes on disk.
					char[] buffer = new char[chunksize];
					string chunkname = Path.Combine( Directory.GetCurrentDirectory( ), Path.GetFileNameWithoutExtension( bigfile ) );
					string chunkext = Path.GetExtension( bigfile );
					int count = 0;

					// Stop as soon as the /COUNT limit is reached instead of
					// reading (and discarding) the remainder of the input file.
					while ( maxfiles == 0 || count < maxfiles )
					{
						// ReadBlock keeps reading until the buffer is full or the input
						// is exhausted; its return value is the exact number of valid
						// characters, which is what gets written to the chunk.
						int length = sri.ReadBlock( buffer, 0, chunksize );
						if ( length == 0 )
						{
							break;
						}

						count += 1;
						string chunkout = chunkname + "." + count + chunkext;
						using ( FileStream fso = File.Open( chunkout, FileMode.Create, FileAccess.ReadWrite, FileShare.Read ) )
						using ( BufferedStream bso = new BufferedStream( fso ) )
						using ( StreamWriter swo = new StreamWriter( bso, enc ) )
						{
							swo.Write( buffer, 0, length );
							if ( linebreak )
							{
								// Extend the chunk up to the next line end; at end of
								// input ReadLine returns null and nothing is appended.
								string rest = sri.ReadLine( );
								if ( rest != null )
								{
									swo.WriteLine( rest );
								}
							}
						}
					}
				}
				return 0;
			}
			catch ( Exception e )
			{
				return WriteError( e.Message );
			}
		}

		/// <summary>
		/// Converts a chunk size argument such as "2048", "2KB" or "64MB" into a
		/// positive number that fits in an int.
		/// </summary>
		/// <param name="text">The chunk size as typed on the command line.</param>
		/// <param name="size">The resulting size, or 0 when parsing fails.</param>
		/// <returns>true if text is a valid, positive, non-overflowing chunk size.</returns>
		private static bool TryParseChunkSize( string text, out int size )
		{
			size = 0;
			if ( string.IsNullOrEmpty( text ) )
			{
				return false;
			}

			string digits = text.Trim( ).ToUpperInvariant( );
			long multiplier = 1;
			// Only a trailing unit counts; "KB"/"MB" elsewhere in the text is invalid
			if ( digits.EndsWith( "KB", StringComparison.Ordinal ) )
			{
				multiplier = 1024;
				digits = digits.Substring( 0, digits.Length - 2 );
			}
			else if ( digits.EndsWith( "MB", StringComparison.Ordinal ) )
			{
				multiplier = 1024 * 1024;
				digits = digits.Substring( 0, digits.Length - 2 );
			}

			long number;
			if ( !long.TryParse( digits, out number ) )
			{
				return false;
			}
			// Reject zero/negative sizes and anything that would overflow an int
			if ( number <= 0 || number > int.MaxValue / multiplier )
			{
				return false;
			}

			size = (int) ( number * multiplier );
			return true;
		}
  157.  
  158.  
  159.  
		/// <summary>
		/// Determines a text file's encoding by analyzing its byte order mark (BOM).
		/// Returns Encoding.Default when the file does not start with a recognized BOM.
		/// </summary>
		/// <param name="filename">The text file to analyze.</param>
		/// <returns>The detected encoding.</returns>
		public static Encoding GetEncoding( string filename )
		{
			// Based on code found on http://stackoverflow.com/a/19283954

			// Read the BOM. The file is opened read-only and shared so that
			// read-only files and files in use by another process can be analyzed.
			var bom = new byte[4];
			int read = 0;
			using ( var file = new FileStream( filename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite ) )
			{
				int n;
				while ( read < bom.Length && ( n = file.Read( bom, read, bom.Length - read ) ) > 0 )
				{
					read += n;
				}
			}

			// Analyze the BOM; "read" guards against treating the unread (zero)
			// bytes of a very short file as if they were part of a BOM.
			if ( read >= 3 && bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76 ) return Encoding.UTF7;
			if ( read >= 3 && bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf ) return Encoding.UTF8;
			// UTF-32LE must be tested before UTF-16LE: its BOM starts with the same FF FE
			if ( read == 4 && bom[0] == 0xff && bom[1] == 0xfe && bom[2] == 0 && bom[3] == 0 ) return Encoding.UTF32; //UTF-32LE
			if ( read >= 2 && bom[0] == 0xff && bom[1] == 0xfe ) return Encoding.Unicode; //UTF-16LE
			if ( read >= 2 && bom[0] == 0xfe && bom[1] == 0xff ) return Encoding.BigEndianUnicode; //UTF-16BE
			// 00 00 FE FF is the big-endian UTF-32 BOM; Encoding.UTF32 is little-endian
			if ( read == 4 && bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff ) return new UTF32Encoding( true, true ); //UTF-32BE
			return Encoding.Default;
		}
  182.  
  183.  
  184.  
  185. 		#region Error Handling
  186.  
		/// <summary>
		/// Shows the help text, preceded by the exception's message when an
		/// exception is supplied.
		/// </summary>
		/// <param name="e">The exception to report, or null to show the help text only.</param>
		/// <returns>The value returned by the string overload of WriteError.</returns>
		public static int WriteError( Exception e = null )
		{
			string message = null;
			if ( e != null )
			{
				message = e.Message;
			}
			return WriteError( message );
		}
  191.  
		/// <summary>
		/// Writes an optional error message followed by the program's help text
		/// to the standard error stream.
		/// </summary>
		/// <param name="errorMessage">The error to report; null or empty shows the help text only.</param>
		/// <returns>Always 1, for use as the program's exit code.</returns>
		public static int WriteError( string errorMessage )
		{
			if ( string.IsNullOrEmpty( errorMessage ) == false )
			{
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR: " );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errorMessage );
				Console.ResetColor( );
			}

			/*
			SplitTextFile,  Version 0.50 beta
			Split really big files in manageable chunks

			Usage:    SPLITTEXTFILE  bigfilename  chunksize  [ options ]

			Where:    bigfilename    is the file to be split up
			          chunksize      is the size of the split off chunks
			                         (e.g. 2048 or 2KB or 64MB)

			Options:  /BREAK         split at line break (slightly increases chunk size)
			          /COUNT:nnn     limit chunk count to first nnn files
			          /ENC:encoding  force encoding (ASCII, UTF-7, UTF-8, UTF-32,
			                         Unicode or UnicodeBE)

			Note:     Output chunks will be located in the current directory and have the
			          same name and extension as the (big) input file, with an added index
			          number between the file name and extension (e.g. bigfilename.1.txt).

			Written by Rob van der Woude
			http://www.robvanderwoude.com
			 */

			// Path.GetFileNameWithoutExtension also copes with a program name that
			// was typed without its extension, where searching for '.' would fail.
			string exeName = Path.GetFileNameWithoutExtension( Environment.GetCommandLineArgs( )[0] );

			Console.Error.WriteLine( );
			Console.Error.WriteLine( "{0},  Version 0.50 beta", exeName );
			Console.Error.WriteLine( "Split really big files in manageable chunks" );
			Console.Error.WriteLine( );
			Console.Error.Write( "Usage:    " );
			WriteHighlighted( string.Format( "{0}  bigfilename  chunksize  [ options ]", exeName.ToUpperInvariant( ) ) );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( );
			Console.Error.Write( "Where:    " );
			WriteHighlighted( "bigfilename" );
			Console.Error.WriteLine( "    is the file to be split up" );
			WriteHighlighted( "          chunksize" );
			Console.Error.WriteLine( "      is the size of the split off chunks" );
			Console.Error.WriteLine( "                         (e.g. 2048 or 2KB or 64MB)" );
			Console.Error.WriteLine( );
			Console.Error.Write( "Options:  " );
			WriteHighlighted( "/BREAK" );
			Console.Error.Write( "         split at line " );
			WriteHighlighted( "break" );
			Console.Error.WriteLine( " (slightly increases chunk size)" );
			WriteHighlighted( "          /COUNT:nnn" );
			Console.Error.Write( "     limit chunk " );
			WriteHighlighted( "count" );
			Console.Error.Write( " to first " );
			WriteHighlighted( "nnn" );
			Console.Error.WriteLine( " files" );
			WriteHighlighted( "          /ENC:encoding" );
			Console.Error.Write( "  force " );
			WriteHighlighted( "encoding" );
			Console.Error.WriteLine( " (ASCII, UTF-7, UTF-8, UTF-32," );
			Console.Error.WriteLine( "                         Unicode or UnicodeBE)" );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Note:     Output chunks will be located in the current directory and have the" );
			Console.Error.WriteLine( "          same name and extension as the (big) input file, with an added index" );
			Console.Error.WriteLine( "          number between the file name and extension (e.g. bigfilename.1.txt)." );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Written by Rob van der Woude" );
			Console.Error.Write( "http://www.robvanderwoude.com" );
			return 1;
		}

		/// <summary>
		/// Writes text to the standard error stream in white, then restores the
		/// console's default colors.
		/// </summary>
		/// <param name="text">The text to emphasize.</param>
		private static void WriteHighlighted( string text )
		{
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( text );
			Console.ResetColor( );
		}
  292.  
  293. 		#endregion Error Handling
  294. 	}
  295. }
  296.  

page last modified: 2024-04-16; loaded in 0.0116 seconds