reading csv file in mathematica
When Mathematica reads a flat file, it evaluates each field as a Mathematica expression. I sometimes just want to read the entire file in as strings and then perform specific type conversions on individual columns. Here's a short way to import data as string fields.
The idea is to provide a shell of the algorithm and then allow extensive customization through functions. This is a very functional approach to creating functions. However, the package still consists of one giant function rather than a more clever parsing state machine.
You have options to customize line splitting, field/attribute processing, and field-level transformations, as well as a collection mechanism that allows you to collect specific input records for later use. For example, you can save the header line. The returned value is not just a data matrix, but an association with various import statistics and other collected information.
You can find the package here in a gist.
The idea is to provide a shell of the algorithm and then allow extensive customization through functions. This is a very functional approach to creating functions. However, the package still consists of one giant function rather than a more clever parsing state machine.
You have options to customize line splitting, field/attribute processing, and field-level transformations, as well as a collection mechanism that allows you to collect specific input records for later use. For example, you can save the header line. The returned value is not just a data matrix, but an association with various import statistics and other collected information.
You can find the package here in a gist.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* ::Package:: *)

(* :Title: Import Delimited *)
(* :Summary: Contains declarations for importing a delimited text file into a session. *)

BeginPackage["ImportUtilities`"]

(* Every symbol that receives a usage message here is created in the
   ImportUtilities` context and is therefore exported by the package. *)

(* Canned functions that can be used as arguments. *)
WhitespaceSplitter::usage = "Function that splits string records on whitespace."
ForeachTrim::usage = "Option that trimes the string argument."
SplitFieldOnComma::usage = "Option that splits a record on commas."
IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns false."
ReadSingleRecord::usage = "Option that reads a single record from a stream."
NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."
SingleLineHeader::usage = "Always 1 indiacting that a single record should be skipped at the start."
IdentityRecordTransformer::usage="Option that always returns the argument directly."
CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

(* The import function itself. *)
ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

(* Date/time helpers. *)
MDYH24MS::usage = "Pattern spec to convert from date time string."
MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."
MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"
MDYH24MSTransformer::usage = "Transformer date time to absolute time"

(* Option names accepted by ImportDelimited. StartSkip and Transformers are
   declared here as well so that they are exported and documented like the
   other options instead of being created in the Private context. *)
StartSkip::usage = "Number of records to skip at the start of the input. Skipped records are not passed to IgnoreLine. Default is NoHeader."
Splitter::usage = "Function to split a line."
MaxProcessed::usage ="All or number representing the number of processed lines to keep."
MaxLine::usage="All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
ForeachSplit::usage="A function to apply to each field. Default is to trim whitespace."
Transformers::usage = "Association of field index (1 based) \[Rule] f[string] applied to the matching field of each parsed row."
RecordTransformer::usage="A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
ApplyAt::usage="Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. This collects lines and are returned in ApplyAt in the returned values. It ignores skippin gand ignoring."
IgnoreLine::usage="Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
Reader::usage="Read a line from teh input stream. Return EndOfFile when end of file"

(* Everything below this point is implementation detail. *)
Begin["`Private`"]
(* Date element specifications understood by DateList[{string, elements}]. *)
MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"}
MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"}

(* Field transformers: parse a date/time string with the matching element
   specification and convert it to an absolute time. *)
MDYH12MSMAMTransformer = AbsoluteTime[DateList[{#, MDYH12MSMAM }]] &
MDYH24MSTransformer = AbsoluteTime[DateList[{#, MDYH24MS}]] &

(* Record splitters. *)
WhitespaceSplitter = StringSplit[#,Whitespace..]&

(* The third argument All keeps zero-length fields at the beginning and end of
   the record. Without it ",a," would yield {"a"} instead of {"", "a", ""},
   which shifts the column positions that field Transformers rely on.
   Note that an empty record now yields {""} (one empty field). *)
SplitFieldOnComma = StringSplit[#, ",", All]&

(* Per-field processor: strip surrounding whitespace. *)
ForeachTrim = StringTrim[#]&

(* Line filter that never ignores a line. *)
IncludeAllLines = False &

(* Reader: one record (by default one line) from the stream; Read returns
   EndOfFile once the stream is exhausted. *)
ReadSingleRecord = Read[#, Record]&

(* Number of leading records to skip. *)
NoHeader = 0
SingleLineHeader = 1

(* Pass-through functions. *)
IdentityRecordTransformer = #&
CopyLine = #&
(* Import a file using the functions to customize the import process.

   ImportDelimited[file, opts] reads records from file one at a time with the
   Reader function, splits each kept record into fields and collects the
   parsed rows.

   Returns an association with the keys
     "Processed"    - number of records that were parsed and kept
     "LinesRead"    - number of records actually read (the end-of-file marker is not counted)
     "StartSkipped" - number of records skipped because of StartSkip
     "Ignored"      - number of records rejected by IgnoreLine
     "ApplyAt"      - list of {line index, result} pairs produced by the ApplyAt functions
     "Data"         - list of parsed rows ({} when no row was kept)
   or $Failed when the file cannot be opened for reading. *)
ImportDelimited[file_String?FileExistsQ,
  opts : OptionsPattern[{
    (* lines to skip at the start of the stream, ignores results of IgnoreLines *)
    StartSkip -> NoHeader,
    (* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
    MaxProcessed -> All,
    (* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
    MaxLine -> All,
    (* function to split a line *)
    Splitter -> SplitFieldOnComma,
    (* could use Identity[#]& to preserve surrounding whitespace. Default is to trim whitespace. *)
    ForeachSplit -> ForeachTrim,
    (* field index \[Rule] f[string] *)
    Transformers -> <||>,
    (* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
    RecordTransformer -> IdentityRecordTransformer,
    (* line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically one line of the input stream.
       The resulting value is returned in a applyAt list that has {line, result value} tuples. ApplyAt
       uses simple AppendTo so use it sparingly to collect values. It ignores skipping and ignoring. *)
    ApplyAt -> <||>,
    (* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
    IgnoreLine -> IncludeAllLines,
    (* read a line from the input stream. Return EndOfFile when end of file. *)
    Reader -> ReadSingleRecord}]] :=
  Module[{str, reaped, line, parsed, tag,
      specials = {}, linecount = 0, ignoredcount = 0, startcount = 0, linesprocessed = 0,
      rr = OptionValue[Reader],
      sp = OptionValue[Splitter],
      applies = OptionValue[ApplyAt],
      recordTransformer = OptionValue[RecordTransformer],
      transformers = OptionValue[Transformers],
      fieldProcessor = OptionValue[ForeachSplit],
      ignoreLine = OptionValue[IgnoreLine],
      skips = OptionValue[StartSkip],
      (* a negative limit means "no limit" *)
      maxlines = Replace[OptionValue[MaxLine], All -> -1],
      maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},

    str = OpenRead[file];
    (* Without this guard the loop below would never see EndOfFile. *)
    If[str === $Failed, Return[$Failed, Module]];

    (* tag is a Module-local symbol, so only rows sown here are reaped. *)
    reaped = Reap[
      (* Both limits are checked before reading, so MaxLine -> 0 and
         MaxProcessed -> 0 read nothing at all. *)
      While[(maxlines < 0 || linecount < maxlines) &&
            (maxProcessed < 0 || linesprocessed < maxProcessed),
        line = rr @ str;
        If[line === EndOfFile, Break[]];
        (* count only real records, so "LinesRead" excludes the EOF marker *)
        linecount++;
        (* If an ApplyAt function exists for this record, use it. *)
        If[KeyExistsQ[applies, linecount],
          With[{aa = applies[[Key[linecount]]]},
            AppendTo[specials, {linecount, aa @ line}]]];
        If[linecount <= skips, startcount++; Continue[]];
        If[ignoreLine @ line, ignoredcount++; Continue[]];
        parsed = recordTransformer @ (fieldProcessor /@ (sp @ line));
        (* If a transform has been specified for a field, transform it and
           replace the string value in place. *)
        If[Length[transformers] > 0,
          MapIndexed[
            With[{index = First[#2]},
              If[KeyExistsQ[transformers, index],
                parsed[[index]] = transformers[index] @ #1]] &,
            parsed]];
        Sow[parsed, tag];
        linesprocessed++
      ],
      tag];
    Close[str];

    <|"Processed" -> linesprocessed,
      "LinesRead" -> linecount,
      "StartSkipped" -> startcount,
      "Ignored" -> ignoredcount,
      "ApplyAt" -> specials,
      (* Last[reaped] is {} when nothing was sown and {{row1, row2, ...}}
         otherwise; flattening one level yields the list of rows in both cases. *)
      "Data" -> Flatten[Last[reaped], 1]|>
  ]
(* Leave the Private implementation context opened by Begin. *)
End[]

(* Close the package: restores the previous $Context and puts
   ImportUtilities` on $ContextPath. *)
EndPackage[]
Comments
Post a Comment