The .NET Data Frames Nuget package is for performing exploratory data analysis in any .NET language like C#. It’s similar to R data frames but takes advantage of the type safety of .NET and utilizes Linq for powerful filtering, aggregation, and data manipulation.
Install-Package Frames -Version 1.1.3
Click here for the .NET Data Frames Nuget page.
using Spearing.Utilities.Data.Frames;
using static Spearing.Utilities.Data.Frames.FrameExtensions;
// Create a new frame
Frame frame = new Frame();
// Generic lists and typed arrays are stored internally typed
frame["Names"] = new string[] { "Bob", "Mary", "Joe" };
frame["StartDate"] = new DateTime[] {
new DateTime(2016, 10, 1),
new DateTime(2016, 6, 8),
new DateTime(2017, 9, 2)
};
// The static col method allows you quickly create a Column
frame["Ages"] = col(41.0, 28.0, 35.0);
frame["LowScore"] = col(78.0, 81.0, 85.0);
// Use the ToColumn extension method to convert an IEnumerable to a Column (also works on arrays and lists)
frame["HighScore"] = new double[] { 90.0, 92.0, 87.0}.ToColumn();
// Print all columns
frame.Print();
Names StartDate Ages LowScore HighScore
Bob 10/1/2016 12:00:00 AM 41 78 90
Mary 6/8/2016 12:00:00 AM 28 81 92
Joe 9/2/2017 12:00:00 AM 35 85 87
// Print selected columns
frame.Print("Names", "HighScore");
Names HighScore
Bob 90
Mary 92
Joe 87
// Print a column
frame["Names"].Print();
Names
Bob
Mary
Joe
// The As extension method retrieves the data with typing
// Make sure to use the type that the column was created with
double averageAge = frame["Ages"].As<double>().Average();
double totalScore = frame["HighScore"].As<double>().Sum();
double[] ages = frame["Ages"].As<double>().ToArray();
// Columns inherit from List
List<double> lowScores = frame["LowScore"].As<double>();
frame["ScoreDiff"] = frame["HighScore"].As<double>() - frame["LowScore"].As<double>();
frame["HighPlus1"] = frame["HighScore"].As<double>() + 1.0;
frame["Hours"] = new double[] { 25, 30, 38 };
double[] hourlyRate = new double[] { 15, 20, 12 };
frame["Pay"] = frame["Hours"].As<double>() * hourlyRate;
frame.SaveCsv(@"c:\temp\Employees.csv");
public class Employee
{
public string Name { get; set; }
public int Age { get; set; }
public double HighScore { get; set; }
}
// Use the ToFrame method to convert a strongly typed collection to a frame
List<Employee> employees = new List<Employee>()
{
new Employee() {Name = "Bob", Age = 40, HighScore = 90.0 },
new Employee() {Name = "Mary", Age = 28, HighScore = 92.0 },
new Employee() {Name = "Joe", Age = 35, HighScore = 87.0 }
};
Frame employeesFrame = employees.ToFrame();
employeesFrame.Print();
Name Age HighScore
Bob 40 90
Mary 28 92
Joe 35 87
// Use Linq predicates to filter data
DateTime startDate = new DateTime(2016, 9, 1);
Frame newEmployees = frame
.Where(row => row.Get<DateTime>("StartDate") >= startDate)
.ToFrame();
newEmployees.Print();
Names StartDate Ages LowScore HighScore ScoreDiff HighPlus1 Hours Pay
Bob 10/1/2016 12:00:00 AM 41 78 90 12 91 25 375
Joe 9/2/2017 12:00:00 AM 35 85 87 2 88 38 456
// Group data and use anonymous types to create a new frame
Frame empYearSummary = frame
.GroupBy(row => row.Get<DateTime>("StartDate").Year)
.Select(grp => new
{
Year = grp.Key,
AverageAge = grp.Average(row => row.Get<double>("Ages")),
Count = grp.Count()
})
.ToFrame();
empYearSummary.Print();
Year AverageAge Count
2016 34.5 2
2017 35 1
// Local file
Frame employeesLocal = Frame.ReadCSV<string, DateTime, double, double, double, double, double>(@"c:\temp\Employees.csv");
// Web site
Frame employeesWeb = Frame.ReadCSV<string, DateTime, double, double, double, double, double>(@"http://www.spearing.com/files/Employees.csv");
// Git web site
Frame employeesGit = Frame.ReadCSV<string, DateTime, double, double, double, double, double>(@"https://raw.githubusercontent.com/jackimburgia/Frames/master/Employees.csv");
// Get the top rows of the frame
IEnumerable<Row> headEmployees = employeesGit.Head();
headEmployees.Print();
Names StartDate Ages HighScore LowScore ScoreDiff HighPlus1
Bob 10/1/2016 12:00:00 AM 41 90 78 12 91
Mary 6/8/2016 12:00:00 AM 28 92 81 11 93
Joe 9/2/2017 12:00:00 AM 35 87 85 2 88
// Get the last rows of the frame
IEnumerable<Row> tailEmployees = employeesGit.Tail();
tailEmployees.Print();
Names StartDate Ages HighScore LowScore ScoreDiff HighPlus1
Bob 10/1/2016 12:00:00 AM 41 90 78 12 91
Mary 6/8/2016 12:00:00 AM 28 92 81 11 93
Joe 9/2/2017 12:00:00 AM 35 87 85 2 88
// Create the frame that will be the left side
// Use the static c method to create a typed array
Frame books = new Frame();
books["Name"] = c("Tukey", "Venables", "Tierney", "Ripley", "Ripley", "McNeil", "R Core");
books["Title"] = c("Exploratory Data Analysis",
"Modern Applied Statistics ...",
"LISP-STAT",
"Spatial Statistics", "Stochastic Simulation",
"Interactive Data Analysis",
"An Introduction to R");
books["Other.Author"] = c(null, "Ripley", null, null, null, null, "Venables & Smith");
// Create the frame that will be the right side
Frame authors = new Frame();
authors["Surname"] = c("Tukey", "Venables", "Tierney", "Ripley", "McNeil");
authors["Nationality"] = c("US", "Australia", "US", "UK", "Australia");
authors["Deceased"] = c(true, false, false, false, false);
// Perform an "inner" style join; only display rows where keys match
Frame joined = books.Join(authors,
row => row.Get<string>("Name"),
row => row.Get<string>("Surname")
);
joined.Print();
6 of the 7 rows from the books frame match an author.
Name Title Other.Author Surname Nationality Deceased
Tukey Exploratory Data Analysis Tukey US True
Venables Modern Applied Statistics ... Ripley Venables Australia False
Tierney LISP-STAT Tierney US False
Ripley Spatial Statistics Ripley UK False
Ripley Stochastic Simulation Ripley UK False
McNeil Interactive Data Analysis McNeil Australia False
// Perform a left "outer" style join
// Display all rows from left table and only the values from the right
// table where the keys match
Frame outerJoin = books.OuterJoin(authors,
row => row.Get<string>("Name"),
row => row.Get<string>("Surname")
);
outerJoin.Print();
All of the rows from the books frame are displayed. "R Core" does not match but is still displayed. Default values are displayed on that row on the authors side.
Name Title Other.Author Surname Nationality Deceased
Tukey Exploratory Data Analysis Tukey US True
Venables Modern Applied Statistics ... Ripley Venables Australia False
Tierney LISP-STAT Tierney US False
Ripley Spatial Statistics Ripley UK False
Ripley Stochastic Simulation Ripley UK False
McNeil Interactive Data Analysis McNeil Australia False
R Core An Introduction to R Venables & Smith False
Frame sites = new Frame();
sites["State"] = c("IL", "IL", "IN");
sites["Site"] = c(1, 2, 1);
sites["Latitude"] = c(42.46757, 42.04915, 41.6814);
sites["Longitude"] = c(-87.81005, -88.27303, -87.49473);
Frame parameters = new Frame();
parameters["Region"] = c("IL", "IN", "IL", "IL");
parameters["Monitor"] = c(1, 1, 2, 2);
parameters["Parameter"] = c("ozone", "so2", "ozone", "no2");
parameters["Duration"] = c("1h", "1h", "8h", "1h");
// Create anonymous types that will act as keys to join on each frame
var multipleKey = sites
.Join(parameters,
row => new { State = row.Get<string>("State"), Site = row.Get<int>("Site") },
row => new { State = row.Get<string>("Region"), Site = row.Get<int>("Monitor") }
);
multipleKey.Print();
State Site Latitude Longitude Region Monitor Parameter Duration
IL 1 42.46757 -87.81005 IL 1 ozone 1h
IL 2 42.04915 -88.27303 IL 2 ozone 8h
IL 2 42.04915 -88.27303 IL 2 no2 1h
IN 1 41.6814 -87.49473 IN 1 so2 1h
Frame meltData = new Frame();
meltData["FactorA"] = c("Low", "Medium", "High", "Low", "Medium", "High", "Low", "Medium", "High");
meltData["FactorB"] = c("Low", "Low", "Low", "Medium", "Medium", "Medium", "High", "High", "High");
meltData["Group1"] = c(-1.1616334, -0.5991478, 0.8420797, 1.6225569, -0.3450745, 1.6025044, -1.2991011, -0.49064, 0.3897769);
meltData["Group2"] = c(-0.5228371, -1.0461138, -1.5413266, -1.2706469, -1.3377985, 0.7631882, -0.2223622, -1.1802192, -0.3832142);
meltData["Group3"] = c(-0.6587093, -0.1942979, 0.6318852, -0.8026467, 1.4988363, -0.5375833, -0.6321478, 0.1235253, 0.6671101);
meltData["Group4"] = c(0.45064563, 2.47985577, -0.98948125, -0.32332181, 0.36541918, 0.85028148, -1.57284216, 0.09891793, 0.23407257);
meltData.Print();
FactorA FactorB Group1 Group2 Group3 Group4
Low Low -1.1616334 -0.5228371 -0.6587093 0.45064563
Medium Low -0.5991478 -1.0461138 -0.1942979 2.47985577
High Low 0.8420797 -1.5413266 0.6318852 -0.98948125
Low Medium 1.6225569 -1.2706469 -0.8026467 -0.32332181
Medium Medium -0.3450745 -1.3377985 1.4988363 0.36541918
High Medium 1.6025044 0.7631882 -0.5375833 0.85028148
Low High -1.2991011 -0.2223622 -0.6321478 -1.57284216
Medium High -0.49064 -1.1802192 0.1235253 0.09891793
High High 0.3897769 -0.3832142 0.6671101 0.23407257
Frame melted = meltData.Melt(
c("FactorA", "FactorB"),
c("Group1", "Group2")
);
melted.Print();
FactorA FactorB Variable Value
Low Low Group1 -1.1616334
Medium Low Group1 -0.5991478
High Low Group1 0.8420797
Low Medium Group1 1.6225569
Medium Medium Group1 -0.3450745
High Medium Group1 1.6025044
Low High Group1 -1.2991011
Medium High Group1 -0.49064
High High Group1 0.3897769
Low Low Group2 -0.5228371
Medium Low Group2 -1.0461138
High Low Group2 -1.5413266
Low Medium Group2 -1.2706469
Medium Medium Group2 -1.3377985
High Medium Group2 0.7631882
Low High Group2 -0.2223622
Medium High Group2 -1.1802192
High High Group2 -0.3832142