Question

I'm trying to get a script to run on each individual column of a CSV file. I've figured out how to tell Python which column I would like to run the script on, but I want it to analyze column one, output the results, then move to column two, and continue on through the rest of the file. What I want is an "if etc. goto etc." command. I've found how to do this with simple one-liners, but I have a larger script. Any help would be great, as I'm sure I'm just missing something. For example, I would like to loop back to where I define my data (h=data) but tell it to choose the next column. Here is my script.

import numpy as np
import matplotlib.pyplot as plt
from pylab import * 
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize

a=raw_input("Data file name? ") #Name of the data file including the directory, must be .csv

datafile = open(a, 'r')
data = []
for row in datafile:
    data.append(row.strip().split(',')) #opening and organizing the csv file
print('Data points= ', len(data))
print data
c=raw_input("Is there a header row? y/n?") #Remove header line if present
if c is ('y'):
    del data[0]
    data2=data
    print('Raw data= ', data2)
else:
    print('Raw data= ', data)
'''
#if I wanted to select a column
b=input("What column to analyze?") #Asks what column depth data is in
if b is 1: 
    h=[[rowa[i] for rowa in data] for i in range(1)] #first row
'''
h=data # all columns
g=reduce(lambda x,y: x+y,h) #prepares data for calculations
a=map(float, g)
a.sort()
print ('Organized data= ',a)

def GRLC(values):
    '''
    Calculate Gini index, Gini coefficient, Robin Hood index, and points of
    Lorenz curve based on the instructions given in
    www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator
    Lorenz curve values as given as lists of x & y points [[x1, x2], [y1, y2]]

    The input is not modified; a sorted copy is used internally. All x and y
    values are percentages (0 to 100), and the curve has len(values) + 1
    points, starting at (0, 0).

    @param values: List of values
    @return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
    @raise AssertionError: if values is empty
    @raise ZeroDivisionError: if the values (plain Python numbers) sum to zero
    '''

    n = len(values)
    assert(n > 0), 'Empty list of values'
    sortedValues = sorted(values) #Sort smallest to largest

    #Find cumulative totals with one running sum, so the work is linear
    #in n instead of re-summing a growing slice for every prefix
    cumm = [0]
    runningTotal = 0
    for value in sortedValues:
        runningTotal += value
        cumm.append(runningTotal)

    total = float(cumm[n]) #Grand total, as float to force true division

    #Calculate Lorenz points
    LorenzPoints = [[], []]
    sumYs = 0           #Sum of all y values
    robinHoodIdx = -1   #Robin Hood index: largest gap x_i - y_i
    for i in range(n + 1):
        x = 100.0 * i/n
        y = 100.0 * (cumm[i]/total)
        LorenzPoints[0].append(x)
        LorenzPoints[1].append(y)
        sumYs += y
        maxX_Y = x - y
        if maxX_Y > robinHoodIdx: robinHoodIdx = maxX_Y

    giniIdx = 100 + (100 - 2 * sumYs)/n #Gini index

    return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]

result = GRLC(a)
print 'Gini Index', result[0]  
print 'Gini Coefficient', result[1]
print 'Robin Hood Index', result[2]
Was it helpful?

Solution

I'm ignoring all of that GRLC function and just solving the looping question. Give this a try. It uses while True: to loop forever (you can just break out by ending the program; Ctrl+C in Windows, depends on OS). Just load the data from the csv once then each time it loops, you can re-build some variables. If you have questions please ask. Also, I didn't test it as I don't have all the NumPy packages installed :)

import numpy as np
import matplotlib.pyplot as plt
from pylab import * 
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize

def GRLC(values):
    '''
    Calculate Gini index, Gini coefficient, Robin Hood index, and points of
    Lorenz curve based on the instructions given in
    www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator
    Lorenz curve values as given as lists of x & y points [[x1, x2], [y1, y2]]

    The input is not modified; a sorted copy is used internally. All x and y
    values are percentages (0 to 100), and the curve has len(values) + 1
    points, starting at (0, 0).

    @param values: List of values
    @return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
    @raise AssertionError: if values is empty
    @raise ZeroDivisionError: if the values (plain Python numbers) sum to zero
    '''

    n = len(values)
    assert(n > 0), 'Empty list of values'
    sortedValues = sorted(values) #Sort smallest to largest

    #Find cumulative totals with one running sum, so the work is linear
    #in n instead of re-summing a growing slice for every prefix
    cumm = [0]
    runningTotal = 0
    for value in sortedValues:
        runningTotal += value
        cumm.append(runningTotal)

    total = float(cumm[n]) #Grand total, as float to force true division

    #Calculate Lorenz points
    LorenzPoints = [[], []]
    sumYs = 0           #Sum of all y values
    robinHoodIdx = -1   #Robin Hood index: largest gap x_i - y_i
    for i in range(n + 1):
        x = 100.0 * i/n
        y = 100.0 * (cumm[i]/total)
        LorenzPoints[0].append(x)
        LorenzPoints[1].append(y)
        sumYs += y
        maxX_Y = x - y
        if maxX_Y > robinHoodIdx: robinHoodIdx = maxX_Y

    giniIdx = 100 + (100 - 2 * sumYs)/n #Gini index

    return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]

#Name of the data file including the directory, must be .csv
a=raw_input("Data file name? ") 

datafile = open(a.strip(), 'r')
data = []

#opening and organizing the csv file
for row in datafile:
    data.append(row.strip().split(',')) 

#Remove header line if present
c=raw_input("Is there a header row? y/n?") 
if c.strip().lower() == ('y'):
    del data[0]

while True :
    #if I want the first column, that's index 0.
    b=raw_input("What column to analyze?")

    # Validate that the column input data is correct here.  Otherwise it might be out of range, etc.
    # Maybe try this.  You might want more smarts in there, depending on your intent:
    b = int(b.strip())

    # If you expect the user to inpt "2" to mean the second column, you're going to use index 1 (list indexes are 0 based)
    h=[[rowa[b-1] for rowa in data] for i in range(1)]

    # prepares data for calculations
    g=reduce(lambda x,y: x+y,h) 
    a=map(float, g)
    a.sort()
    print ('Organized data= ',a)

    result = GRLC(a)
    print 'Gini Index', result[0]  
    print 'Gini Coefficient', result[1]
    print 'Robin Hood Index', result[2]
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top