/*
 * Extract the commands from an R console transcript.
 *
 * The transcript is supplied in-stream. Every record whose first
 * non-blank character is the R prompt ">" is kept; all other records
 * (R output) are discarded by the subsetting IF.
 *
 * Output data set TEST:
 *   line      - the transcript record (up to 80 characters,
 *               leading blanks removed)
 *   firstchar - first character of LINE (always ">" in kept rows)
 *   rest_line - LINE with the two-character prompt "> " removed;
 *               blank for an empty prompt line
 */
data test;
  length line $ 80;

  /* Formatted input reads the whole record. List input with the "&"
     modifier would end the value at the first run of two or more
     blanks and so truncate commands that contain consecutive spaces.
     TRUNCOVER keeps a short record from flowing onto the next one. */
  infile datalines truncover;
  input line $80.;

  firstchar = substr(line, 1, 1);

  /* No length argument: take everything from column 3 to the end of
     LINE. A fixed length of 77 stops at column 79 and drops the last
     character of a full 80-character record. */
  rest_line = substr(line, 3);

  /* Subsetting IF: only prompt lines reach the output data set. */
  if firstchar = ">";

  /* DATALINES4 ends only at a record of four semicolons, so a
     semicolon inside the transcript cannot cut the data short. */
  datalines4;
> d <- read.csv( "data", header = TRUE, sep = "\t" )
> str(d)
'data.frame': 7 obs. of 4 variables:
$ Name : Factor w/ 7 levels "Bill","Jack",..: 5 3 6 4 1 7 2
$ Height: num 6.2 5.5 5.7 5.6 5.8 6.1 6
$ Weight: num 192 155 164 166 186 ...
$ Gender: int 0 1 1 1 0 0 0
>
> mean( d$Weight )
[1] 180.1429
> mean( d[,3] )
[1] 180.1429
>
> mean( d$Weight[ d$Gender == 1 ] )
[1] 162.0333
> mean( d$Weight[ 2:4 ] )
[1] 162.0333
>
> d$Diff <- d$Height - mean( d$Height )
> print(d)
Name Height Weight Gender Diff
1 Joe 6.2 192.2 0 0.35714286
2 Jane 5.5 155.4 1 -0.34285714
3 Mary 5.7 164.3 1 -0.14285714
4 Jill 5.6 166.4 1 -0.24285714
5 Bill 5.8 185.8 0 -0.04285714
6 Pete 6.1 201.7 0 0.25714286
7 Jack 6.0 195.2 0 0.15714286
> summary(d)
Name Height Weight Gender Diff
Bill:1 Min. :5.500 Min. :155.4 Min. :0.0000 Min. :-3.429e-01
Jack:1 1st Qu.:5.650 1st Qu.:165.3 1st Qu.:0.0000 1st Qu.:-1.929e-01
Jane:1 Median :5.800 Median :185.8 Median :0.0000 Median :-4.286e-02
Jill:1 Mean :5.843 Mean :180.1 Mean :0.4286 Mean : 2.538e-16
Joe :1 3rd Qu.:6.050 3rd Qu.:193.7 3rd Qu.:1.0000 3rd Qu.: 2.071e-01
Mary:1 Max. :6.200 Max. :201.7 Max. :1.0000 Max. : 3.571e-01
Pete:1
>
> d$Gender <- factor( d$Gender, labels = c("M", "F") )
> summary(d)
Name Height Weight Gender Diff
Bill:1 Min. :5.500 Min. :155.4 M:4 Min. :-3.429e-01
Jack:1 1st Qu.:5.650 1st Qu.:165.3 F:3 1st Qu.:-1.929e-01
Jane:1 Median :5.800 Median :185.8 Median :-4.286e-02
Jill:1 Mean :5.843 Mean :180.1 Mean : 2.538e-16
Joe :1 3rd Qu.:6.050 3rd Qu.:193.7 3rd Qu.: 2.071e-01
Mary:1 Max. :6.200 Max. :201.7 Max. : 3.571e-01
Pete:1
>
> plot( d$Height ~ d$Gender )
> plot( d$Height ~ d$Weight, xlab="Weight", ylab="Height" )
> m <- lm( d$Height ~ d$Weight )
> print(m)
Call:
lm(formula = d$Height ~ d$Weight)
Coefficients:
(Intercept) d$Weight
3.39918 0.01357
> abline(m)
> abline( mean(d$Height), 0, lty=2 )
;;;;

/* List the retained prompt lines. */
proc print data=test;
run;