Skip to content

gramat/get-and-clean-cours-project

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

23 Commits
 
 
 
 
 
 
 
 

Repository files navigation

#### The repo get-and-clean-cours-project contains Coursera.org

Data Science Specialization

Getting & Cleaning Data course project files

D.Gramatchikov

###### Script run_analysis.R

###### NEEDS: The training and test data sets and descriptive files extracted from

https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip.

Folder UCI HAR Dataset must be in the current work directory.

###### DOES:

  • Merges the training and the test sets to create one data set.
  • Extracts only the measurements on the mean and standard deviation for each measurement.
  • Uses descriptive activity names to name the activities in the data set
  • Appropriately labels the data set with descriptive activity names.
  • Creates a second, independent tidy data set with the average of each variable for each activity and each subject.

Preparatory steps are:

Write a function that returns the path names of the necessary data files

and makes it possible to test whether these files exist at all

(in UCI HAR Dataset folder or in train folder

or just in work directory)

getDataPath <- function(dataDir, dataSubDir, dataFile){
  # Locate a data file by probing three candidate locations, in this order:
  #   1. <dataDir>/<dataSubDir>/<dataFile>
  #   2. <dataSubDir>/<dataFile>
  #   3. <dataFile>
  # Returns the first candidate for which file.exists() is TRUE,
  # or NA when none of the candidates exists.
  candidates <- c(
    file.path(dataDir, dataSubDir, dataFile),
    file.path(dataSubDir, dataFile),
    file.path(dataFile)
  )
  for (candidate in candidates) {
    if (file.exists(candidate)) {
      return(candidate)
    }
  }
  NA
}

define the folder names and data files variables

    # Folder layout of the extracted archive.
    dataDir <- "UCI HAR Dataset"
    trainDir <- "train"
    testDir <- "test"
    # Required files: positions 1-3 are looked up in the train folder,
    # 4-6 in the test folder, 7-8 directly in the data folder.
    dataFiles <- c(
        "X_train.txt", "subject_train.txt", "y_train.txt",
        "X_test.txt", "subject_test.txt", "y_test.txt",
        "activity_labels.txt", "features.txt"
    )
    # One (initially empty) path slot per entry of dataFiles.
    dataPaths <- character(8)

test if files exist and if they do - get path to files

    # Resolve each required file to an existing path (NA when not found).
    # Files 1-3 are looked up under the train folder, 4-6 under the test folder.
    for (i in 1:3) {
        dataPaths[i] <- getDataPath(dataDir, trainDir, dataFiles[i])
    }
    for (i in 4:6) {
        dataPaths[i] <- getDataPath(dataDir, testDir, dataFiles[i])
    }
    # Files 7-8 sit directly in the data folder. Use "." as the parent:
    # file.path("", dataDir, ...) would yield "/UCI HAR Dataset/...", i.e. a
    # probe of the filesystem root rather than of the working directory.
    for (i in 7:8) {
        dataPaths[i] <- getDataPath(".", dataDir, dataFiles[i])
    }

if some files do not exist - stop script

    # Abort when any required file could not be located, naming each of them.
    if (any(is.na(dataPaths))) {
        # stop() concatenates its arguments without a separator, so the
        # missing names are joined explicitly to keep the message readable.
        stop("Sorry, files: ",
             paste(dataFiles[is.na(dataPaths)], collapse = ", "),
             " - not found. Script is stopped.")
    }

else, if all files are found

getting train data

    # Load the three train files resolved into dataPaths[1:3], then bind
    # them column-wise in the order: subject, activity, measurements.
    xTrain <- read.table(file = dataPaths[1])
    subjectTrain <- read.table(file = dataPaths[2], col.names = "subject")
    yTrain <- read.table(file = dataPaths[3], col.names = "activity")
    trainData <- cbind(subjectTrain, yTrain, xTrain)

train data set is ready;

    # remove subjectTrain, yTrain, xTrain (just for memory saving)
    rm(subjectTrain, yTrain, xTrain)

getting test data

    # Load the three test files resolved into dataPaths[4:6], then bind
    # them column-wise in the order: subject, activity, measurements.
    xTest <- read.table(file = dataPaths[4])
    subjectTest <- read.table(file = dataPaths[5], col.names = "subject")
    yTest <- read.table(file = dataPaths[6], col.names = "activity")
    testData <- cbind(subjectTest, yTest, xTest)

test data set is ready;

    # remove subjectTest, yTest, xTest (just for memory saving)
    rm(subjectTest, yTest, xTest)

Combine the Data in samsungData set

    # Stack the train rows on top of the test rows, then drop the two
    # partial sets from the workspace.
    samsungData <- rbind(trainData, testData)
    rm(list = c("trainData", "testData"))

get the names of activities

    # Read the activity label table and keep its second column as a
    # plain character vector.
    actNames <- read.table(file = dataPaths[7])
    actNames <- as.character(actNames[[2]])

set activities data in data set as factor with activities names

    # Convert the numeric activity codes to a factor labelled with the activity
    # names. Codes are mapped explicitly (code k -> k-th name), so a code that
    # is absent from the data cannot shift the labels of the remaining codes.
    # Assumes actNames is ordered by activity code 1..n, as positional level
    # assignment also requires.
    samsungData$activity <- factor(samsungData$activity,
                                   levels = seq_along(actNames),
                                   labels = actNames)

get the labels of data set variables

    # Read the feature table and keep its second column as a plain
    # character vector.
    varNames <- read.table(file = dataPaths[8])
    varNames <- as.character(varNames[[2]])

transform labels

    # Turn the raw feature names into dot-separated labels:
    #   1. replace each of - ( ) , with a dot
    #   2. drop any trailing run of non-alphanumeric characters
    #   3. collapse every remaining non-alphanumeric run into a single dot
    varNames <- gsub(pattern = "[-(),]", replacement = ".", x = varNames)
    varNames <- gsub(pattern = "[^0-9A-Za-z]+$", replacement = "", x = varNames)
    varNames <- gsub(pattern = "[^0-9A-Za-z]+", replacement = ".", x = varNames)

get the indices of the mean and std measurement variables

    # Positions of the mean and std variables, in ascending order.
    # "mean" must be followed by a non-alphanumeric character or end the name,
    # so labels such as "...meanFreq..." are skipped; matching is
    # case-sensitive. Combining the two tests with | keeps each position at
    # most once even if a label matches both patterns.
    varIndex <- which(
        grepl("mean[^0-9A-Za-z]|mean[^0-9A-Za-z]*$", varNames, ignore.case = FALSE) |
            grepl("std", varNames, ignore.case = FALSE)
    )

define labels for selected columns

    # Keep only the labels at the positions held in varIndex.
    varNames <- varNames[varIndex]

Extract only the measurements on the mean and standard deviation for each measurement.

    # Shift the feature positions by two, then keep columns 1 and 2
    # together with the shifted positions.
    varIndex <- varIndex + 2
    samsungData <- samsungData[, c(1, 2, varIndex)]

set labels

    # Keep the names of the first two columns and label the rest with varNames.
    names(samsungData) <- c(names(samsungData)[1:2], varNames)

create new data set - tidyData

column names as in samsungData and 180 rows (30 subjects * 6 activities)

    # Result template: the first 180 rows of samsungData, all columns.
    tidyData <- samsungData[seq_len(180), ]

assign the cells the average of each variable for each activity and each subject.

    # Fill one row per (subject, activity) pair: subject i in 1..30 and
    # activity level j in 1..6 go to row 6 * (i - 1) + j.
    activityLevels <- levels(samsungData$activity)
    measureCols <- 3:ncol(samsungData)
    for (i in 1:30) {
      for (j in 1:6) {
        rowNum <- 6 * (i - 1) + j
        # The row mask depends only on (i, j), so compute it once per pair
        # instead of once per measurement column.
        inGroup <- (samsungData$subject == i) &
          (samsungData$activity == activityLevels[j])
        tidyData[rowNum, 1] <- i
        tidyData[rowNum, 2] <- activityLevels[j]
        # Per-column mean() over the rows of this pair.
        tidyData[rowNum, measureCols] <- vapply(
          samsungData[inGroup, measureCols, drop = FALSE], mean, numeric(1)
        )
      }
    }

save data in a file

    # Write tidyData to tidy_data.txt in the working directory, using
    # write.table()'s default settings.
    write.table(x = tidyData, file = "tidy_data.txt")

About

Getting and Cleaning Data Course Project's repo

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages