GitHub - gramat/get-and-clean-cours-project: Getting and Cleaning Data Course Project's repo

####The repo get-and-clean-cours-project contains Coursera.org

Data Science Specialization

Getting & Cleaning Data course project files

D.Gramatchikov

######Script run_analysis.R ######NEEDS: The training and test data sets and the descriptive files extracted from

https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip.

The folder UCI HAR Dataset must be in the current working directory.

######DOES:

Merges the training and the test sets to create one data set.
Extracts only the measurements on the mean and standard deviation for each measurement.
Uses descriptive activity names to name the activities in the data set
Appropriately labels the data set with descriptive activity names.
Creates a second, independent tidy data set with the average of each variable for each activity and each subject.

Preparatory steps are:

Write a function that gets the pathnames of the necessary data files

and makes it possible to test whether these files exist at all

(in UCI HAR Dataset folder or in train folder

or just in work directory)

getDataPath <- function(dataDir, dataSubDir, dataFile){
  # Locate a data file by probing three locations, most specific first:
  #   1. dataDir/dataSubDir/dataFile
  #   2. dataSubDir/dataFile
  #   3. dataFile (relative to the working directory)
  # Returns the first path that exists, or NA when none of them does.
  candidates <- c(
    file.path(dataDir, dataSubDir, dataFile),
    file.path(dataSubDir, dataFile),
    file.path(dataFile)
  )
  for (candidate in candidates) {
    if (file.exists(candidate)) {
      return(candidate)
    }
  }
  NA
}

define the folder names and data files variables

    # Folder names used when probing for the data files.
    dataDir <- "UCI HAR Dataset"
    trainDir <- "train"
    testDir <- "test"
    # File names by position: 1-3 training set, 4-6 test set, 7-8 label files.
    dataFiles <- c(
        "X_train.txt", "subject_train.txt", "y_train.txt",
        "X_test.txt", "subject_test.txt", "y_test.txt",
        "activity_labels.txt", "features.txt"
    )
    # One resolved path per entry of dataFiles, filled in later.
    dataPaths <- character(8)

test if files exist and if they do - get path to files

    # Resolve every entry of dataFiles to an existing path (NA when not found).
    # Entries 1-3 are looked up relative to the train folder.
    for (i in 1:3) {
        dataPaths[i] <- getDataPath(dataDir, trainDir, dataFiles[i])
    }
    # Entries 4-6 are looked up relative to the test folder.
    for (i in 4:6) {
        dataPaths[i] <- getDataPath(dataDir, testDir, dataFiles[i])
    }
    # Entries 7-8 live directly in the data folder. Use "." as the top-level
    # directory: an empty string would make file.path() build a path rooted
    # at "/" and probe the filesystem root instead of the working directory.
    for (i in 7:8) {
        dataPaths[i] <- getDataPath(".", dataDir, dataFiles[i])
    }

if some files do not exist - stop script

    # Abort when any required file could not be located. The missing names are
    # joined with ", " because stop() pastes its arguments together without a
    # separator, which would otherwise run the file names into each other.
    if (any(is.na(dataPaths))) {
        stop("Sorry, files: ",
             paste(dataFiles[is.na(dataPaths)], collapse = ", "),
             " - not found. Script is stopped.")
    }

else, if all files are found

getting train data

    # Read the three training files and bind them column-wise in the order
    # subject, activity, then the columns of X_train.
    subjectTrain <- read.table(dataPaths[2], col.names = "subject")
    yTrain <- read.table(dataPaths[3], col.names = "activity")
    xTrain <- read.table(dataPaths[1])
    trainData <- cbind(subjectTrain, yTrain, xTrain)

train data set is ready;

    remove subjectTrain, yTrain, xTrain (just for memory saving)

    # Drop the intermediate training pieces now that trainData holds them.
    rm(list = c("subjectTrain", "yTrain", "xTrain"))

getting test data

    # Read the three test files and bind them column-wise in the order
    # subject, activity, then the columns of X_test.
    subjectTest <- read.table(dataPaths[5], col.names = "subject")
    yTest <- read.table(dataPaths[6], col.names = "activity")
    xTest <- read.table(dataPaths[4])
    testData <- cbind(subjectTest, yTest, xTest)

test data set is ready;

    remove subjectTest, yTest, xTest

    # Drop the intermediate test pieces now that testData holds them.
    rm(list = c("subjectTest", "yTest", "xTest"))

Combine the Data in samsungData set

    # Stack the training rows on top of the test rows, then free both inputs.
    samsungData <- rbind(trainData, testData)
    rm(list = c("trainData", "testData"))

get the names of activities

    # Keep only the second column of the activity labels file, as character.
    actNames <- as.character(read.table(dataPaths[7])[, 2])

set activities data in data set as factor with activities names

    # Map activity code k to the k-th name in actNames explicitly. Relabelling
    # with levels<- after as.factor() only works when every code occurs in the
    # data; naming the levels up front keeps the mapping correct (and the set
    # of levels complete) even if some activity is absent.
    samsungData$activity <- factor(samsungData$activity,
                                   levels = seq_along(actNames),
                                   labels = actNames)

get the labels of data set variables

    # Keep only the second column of the features file, as character.
    varNames <- as.character(read.table(dataPaths[8])[, 2])

transform labels

    # Applied innermost first:
    #   1. turn each of - ( ) , into a dot,
    #   2. strip any trailing run of non-alphanumeric characters,
    #   3. collapse every remaining non-alphanumeric run into a single dot.
    varNames <- gsub(
        "[^0-9A-Za-z]+", ".",
        gsub("[^0-9A-Za-z]+$", "", gsub("[-(),]", ".", varNames))
    )

get index of mean and std measurements variables

    # Positions of labels that contain "std", or "mean" followed by a
    # non-alphanumeric character or the end of the label (case-sensitive).
    # A single grep() returns each position once, already in increasing order.
    varIndex <- grep("mean([^0-9A-Za-z]|$)|std", varNames, ignore.case = FALSE)

define labels for selected columns

    # Keep only the labels at the positions matched by grep() on varNames.
    varNames <- varNames[varIndex]

Extract only the measurements on the mean and standard deviation for each measurement.

    # The first two columns are subject and activity, so positions within the
    # feature list are shifted by two to address columns of samsungData.
    varIndex <- 2 + varIndex
    samsungData <- samsungData[, c(1, 2, varIndex)]

set labels

    # Keep the names of the first two columns and label the rest with varNames.
    colnames(samsungData) <- c(colnames(samsungData)[1:2], varNames)

create new data set - tidyData

column names as in samsungData and 180 rows (30 subjects * 6 activities)

    # Take the first 180 rows as a template with the right columns and types;
    # the cell values are overwritten afterwards.
    tidyData <- samsungData[seq_len(180), ]

assign the cells the average of each variable for each activity and each subject.

    # Fill one row of tidyData per (subject, activity) pair: subject i in 1..30
    # and activity level j in 1..6 go to row 6 * (i - 1) + j.
    # The row mask does not depend on the column, so it is computed once per
    # pair, and the whole row of averages is assigned in a single step.
    valueCols <- 3:ncol(samsungData)
    for (i in 1:30) {
      for (j in 1:6) {
        rowNum <- 6 * (i - 1) + j
        activityName <- levels(samsungData$activity)[j]
        groupRows <- (samsungData$subject == i) &
          (samsungData$activity == activityName)
        tidyData[rowNum, 1] <- i
        tidyData[rowNum, 2] <- activityName
        # mean() per column, exactly as before, but on the pre-filtered rows.
        tidyData[rowNum, valueCols] <- lapply(
          samsungData[groupRows, valueCols, drop = FALSE], mean
        )
      }
    }

save data in a file

    # Write the tidy data set to tidy_data.txt with write.table()'s defaults.
    write.table(tidyData, file = "tidy_data.txt")

Name		Name	Last commit message	Last commit date
Latest commit History 23 Commits
CodeBook.md		CodeBook.md
README.md		README.md
run_analysis.R		run_analysis.R
tidy_data.txt		tidy_data.txt

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

About

Uh oh!

Releases

Packages

Languages

gramat/get-and-clean-cours-project

Folders and files

Latest commit

History

Repository files navigation

About

Resources

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages