Manage read errors.

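If instructed to continue on errors (the new --ignore-errors option), files that cannot be read are stored in the db flagged as unreadable, with their hash marked as not valid.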
2024-11-25 00:53:43 +00:00 · 2015-12-10 12:13:16 +00:00 · 2015-12-10 12:13:16 +00:00 · a0b87e6a2d
commit a0b87e6a2d
parent ed3dea8f2c
8 changed files with 81 additions and 33 deletions
--- a/dindexer.sql
+++ b/dindexer.sql
@ -4,7 +4,7 @@

 -- Dumped from database version 9.4.5
 -- Dumped by pg_dump version 9.4.5
-- Started on 2015-12-04 12:29:56 GMT
+-- Started on 2015-12-10 12:11:34 GMT

 SET statement_timeout = 0;
 SET lock_timeout = 0;
@ -22,7 +22,7 @@ CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;


 --
-- TOC entry 2038 (class 0 OID 0)
+-- TOC entry 2039 (class 0 OID 0)
 -- Dependencies: 178
 -- Name: EXTENSION plpgsql; Type: COMMENT; Schema: -; Owner: 
 --
@ -70,12 +70,23 @@ CREATE TABLE files (
    is_hash_valid boolean DEFAULT true NOT NULL,
    access_time timestamp with time zone,
    modify_time timestamp with time zone,
-    CONSTRAINT chk_files_dirsize_zero CHECK (((is_directory = false) OR (size = 0)))
+    unreadable boolean NOT NULL,
+    CONSTRAINT chk_files_dirsize_zero CHECK (((is_directory = false) OR (size = 0))),
+    CONSTRAINT chk_hash_0 CHECK ((((NOT unreadable) AND is_hash_valid) OR ((NOT is_hash_valid) AND (hash ~ '^0+$'::text))))
 );


 ALTER TABLE files OWNER TO @USERNAME@;

+--
+-- TOC entry 2040 (class 0 OID 0)
+-- Dependencies: 175
+-- Name: CONSTRAINT chk_hash_0 ON files; Type: COMMENT; Schema: public; Owner: @USERNAME@
+--
+
+COMMENT ON CONSTRAINT chk_hash_0 ON files IS 'Make sure hash is 0 if unreadable or not valid are set.';
+
+
 --
 -- TOC entry 174 (class 1259 OID 31279)
 -- Name: files_id_seq; Type: SEQUENCE; Schema: public; Owner: @USERNAME@
@ -92,7 +103,7 @@ CREATE SEQUENCE files_id_seq
 ALTER TABLE files_id_seq OWNER TO @USERNAME@;

 --
-- TOC entry 2039 (class 0 OID 0)
+-- TOC entry 2041 (class 0 OID 0)
 -- Dependencies: 174
 -- Name: files_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: @USERNAME@
 --
@ -118,7 +129,7 @@ CREATE TABLE sets (
 ALTER TABLE sets OWNER TO @USERNAME@;

 --
-- TOC entry 2040 (class 0 OID 0)
+-- TOC entry 2042 (class 0 OID 0)
 -- Dependencies: 177
 -- Name: COLUMN sets.type; Type: COMMENT; Schema: public; Owner: @USERNAME@
 --
@ -148,7 +159,7 @@ CREATE SEQUENCE sets_id_seq
 ALTER TABLE sets_id_seq OWNER TO @USERNAME@;

 --
-- TOC entry 2041 (class 0 OID 0)
+-- TOC entry 2043 (class 0 OID 0)
 -- Dependencies: 176
 -- Name: sets_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: @USERNAME@
 --
@ -165,7 +176,7 @@ ALTER TABLE ONLY files ALTER COLUMN id SET DEFAULT nextval('files_id_seq'::regcl


 --
-- TOC entry 1907 (class 2604 OID 31414)
+-- TOC entry 1908 (class 2604 OID 31414)
 -- Name: id; Type: DEFAULT; Schema: public; Owner: @USERNAME@
 --

@ -173,7 +184,7 @@ ALTER TABLE ONLY sets ALTER COLUMN id SET DEFAULT nextval('sets_id_seq'::regclas


 --
-- TOC entry 1915 (class 2606 OID 31289)
+-- TOC entry 1916 (class 2606 OID 31289)
 -- Name: pk_files_id; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace: 
 --

@ -182,7 +193,7 @@ ALTER TABLE ONLY files


 --
-- TOC entry 1919 (class 2606 OID 31420)
+-- TOC entry 1920 (class 2606 OID 31420)
 -- Name: pk_sets_id; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace: 
 --

@ -191,7 +202,7 @@ ALTER TABLE ONLY sets


 --
-- TOC entry 1917 (class 2606 OID 31294)
+-- TOC entry 1918 (class 2606 OID 31294)
 -- Name: uniq_item; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace: 
 --

@ -200,7 +211,7 @@ ALTER TABLE ONLY files


 --
-- TOC entry 1912 (class 1259 OID 31426)
+-- TOC entry 1913 (class 1259 OID 31426)
 -- Name: fki_files_sets; Type: INDEX; Schema: public; Owner: @USERNAME@; Tablespace: 
 --

@ -208,7 +219,7 @@ CREATE INDEX fki_files_sets ON files USING btree (group_id);


 --
-- TOC entry 1913 (class 1259 OID 31292)
+-- TOC entry 1914 (class 1259 OID 31292)
 -- Name: idx_paths; Type: INDEX; Schema: public; Owner: @USERNAME@; Tablespace: 
 --

@ -216,7 +227,7 @@ CREATE INDEX idx_paths ON files USING btree (path);


 --
-- TOC entry 1921 (class 2620 OID 31291)
+-- TOC entry 1922 (class 2620 OID 31291)
 -- Name: triggerupcasehash; Type: TRIGGER; Schema: public; Owner: @USERNAME@
 --

@ -224,7 +235,7 @@ CREATE TRIGGER triggerupcasehash BEFORE INSERT OR UPDATE ON files FOR EACH ROW E


 --
-- TOC entry 1920 (class 2606 OID 31421)
+-- TOC entry 1921 (class 2606 OID 31421)
 -- Name: fk_files_sets; Type: FK CONSTRAINT; Schema: public; Owner: @USERNAME@
 --

@ -233,7 +244,7 @@ ALTER TABLE ONLY files


 --
-- TOC entry 2037 (class 0 OID 0)
+-- TOC entry 2038 (class 0 OID 0)
 -- Dependencies: 8
 -- Name: public; Type: ACL; Schema: -; Owner: postgres
 --
@ -244,7 +255,7 @@ GRANT ALL ON SCHEMA public TO postgres;
 GRANT ALL ON SCHEMA public TO PUBLIC;


-- Completed on 2015-12-04 12:29:59 GMT
+-- Completed on 2015-12-10 12:11:36 GMT

 --
 -- PostgreSQL database dump complete
--- a/src/scan/commandline.cpp
+++ b/src/scan/commandline.cpp
@ -83,6 +83,7 @@ namespace din {
 #else
 			("type,t", po::value<char>()->default_value('V'), type_param_help.c_str())
 #endif
+			("ignore-errors", "Move on even if reading a file fails. Unreadable files are marked as such in the db.")
 		;
 		po::options_description positional_options("Positional options");
 		positional_options.add_options()
--- a/src/scan/dbbackend.cpp
+++ b/src/scan/dbbackend.cpp
@ -94,6 +94,7 @@ namespace din {
 	}

 	void write_to_db (const DinDBSettings& parDB, const std::vector<FileRecordData>& parData, const SetRecordData& parSetData) {
+		auto bool_to_str = [](bool b) { return (b ? "true" : "false"); };
 		if (parData.empty()) {
 			return;
 		}
@ -111,7 +112,7 @@ namespace din {
 			std::ostringstream query;
 			query << "INSERT INTO \"files\" " <<
 				"(path, hash, level, group_id, is_directory, is_symlink, size, " <<
-				"access_time, modify_time) VALUES "
+				"access_time, modify_time, is_hash_valid, unreadable) VALUES "
 			;

 			const char* comma = "";
@ -121,10 +122,12 @@ namespace din {
 				query << '(' << conn.escaped_literal(itm.path) << ",'" << itm.hash << "',"
 					<< itm.level << ','
 					<< "currval('\"sets_id_seq\"')" << ','
-					<< (itm.is_directory ? "true" : "false") << ','
+					<< bool_to_str(itm.is_directory) << ','
 					<< (itm.is_symlink ? "true" : "false") << ',' << itm.size
 					<< ',' << '\'' << time_to_str(itm.atime, strtime_buff.get(), strtime_buff_size) << '\''
 					<< ',' << '\'' << time_to_str(itm.mtime, strtime_buff.get(), strtime_buff_size) << '\''
+					<< ',' << bool_to_str(itm.hash_valid)
+					<< ',' << bool_to_str(itm.unreadable)
 				<< ')';
 				comma = ",";
 			}
--- a/src/scan/dbbackend.hpp
+++ b/src/scan/dbbackend.hpp
@ -36,6 +36,8 @@ namespace din {
 		uint64_t size;
 		bool is_directory;
 		bool is_symlink;
+		bool unreadable;
+		bool hash_valid;
 	};

 	struct SetRecordDataFull {
--- a/src/scan/indexer.cpp
+++ b/src/scan/indexer.cpp
@ -52,7 +52,8 @@ namespace din {
 			//file_size(0),
 			level(static_cast<uint16_t>(parSt.level)),
 			is_dir(parSt.is_dir),
-			is_symlink(parSt.is_symlink)
+			is_symlink(parSt.is_symlink),
+			unreadable(false)
 		{
 		}

@ -71,12 +72,13 @@ namespace din {
 		uint16_t level;
 		bool is_dir;
 		bool is_symlink;
+		bool unreadable;
 	};

 	namespace {
 		typedef std::vector<FileEntry>::iterator FileEntryIt;

-		void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback) {
+		void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback, bool parIgnoreErrors) {
 			assert(parEntry != parEnd);
 			assert(parEntry->is_dir);
 			FileEntry& curr_entry = *parEntry;
@ -105,7 +107,7 @@ namespace din {
 				while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
 					PathName curr_subdir(it_entry->path);
 					if (it_entry->is_dir) {
-						hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback);
+						hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback, parIgnoreErrors);

 						std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
 						const auto old_size = dir_blob.size();
@ -148,7 +150,19 @@ namespace din {
 					std::cout << "Hashing file " << it_entry->path << "...";
 #endif
 					parNextItemCallback(it_entry - parBegin);
+					try {
 						tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
+					}
+					catch (const std::ios_base::failure& e) {
+						if (parIgnoreErrors) {
+							it_entry->unreadable = true;
+							it_entry->hash = HashType {};
+						}
+						else {
+							throw e;
+						}
+					}
+
 #if defined(INDEXER_VERBOSE)
 					std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
 #endif
@ -178,6 +192,7 @@ namespace din {
 		std::condition_variable step_notify;
 #endif
 		std::size_t file_count;
+		bool ignore_read_errors;
 	};

 	bool FileEntry::operator< (const FileEntry& parOther) const {
@ -262,7 +277,8 @@ namespace din {
 				++m_local_data->done_count;
 				m_local_data->processing_index = parNext;
 				m_local_data->step_notify.notify_all();
-			}
+			},
+			m_local_data->ignore_read_errors
 		);

 		assert(m_local_data->done_count == m_local_data->file_count);
@ -272,7 +288,8 @@ namespace din {
 			m_local_data->paths.begin(),
 			m_local_data->paths.end(),
 			base_path,
-			[](std::size_t) {}
+			[](std::size_t) {},
+			m_local_data->ignore_read_errors
 		);
 #endif

@ -310,7 +327,9 @@ namespace din {
 				itm.level,
 				itm.file_size,
 				itm.is_dir,
-				itm.is_symlink
+				itm.is_symlink,
+				itm.unreadable,
+				not itm.unreadable
 			});
 		}

@ -382,4 +401,8 @@ namespace din {
 		std::advance(it, parIndex);
 		return make_relative_path(PathName(m_local_data->paths.front().path), PathName(it->path)).path();
 	}
+
+	void Indexer::ignore_read_errors (bool parIgnore) { //Set whether hashing should carry on when a file can't be read
+		m_local_data->ignore_read_errors = parIgnore; //Stored flag is later handed to hash_dir() as parIgnoreErrors
+	}
 } //namespace din
--- a/src/scan/indexer.hpp
+++ b/src/scan/indexer.hpp
@ -60,6 +60,7 @@ namespace din {
 		void calculate_hash ( void );
 		bool add_to_db ( const std::string& parSetName, char parType, bool parForce=false ) const;
 		bool empty ( void ) const;
+		void ignore_read_errors ( bool parIgnore );

 	private:
 		struct LocalData;
--- a/src/scan/main.cpp
+++ b/src/scan/main.cpp
@ -97,6 +97,7 @@ int main (int parArgc, char* parArgv[]) {
 	std::cout << "constructing...\n";

 	din::Indexer indexer(settings);
+	indexer.ignore_read_errors(vm.count("ignore-errors") > 0);
 	fastf::FileSearcher searcher(search_path);
 	fastf::FileSearcher::ConstCharVecType ext, ignore;
 	searcher.SetFollowSymlinks(true);
--- a/src/scan/tiger.cpp
+++ b/src/scan/tiger.cpp
@ -58,6 +58,7 @@ namespace din {
 		tiger_init_hash(parHashFile);

 		std::ifstream src(parPath, std::ios::binary);
+		src.exceptions(src.badbit); //Throw on read error
 		src.seekg(0, std::ios_base::end);
 		const auto file_size = src.tellg();
 		src.seekg(0, std::ios_base::beg);
@ -68,15 +69,19 @@ namespace din {
 		char* const buff_ptr = reinterpret_cast<char*>((reinterpret_cast<std::intptr_t>(buff.get()) + 63) & (-64));
 		assert(buff_ptr >= buff.get() and buff_ptr + buffsize <= buff.get() + 63 + buffsize);

+		//Take a copy of parHashDir and work on it - if hashing fails at some
+		//point, we need to leave the dir's hash untouched.
+		auto hash_dir = parHashDir;
+
 		//Use the initial value of the dir's hash as if it was part of the data to hash and start
 		//by processing that value. Hash is reset to the initial value before the call to tiger.
 		{
-			std::copy(parHashDir.byte_data, parHashDir.byte_data + sizeof(parHashDir), buff_ptr);
-			assert(hash_size >= static_cast<FileSizeType>(sizeof(parHashDir)));
-			std::fill(buff_ptr + sizeof(parHashDir), buff_ptr + hash_size, 0);
+			std::copy(hash_dir.byte_data, hash_dir.byte_data + sizeof(hash_dir), buff_ptr);
+			assert(hash_size >= static_cast<FileSizeType>(sizeof(hash_dir)));
+			std::fill(buff_ptr + sizeof(hash_dir), buff_ptr + hash_size, 0);
 			TigerHash dummy {};
-			tiger_init_hash(parHashDir);
-			tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, parHashDir.data);
+			tiger_init_hash(hash_dir);
+			tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, hash_dir.data);
 		}

 		auto remaining = file_size;
@ -86,7 +91,7 @@ namespace din {
 			assert(buffsize % 64 == 0);
 			remaining -= buffsize;
 			src.read(buff_ptr, buffsize);
-			tiger_sse2_chunk(buff_ptr, buff_ptr, buffsize, parHashFile.data, parHashDir.data);
+			tiger_sse2_chunk(buff_ptr, buff_ptr, buffsize, parHashFile.data, hash_dir.data);
 		}

 		{
@ -97,7 +102,7 @@ namespace din {
 			assert(aligned_size <= buffsize);
 			const char* read_from_buff = buff_ptr;
 			if (aligned_size) {
-				tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, parHashDir.data);
+				tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, hash_dir.data);
 				assert((remaining & 63) == remaining - aligned_size);
 				remaining -= aligned_size;
 				read_from_buff += aligned_size;
@ -105,10 +110,11 @@ namespace din {

 			//Remember to pass the augmented data size for the second reallength value: we passed the initial
 			//dir's hash value (64 bytes) as if they were part of the data.
-			tiger_sse2_last_chunk(read_from_buff, read_from_buff, remaining, file_size, file_size + hash_size, parHashFile.data, parHashDir.data, g_tiger_padding);
+			tiger_sse2_last_chunk(read_from_buff, read_from_buff, remaining, file_size, file_size + hash_size, parHashFile.data, hash_dir.data, g_tiger_padding);
 		}

 		parSizeOut = static_cast<uint64_t>(file_size);
+		parHashDir = hash_dir;
 	}

 	std::string tiger_to_string (const TigerHash& parHash, bool parUpcase) {