Maximillian von Briesen (mobyvb) - Atlanta - http://mobyvb.com - I make stuff

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount = 3+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				config.GracefulExit.MaxOrderLimitSendCount = maxOrderLimitSendCount+			},+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		nodeFullIDs := make(map[storj.NodeID]*identity.FullIdentity)+		for _, node := range planet.StorageNodes {+			nodeFullIDs[node.ID()] = node.Identity+		}++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: 4,+			MaxThreshold:     4,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 1)+		require.NoError(t, err)++		// connect to satellite so we initiate the exit.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)++		response, err := c.Recv()+		require.NoError(t, err)++		// should get a NotReady since the metainfo loop would not be finished at this point.+		switch response.GetMessage().(type) {+		case *pb.SatelliteMessage_NotReady:+			// now check that the exiting node is initiated.+			exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+			require.NoError(t, err)+			require.Len(t, exitingNodes, 1)++			require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)+		default:+			t.FailNow()+		}+		// close the old client+		require.NoError(t, c.CloseSend())++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		_, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0)+		require.NoError(t, err)++		var messageCount int++		// We need to label this outer loop so that we're able to exit it from the inner loop.+		// The outer loop is for sending the request from node to satellite multiple times.+		// The inner loop is for reading the response.+	MessageLoop:+		for messageCount <= maxOrderLimitSendCount {

What I am saying is that there are two potential ways to exit here. One is the loop condition messageCount > maxOrderLimitSendCount (which, now that I think about it, should never happen). The other is break MessageLoop. The test would be clearer if there were only one exit route, so we should pick one and get rid of the other. If you want to keep using MessageLoop, feel free, but in that case I think you should change the for loop to have no condition (for {).
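A rough sketch of the label-only option, reusing the names from this test (everything elided stays as it is now):

MessageLoop:
	for {
		var unknownMsgSent bool
		c, err := client.Process(ctx)
		require.NoError(t, err)

		for {
			response, err := c.Recv()
			if unknownMsgSent {
				require.Error(t, err)
				break
			}
			require.NoError(t, err)

			switch response.GetMessage().(type) {
			case *pb.SatelliteMessage_ExitCompleted:
				break MessageLoop // the only exit from the outer loop
			case *pb.SatelliteMessage_TransferPiece:
				messageCount++
				unknownMsgSent = true
				// ... send the unknown message and CloseSend, as above ...
			default:
				t.FailNow()
			}
		}
	}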

navillasa

comment created time in 7 hours

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount = 3+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				config.GracefulExit.MaxOrderLimitSendCount = maxOrderLimitSendCount+			},+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		nodeFullIDs := make(map[storj.NodeID]*identity.FullIdentity)+		for _, node := range planet.StorageNodes {+			nodeFullIDs[node.ID()] = node.Identity+		}++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: 4,+			MaxThreshold:     4,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 1)+		require.NoError(t, err)++		// connect to satellite so we initiate the exit.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)++		response, err := c.Recv()+		require.NoError(t, err)++		// should get a NotReady since the metainfo loop would not be finished at this point.+		switch response.GetMessage().(type) {+		case *pb.SatelliteMessage_NotReady:+			// now check that the exiting node is initiated.+			exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+			require.NoError(t, err)+			require.Len(t, exitingNodes, 1)++			require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)+		default:+			t.FailNow()+		}+		// close the old client+		require.NoError(t, c.CloseSend())++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		_, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0)+		require.NoError(t, err)++		var messageCount int++		// We need to label this outer loop so that we're able to exit it from the inner loop.+		// The outer loop is for sending the request from node to satellite multiple times.+		// The inner loop is for reading the response.+	MessageLoop:+		for messageCount <= maxOrderLimitSendCount {+			var unknownMsgSent bool+			c, err := client.Process(ctx)+			require.NoError(t, err)++			for {+				response, err := c.Recv()+				if unknownMsgSent {+					require.Error(t, err)+					break+				} else {+					require.NoError(t, err)+				}++				switch response.GetMessage().(type) {+				case *pb.SatelliteMessage_ExitCompleted:+					break MessageLoop+				case *pb.SatelliteMessage_TransferPiece:+					messageCount+++					unknownMsgSent = true+					// We send an unknown message because we want to fail the+					// transfer message request we get 
from the satellite.+					// This allows us to keep the conn open but repopulate+					// the pending queue.+					err = c.Send(&pb.StorageNodeMessage{})+					require.NoError(t, err)+					require.NoError(t, c.CloseSend())+				default:+					t.FailNow()+				}+			}+		}+

I think we should check that messageCount == maxOrderLimitSendCount somewhere
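For example, right after the loop (assuming messageCount only counts the ignored transfer messages):

	// the satellite should have resent the order limit exactly maxOrderLimitSendCount times
	require.Equal(t, maxOrderLimitSendCount, messageCount)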

navillasa

comment created time in 7 hours

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount = 3+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				config.GracefulExit.MaxOrderLimitSendCount = maxOrderLimitSendCount+			},+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		nodeFullIDs := make(map[storj.NodeID]*identity.FullIdentity)+		for _, node := range planet.StorageNodes {+			nodeFullIDs[node.ID()] = node.Identity+		}++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: 4,+			MaxThreshold:     4,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 1)+		require.NoError(t, err)++		// connect to satellite so we initiate the exit.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)++		response, err := c.Recv()+		require.NoError(t, err)++		// should get a NotReady since the metainfo loop would not be finished at this point.+		switch response.GetMessage().(type) {+		case *pb.SatelliteMessage_NotReady:+			// now check that the exiting node is initiated.+			exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+			require.NoError(t, err)+			require.Len(t, exitingNodes, 1)++			require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)+		default:+			t.FailNow()+		}+		// close the old client+		require.NoError(t, c.CloseSend())++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		_, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0)+		require.NoError(t, err)++		var messageCount int++		// We need to label this outer loop so that we're able to exit it from the inner loop.+		// The outer loop is for sending the request from node to satellite multiple times.+		// The inner loop is for reading the response.+	MessageLoop:+		for messageCount <= maxOrderLimitSendCount {

nit - I think we should exit the loop either via the messageCount <= maxOrderLimitSendCount condition (preferable to me), or change this to a for { with no condition and use the break MessageLoop on line 1310. I don't think we should have both, because then it is not clear what causes the exit. But it's not a big deal, so don't worry about it too much.
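A thin sketch of the condition-only variant (no label; the ExitCompleted case would just break the inner loop):

	// exit is driven solely by the loop condition
	for messageCount <= maxOrderLimitSendCount {
		// ... open the Process stream and run the inner Recv loop as above,
		// with a plain break (no label) in the ExitCompleted case ...
	}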

navillasa

comment created time in 7 hours

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount int+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				maxOrderLimitSendCount = config.GracefulExit.MaxOrderLimitSendCount+			},+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		nodeFullIDs := make(map[storj.NodeID]*identity.FullIdentity)+		for _, node := range planet.StorageNodes {+			nodeFullIDs[node.ID()] = node.Identity+		}++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: 4,+			MaxThreshold:     4,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 1)+		require.NoError(t, err)++		// connect to satellite so we initiate the exit.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)++		response, err := c.Recv()+		require.NoError(t, err)++		// should get a NotReady since the metainfo loop would not be finished at this point.+		switch response.GetMessage().(type) {+		case *pb.SatelliteMessage_NotReady:+			// now check that the exiting node is initiated.+			exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+			require.NoError(t, err)+			require.Len(t, exitingNodes, 1)++			require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)+		default:+			t.FailNow()+		}+		// close the old client+		require.NoError(t, c.CloseSend())++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		_, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0)+		require.NoError(t, err)++		var notRespondingPiece storj.PieceID+		var messageCount int++		// We need to label this outer loop so that we're able to exit it from the inner loop.+		// The outer loop is for sending the request from node to satellite multiple times.+		// The inner loop is for reading the response.+	MessageLoop:+		for messageCount <= maxOrderLimitSendCount {+			var unknownMsgSent bool+			c, err := client.Process(ctx)+			require.NoError(t, err)++			for {+				response, err := c.Recv()+				if unknownMsgSent {+					require.Error(t, err)+					break+				} else {+					require.NoError(t, err)+				}++				switch m := response.GetMessage().(type) {+				case *pb.SatelliteMessage_ExitCompleted:+					break MessageLoop+				case *pb.SatelliteMessage_TransferPiece:+					if notRespondingPiece.IsZero() {+						notRespondingPiece = m.TransferPiece.OriginalPieceId+					}+					
if m.TransferPiece.OriginalPieceId == notRespondingPiece {+						messageCount+++						unknownMsgSent = true+						// We send an unknown message because we want to fail the+						// transfer message request we get from the satellite.+						// This allows us to keep the conn open but repopulate+						// the pending queue.+						err = c.Send(&pb.StorageNodeMessage{})+						require.NoError(t, err)+						require.NoError(t, c.CloseSend())+					} else {+						pieceReader, err := exitingNode.Storage2.Store.Reader(ctx, satellite.ID(), m.TransferPiece.OriginalPieceId)+						require.NoError(t, err)++						header, err := pieceReader.GetPieceHeader()+						require.NoError(t, err)++						orderLimit := header.OrderLimit+						originalPieceHash := &pb.PieceHash{+							PieceId:   orderLimit.PieceId,+							Hash:      header.GetHash(),+							PieceSize: pieceReader.Size(),+							Timestamp: header.GetCreationTime(),+							Signature: header.GetSignature(),+						}++						newPieceHash := &pb.PieceHash{+							PieceId:   m.TransferPiece.AddressedOrderLimit.Limit.PieceId,+							Hash:      originalPieceHash.Hash,+							PieceSize: originalPieceHash.PieceSize,+							Timestamp: time.Now(),+						}++						receivingNodeID := nodeFullIDs[m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId]+						require.NotNil(t, receivingNodeID)+						signer := signing.SignerFromFullIdentity(receivingNodeID)++						signedNewPieceHash, err := signing.SignPieceHash(ctx, signer, newPieceHash)+						require.NoError(t, err)++						success := &pb.StorageNodeMessage{+							Message: &pb.StorageNodeMessage_Succeeded{+								Succeeded: &pb.TransferSucceeded{+									OriginalPieceId:      m.TransferPiece.OriginalPieceId,+									OriginalPieceHash:    originalPieceHash,+									OriginalOrderLimit:   &orderLimit,+									ReplacementPieceHash: signedNewPieceHash,+								},+							},+						}+						err = c.Send(success)+						require.NoError(t, err)+					}++				default:+					t.FailNow()+				}+			}+		}++		// make sure not responding piece not in queue+		incompletes, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)+		require.NoError(t, err)++		for _, inc := range incompletes {

you can just check that len(incompletes) is 0
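i.e. something like (sketch):

	incompletes, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)
	require.NoError(t, err)
	require.Len(t, incompletes, 0)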

navillasa

comment created time in 10 hours

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount int+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				maxOrderLimitSendCount = config.GracefulExit.MaxOrderLimitSendCount+			},+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		nodeFullIDs := make(map[storj.NodeID]*identity.FullIdentity)+		for _, node := range planet.StorageNodes {+			nodeFullIDs[node.ID()] = node.Identity+		}++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: 4,+			MaxThreshold:     4,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 1)+		require.NoError(t, err)++		// connect to satellite so we initiate the exit.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)++		response, err := c.Recv()+		require.NoError(t, err)++		// should get a NotReady since the metainfo loop would not be finished at this point.+		switch response.GetMessage().(type) {+		case *pb.SatelliteMessage_NotReady:+			// now check that the exiting node is initiated.+			exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+			require.NoError(t, err)+			require.Len(t, exitingNodes, 1)++			require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)+		default:+			t.FailNow()+		}+		// close the old client+		require.NoError(t, c.CloseSend())++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		_, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0)+		require.NoError(t, err)++		var notRespondingPiece storj.PieceID+		var messageCount int++		// We need to label this outer loop so that we're able to exit it from the inner loop.+		// The outer loop is for sending the request from node to satellite multiple times.+		// The inner loop is for reading the response.+	MessageLoop:+		for messageCount <= maxOrderLimitSendCount {+			var unknownMsgSent bool+			c, err := client.Process(ctx)+			require.NoError(t, err)++			for {+				response, err := c.Recv()+				if unknownMsgSent {+					require.Error(t, err)+					break+				} else {+					require.NoError(t, err)+				}++				switch m := response.GetMessage().(type) {+				case *pb.SatelliteMessage_ExitCompleted:+					break MessageLoop+				case *pb.SatelliteMessage_TransferPiece:+					if notRespondingPiece.IsZero() {+						notRespondingPiece = m.TransferPiece.OriginalPieceId+					}+					
if m.TransferPiece.OriginalPieceId == notRespondingPiece {+						messageCount+++						unknownMsgSent = true+						// We send an unknown message because we want to fail the+						// transfer message request we get from the satellite.+						// This allows us to keep the conn open but repopulate+						// the pending queue.+						err = c.Send(&pb.StorageNodeMessage{})+						require.NoError(t, err)+						require.NoError(t, c.CloseSend())+					} else {

else case not necessary since there are no pieces to successfully transfer in this test

navillasa

comment created time in 10 hours

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount int+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				maxOrderLimitSendCount = config.GracefulExit.MaxOrderLimitSendCount

I think we should verify that maxOrderLimitSendCount is > 1 or something. Either that, or manually configure it ourselves here.
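For example (a sketch; either an assertion in the test body or pinning the value in the Reconfigure func would work, and 3 is just an arbitrary choice):

	// option 1: in the test body, sanity-check that the default is meaningful for this test
	require.True(t, maxOrderLimitSendCount > 1)

	// option 2: in the Reconfigure func, pin the value so the test doesn't depend on the default
	config.GracefulExit.MaxOrderLimitSendCount = 3
	maxOrderLimitSendCount = config.GracefulExit.MaxOrderLimitSendCount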

navillasa

comment created time in 10 hours

Pull request review comment storj/storj

satellite/gracefulexit: add count for order limits sent from satellite to exiting node

 func TestFailureNotFoundPieceHashUnverified(t *testing.T) {  } +func TestFailureStorageNodeIgnoresTransferMessages(t *testing.T) {+	var maxOrderLimitSendCount int+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 5,+		UplinkCount:      1,+		Reconfigure: testplanet.Reconfigure{+			Satellite: func(logger *zap.Logger, index int, config *satellite.Config) {+				// We don't care whether a node gracefully exits or not in this test,+				// so we set the max failures percentage extra high.+				config.GracefulExit.OverallMaxFailuresPercentage = 101+				maxOrderLimitSendCount = config.GracefulExit.MaxOrderLimitSendCount+			},+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		nodeFullIDs := make(map[storj.NodeID]*identity.FullIdentity)+		for _, node := range planet.StorageNodes {+			nodeFullIDs[node.ID()] = node.Identity+		}++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: 4,+			MaxThreshold:     4,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 1)+		require.NoError(t, err)++		// connect to satellite so we initiate the exit.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)++		response, err := c.Recv()+		require.NoError(t, err)++		// should get a NotReady since the metainfo loop would not be finished at this point.+		switch response.GetMessage().(type) {+		case *pb.SatelliteMessage_NotReady:+			// now check that the exiting node is initiated.+			exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+			require.NoError(t, err)+			require.Len(t, exitingNodes, 1)++			require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)+		default:+			t.FailNow()+		}+		// close the old client+		require.NoError(t, c.CloseSend())++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		_, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0)+		require.NoError(t, err)++		var notRespondingPiece storj.PieceID+		var messageCount int++		// We need to label this outer loop so that we're able to exit it from the inner loop.+		// The outer loop is for sending the request from node to satellite multiple times.+		// The inner loop is for reading the response.+	MessageLoop:+		for messageCount <= maxOrderLimitSendCount {+			var unknownMsgSent bool+			c, err := client.Process(ctx)+			require.NoError(t, err)++			for {+				response, err := c.Recv()+				if unknownMsgSent {+					require.Error(t, err)+					break+				} else {+					require.NoError(t, err)+				}++				switch m := response.GetMessage().(type) {+				case *pb.SatelliteMessage_ExitCompleted:+					break MessageLoop+				case *pb.SatelliteMessage_TransferPiece:+					if notRespondingPiece.IsZero() {

There is only one piece so I don't know if this is necessary

navillasa

comment created time in 10 hours

push event storj/storj

Ethan Adams

commit sha f3dccb56b1b30320b60383489e3402d0df61dad4

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message. (#3481)

view details

Yingrong Zhao

commit sha 6331f839aee7c2d5b895abb0c578f15781ccd6b6

satellite/gracefulexit: not allow disqualified node to graceful exit (#3493)

view details

Nikolay Yurchenko

commit sha 7ef0bbe00dc4e9312e9890c450517f5533db6e6b

credit cards icons selection added (#3527)

view details

Nikolay Yurchenko

commit sha 9ce6dad3172f56c07912a8877ec3e90e48113efb

web/satellite: usage report date selected date range formatted (#3518)

view details

Nikolay Yurchenko

commit sha 06a52e75b8fc6e966ad76a8dc533441e38f267e6

web/satellite: name hint (#3515)

view details

Yingrong Zhao

commit sha 69b0ae02bfb8ef5fe1aecb80ad8cbe0af98fa415

satellite/gracefulexit: separate functional code in endpoint (#3476)

view details

Kaloyan Raev

commit sha 9dce3dc942341be7e908f220a1601f78cbefa47a

installer/windows: unit tests for C# custom actions (part 1) (#3521)

view details

Kaloyan Raev

commit sha 20623fdc969afa4e5e9bc857227f24815b8be893

Increase min required difficulty to 36 in signing service (#3535)

view details

Nikolay Yurchenko

commit sha e065ad00169ae8132084dae939ad13b3aef2fdc6

detailed usage charges info markup added (#3528)

view details

Nikolay Yurchenko

commit sha 20eef5a20ac3c2381993b9166dd0f8678df1a669

sorting header on api keys page styles fixed (#3537)

view details

Egon Elbre

commit sha f3e803203bdcd9880661a2746cd9034463c464c4

lib/uplinkc: add clarifying comments to download_read (#3525)

view details

Egon Elbre

commit sha 4b85d3d73967c3093dd9adf785ef8e11c1c9785f

internal/testplanet: better error message when postgres is not defined (#3539)

view details

Ivan Fraixedes

commit sha 6516471cbceeb47fc7c3e5884733994fedfb18e7

uplink/storage/streams: Upload loop operations reorganization (#3429) * uplink/storage/streams: Upload loop ops reorganization Reorganize the operations of the loop run by streamsStore.upload method for not doing unneeded computations on each iteration. * uplink/storage/streams: Move out returns values declaration Move out return values declarations for those which aren't strictly needed due to defer statements nor documentation purpose.

view details

Michal Niewrzal

commit sha 89efd17f4db4c0c7c404734460adea98b7e29a90

docs/design: zombie segments cleaner (#3461)

view details

Yaroslav Vorobiov

commit sha 36311a3a05b96e83facfe438a479e3756b05db64

satellite/console: add token deposit API, populate billing history with transactions (#3500)

view details

Bryan White

commit sha 7355065dc9b3cbe6f31db6d372891862a97d22a8

pkg/{cfgstruct,identity}: replace seperator in default values when path tag set (#3498)

view details

Nikolai Siedov

commit sha c1ae8c332ffa7b0aacfad4746c087261c7b01dde

satellite/console: auth API error handling refactored (#3540)

view details

Vitalii Shpital

commit sha e99af1822934cb8fbe3e24b72e4f3c8081517cc2

web/storagenode: date and data formating fixed (#3519)

view details

Vitalii Shpital

commit sha b7a04eb8811d58ba236e20cb611331fd69493020

web/satellite: saving selected project in local storage implemented (#3470)

view details

Bryan White

commit sha 7cc4217fef8311dc7674cb8ca5e818493a65a07b

cmd/storagenode-updater: simplify and reorder update sqeuence (#3487)

view details

push time in 16 hours

push event storj/storj

Nikolay Yurchenko

commit sha 7ef0bbe00dc4e9312e9890c450517f5533db6e6b

credit cards icons selection added (#3527)

view details

Nikolay Yurchenko

commit sha 9ce6dad3172f56c07912a8877ec3e90e48113efb

web/satellite: usage report date selected date range formatted (#3518)

view details

Nikolay Yurchenko

commit sha 06a52e75b8fc6e966ad76a8dc533441e38f267e6

web/satellite: name hint (#3515)

view details

Maximillian von Briesen

commit sha 915224d8b1e3e12b853f81dfd0611b25ba9e4713

Merge branch 'master' into green/separate-functional-code-ge

view details

push time in 4 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) handleSucceeded(ctx context.Context, stream processStr 		return Error.New("Could not find transfer item in pending queue") 	} -	if transfer.satelliteMessage == nil {-		return Error.New("Satellite message cannot be nil")-	}-	if transfer.satelliteMessage.GetTransferPiece() == nil {-		return Error.New("Satellite message transfer piece cannot be nil")-	}-	if transfer.satelliteMessage.GetTransferPiece().GetAddressedOrderLimit() == nil {-		return Error.New("Addressed order limit on transfer piece cannot be nil")-	}-	if transfer.satelliteMessage.GetTransferPiece().GetAddressedOrderLimit().GetLimit() == nil {-		return Error.New("Addressed order limit on transfer piece cannot be nil")-	}-	if transfer.path == nil {-		return Error.New("Transfer path cannot be nil")-	}--	originalOrderLimit := message.Succeeded.GetOriginalOrderLimit()-	if originalOrderLimit == nil {-		return ErrInvalidArgument.New("Original order limit cannot be nil")-	}-	originalPieceHash := message.Succeeded.GetOriginalPieceHash()-	if originalPieceHash == nil {-		return ErrInvalidArgument.New("Original piece hash cannot be nil")-	}-	replacementPieceHash := message.Succeeded.GetReplacementPieceHash()-	if replacementPieceHash == nil {-		return ErrInvalidArgument.New("Replacement piece hash cannot be nil")-	}--	// verify that the original piece hash and replacement piece hash match-	if !bytes.Equal(originalPieceHash.Hash, replacementPieceHash.Hash) {-		return ErrInvalidArgument.New("Piece hashes for transferred piece don't match")-	}--	// verify that the satellite signed the original order limit-	err = endpoint.orders.VerifyOrderLimitSignature(ctx, originalOrderLimit)+	err = endpoint.validatePendingTransfer(ctx, transfer) 	if err != nil {-		return ErrInvalidArgument.Wrap(err)-	}--	// verify that the public key on the order limit signed the original piece hash-	err = signing.VerifyUplinkPieceHashSignature(ctx, originalOrderLimit.UplinkPublicKey, originalPieceHash)-	if err != nil {-		return ErrInvalidArgument.Wrap(err)-	}--	if originalOrderLimit.PieceId != message.Succeeded.OriginalPieceId {-		return ErrInvalidArgument.New("Invalid original piece ID")+		return Error.Wrap(err) 	}  	receivingNodeID := transfer.satelliteMessage.GetTransferPiece().GetAddressedOrderLimit().GetLimit().StorageNodeId-	if transfer.originalPointer == nil || transfer.originalPointer.GetRemote() == nil {-		return Error.New("could not get remote pointer from transfer item")-	}-	calculatedNewPieceID := transfer.originalPointer.GetRemote().RootPieceId.Derive(receivingNodeID, transfer.pieceNum)-	if calculatedNewPieceID != replacementPieceHash.PieceId {-		return ErrInvalidArgument.New("Invalid replacement piece ID")-	}- 	// get peerID and signee for new storage node 	peerID, err := endpoint.peerIdentities.Get(ctx, receivingNodeID) 	if err != nil { 		return Error.Wrap(err) 	}-	signee := signing.SigneeFromPeerIdentity(peerID)--	// verify that the new node signed the replacement piece hash-	err = signing.VerifyPieceHashSignature(ctx, signee, replacementPieceHash)+	// verify transferred piece+	err = endpoint.verifyPieceTransferred(ctx, message, transfer, peerID)

nit - peerID can now be acquired from inside verifyPieceTransferred, since it is a method on the endpoint. Lines 543-548 should be able to move inside it.
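Roughly (a sketch, assuming verifyPieceTransferred keeps its other parameters as they are now):

	func (endpoint *Endpoint) verifyPieceTransferred(ctx context.Context, message *pb.StorageNodeMessage_Succeeded, transfer *pendingTransfer) error {
		receivingNodeID := transfer.satelliteMessage.GetTransferPiece().GetAddressedOrderLimit().GetLimit().StorageNodeId

		// get peerID and signee for the new storage node here instead of in handleSucceeded
		peerID, err := endpoint.peerIdentities.Get(ctx, receivingNodeID)
		if err != nil {
			return Error.Wrap(err)
		}

		// ... existing verification logic, using peerID ...
		return nil
	}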

VinozzZ

comment created time in 5 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb 	return nil } -func (endpoint *Endpoint) handleDisqualifiedNode(ctx context.Context, nodeID storj.NodeID) (isDisqualified bool, err error) {-	// check if node is disqualified-	nodeInfo, err := endpoint.overlay.Get(ctx, nodeID)+// checkExitStatus returns a satellite message based on a node current graceful exit status+// if a node hasn't started graceful exit, it will initialize the process+// if a node has finished graceful exit, it will return a finished message+// if a node has started graceful exit, but no transfer item is available yet, it will return an not ready message+// otherwise, the returned message will be nil+func (endpoint *Endpoint) checkExitStatus(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {+	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID) 	if err != nil {-		return false, Error.Wrap(err)+		return nil, Error.Wrap(err) 	} -	if nodeInfo.Disqualified != nil {-		// update graceful exit status to be failed-		exitStatusRequest := &overlay.ExitStatusRequest{-			NodeID:         nodeID,-			ExitFinishedAt: time.Now().UTC(),-			ExitSuccess:    false,-		}+	if exitStatus.ExitFinishedAt != nil {+		// TODO maybe we should store the reason in the DB so we know how it originally failed.+		return endpoint.getFinishedMessage(ctx, nodeID, *exitStatus.ExitFinishedAt, exitStatus.ExitSuccess, -1)+	} -		_, err = endpoint.overlaydb.UpdateExitStatus(ctx, exitStatusRequest)+	if exitStatus.ExitInitiatedAt == nil {+		request := &overlay.ExitStatusRequest{NodeID: nodeID, ExitInitiatedAt: time.Now().UTC()}+		node, err := endpoint.overlaydb.UpdateExitStatus(ctx, request) 		if err != nil {-			return true, Error.Wrap(err)+			return nil, Error.Wrap(err) 		}--		// remove remaining items from the queue-		err = endpoint.db.DeleteTransferQueueItems(ctx, nodeID)+		err = endpoint.db.IncrementProgress(ctx, nodeID, 0, 0, 0) 		if err != nil {-			return true, Error.Wrap(err)+			return nil, Error.Wrap(err) 		} -		return true, nil+		// graceful exit initiation metrics+		age := time.Now().UTC().Sub(node.CreatedAt.UTC())+		mon.FloatVal("graceful_exit_init_node_age_seconds").Observe(age.Seconds())                           //locked+		mon.IntVal("graceful_exit_init_node_audit_success_count").Observe(node.Reputation.AuditSuccessCount) //locked+		mon.IntVal("graceful_exit_init_node_audit_total_count").Observe(node.Reputation.AuditCount)          //locked+		mon.IntVal("graceful_exit_init_node_piece_count").Observe(node.PieceCount)                           //locked++		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil 	} -	return false, nil+	if exitStatus.ExitLoopCompletedAt == nil {+		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	return nil, nil+}++func (endpoint *Endpoint) generateExitStatusRequest(ctx context.Context, nodeID storj.NodeID) (*overlay.ExitStatusRequest, pb.ExitFailed_Reason, error) {+	var exitFailedReason pb.ExitFailed_Reason = -1+	progress, err := endpoint.db.GetProgress(ctx, nodeID)+	if err != nil {+		return nil, exitFailedReason, rpcstatus.Error(rpcstatus.Internal, err.Error())+	}++	mon.IntVal("graceful_exit_final_pieces_failed").Observe(progress.PiecesFailed)         //locked+	mon.IntVal("graceful_exit_final_pieces_succeess").Observe(progress.PiecesTransferred)  //locked+	mon.IntVal("graceful_exit_final_bytes_transferred").Observe(progress.BytesTransferred) //locked+	processed 
:= progress.PiecesFailed + progress.PiecesTransferred++	if processed > 0 {+		mon.IntVal("graceful_exit_successful_pieces_transfer_ratio").Observe(progress.PiecesTransferred / processed) //locked+	}++	exitStatusRequest := &overlay.ExitStatusRequest{+		NodeID:         progress.NodeID,+		ExitFinishedAt: time.Now().UTC(),+	}+	// check node's exiting progress to see if it has failed passed max failure threshold+	if processed > 0 && float64(progress.PiecesFailed)/float64(processed)*100 >= float64(endpoint.config.OverallMaxFailuresPercentage) {+		exitStatusRequest.ExitSuccess = false+		exitFailedReason = pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED+	} else {+		exitStatusRequest.ExitSuccess = true+	}++	if exitStatusRequest.ExitSuccess {+		mon.Meter("graceful_exit_success").Mark(1) //locked+	} else {+		mon.Meter("graceful_exit_fail_max_failures_percentage").Mark(1) //locked+	}++	return exitStatusRequest, exitFailedReason, nil++}++func (endpoint *Endpoint) calculatePieceSize(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem, nodePiece *pb.RemotePiece) (int64, error) {+	nodeID := incomplete.NodeID++	// calculate piece size+	redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())+	if err != nil {+		return 0, Error.Wrap(err)+	}++	pieces := pointer.GetRemote().GetRemotePieces()+	if len(pieces) > redundancy.OptimalThreshold() {+		endpoint.log.Debug("pointer has more pieces than required. removing node from pointer.", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))

nit - maybe we can move this log statement to where the node is actually removed from the pointer

VinozzZ

comment created time in 5 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) getValidPointer(ctx context.Context, path string, piec 	} 	return pointer, nil }++func (endpoint *Endpoint) getNodePiece(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) (*pb.RemotePiece, error) {

nit - we could add a comment clarifying that pointer is expected to be remote when this is called. Alternatively, we could make the *pb.Pointer arg a *pb.RemoteSegment arg. Not super important, just a thought.
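e.g. the alternative signature would look something like (just a sketch):

	// getNodePiece assumes the segment is remote; taking *pb.RemoteSegment makes that explicit.
	func (endpoint *Endpoint) getNodePiece(ctx context.Context, remote *pb.RemoteSegment, incomplete *TransferQueueItem) (*pb.RemotePiece, error)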

VinozzZ

comment created time in 5 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) processIncomplete(ctx context.Context, stream processS 		if err != nil { 			return Error.Wrap(err) 		}-	}-	remote := pointer.GetRemote()-	pieces := remote.GetRemotePieces()--	var nodePiece *pb.RemotePiece-	excludedNodeIDs := make([]storj.NodeID, len(pieces))-	for i, piece := range pieces {-		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {-			nodePiece = piece-		}-		excludedNodeIDs[i] = piece.NodeId-	}--	if nodePiece == nil {-		endpoint.log.Debug("piece no longer held by node", zap.Stringer("Node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))--		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)-		if err != nil {-			return Error.Wrap(err)-		}  		return nil 	} -	redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())+	nodePiece, err := endpoint.getNodePiece(ctx, pointer, incomplete)

Do you think it would be worth combining getNodePiece and getValidPointer in the future? We don't have to do it here, but it seems to me like it might make sense to combine them.

VinozzZ

comment created time in 5 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb 	return nil } -func (endpoint *Endpoint) handleDisqualifiedNode(ctx context.Context, nodeID storj.NodeID) (isDisqualified bool, err error) {-	// check if node is disqualified-	nodeInfo, err := endpoint.overlay.Get(ctx, nodeID)+// checkExitStatus returns a satellite message based on a node current graceful exit status+// if a node hasn't started graceful exit, it will initialize the process

nit - ... and return a not ready message

VinozzZ

comment created time in 5 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

+// Copyright (C) 2019 Storj Labs, Inc.+// See LICENSE for copying information.++package gracefulexit++import (+	"bytes"+	"context"++	"storj.io/storj/pkg/identity"+	"storj.io/storj/pkg/pb"+	"storj.io/storj/pkg/signing"+	"storj.io/storj/uplink/eestream"+)++func validatePointer(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) (*pb.RemotePiece, error) {+	remote := pointer.GetRemote()+	nodeID := incomplete.NodeID++	pieces := remote.GetRemotePieces()+	var nodePiece *pb.RemotePiece+	for _, piece := range pieces {+		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {+			nodePiece = piece+		}+	}++	if nodePiece == nil {+		return nil, Error.New("piece no longer held by node")+	}++	return nodePiece, nil+}++func validateRedundancyThreshold(ctx context.Context, pointer *pb.Pointer, redundancy eestream.RedundancyStrategy) bool {+	pieces := pointer.GetRemote().GetRemotePieces()+	if len(pieces) > redundancy.OptimalThreshold() {+		return false+	}++	return true+}++func validatePendingTransfer(ctx context.Context, transfer *pendingTransfer) error {+	if transfer.satelliteMessage == nil {+		return Error.New("Satellite message cannot be nil")+	}+	if transfer.satelliteMessage.GetTransferPiece() == nil {+		return Error.New("Satellite message transfer piece cannot be nil")+	}+	if transfer.satelliteMessage.GetTransferPiece().GetAddressedOrderLimit() == nil {+		return Error.New("Addressed order limit on transfer piece cannot be nil")+	}+	if transfer.satelliteMessage.GetTransferPiece().GetAddressedOrderLimit().GetLimit() == nil {+		return Error.New("Addressed order limit on transfer piece cannot be nil")+	}+	if transfer.path == nil {+		return Error.New("Transfer path cannot be nil")+	}+	if transfer.originalPointer == nil || transfer.originalPointer.GetRemote() == nil {+		return Error.New("could not get remote pointer from transfer item")+	}++	return nil+}++func verifyPieceTransferred(ctx context.Context, message *pb.StorageNodeMessage_Succeeded, transfer *pendingTransfer, satellite signing.Signer, nodeID *identity.PeerIdentity) error {

nit - change nodeID *identity.PeerIdentity to receivingID *identity.PeerIdentity

VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) processIncomplete(ctx context.Context, stream processS 	if err != nil { 		return Error.Wrap(err) 	}-	remote := pointer.GetRemote()--	pieces := remote.GetRemotePieces()-	var nodePiece *pb.RemotePiece-	excludedNodeIDs := make([]storj.NodeID, len(pieces))-	for i, piece := range pieces {-		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {-			nodePiece = piece-		}-		excludedNodeIDs[i] = piece.NodeId-	}--	if nodePiece == nil {-		endpoint.log.Debug("piece no longer held by node", zap.Stringer("Node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum)) -		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+	// validate pointer state+	nodePiece, err := validatePointer(ctx, pointer, incomplete)+	if err != nil {+		endpoint.log.Debug("piece no longer held by node", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))+		err := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum) 		if err != nil { 			return Error.Wrap(err) 		}+		return Error.Wrap(err)+	} +	pieceSize, err := endpoint.calculatePieceSize(ctx, pointer, incomplete, nodePiece)+	if ErrAboveOptimalThreshold.Has(err) { 		return nil 	}--	redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy()) 	if err != nil { 		return Error.Wrap(err) 	} -	if len(remote.GetRemotePieces()) > redundancy.OptimalThreshold() {-		endpoint.log.Debug("pointer has more pieces than required. removing node from pointer.", zap.Stringer("Node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))--		_, err = endpoint.metainfo.UpdatePieces(ctx, string(incomplete.Path), pointer, nil, []*pb.RemotePiece{nodePiece})-		if err != nil {-			return Error.Wrap(err)-		}--		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)-		if err != nil {-			return Error.Wrap(err)-		}--		return nil+	// populate excluded node IDs+	pieces := pointer.GetRemote().RemotePieces+	excludedNodeIDs := make([]storj.NodeID, len(pieces))+	for i, piece := range pieces {+		excludedNodeIDs[i] = piece.NodeId 	} -	pieceSize := eestream.CalcPieceSize(pointer.GetSegmentSize(), redundancy)--	request := overlay.FindStorageNodesRequest{+	// get replacement node+	request := &overlay.FindStorageNodesRequest{ 		RequestedCount: 1, 		FreeBandwidth:  pieceSize, 		FreeDisk:       pieceSize, 		ExcludedNodes:  excludedNodeIDs, 	} -	newNodes, err := endpoint.overlay.FindStorageNodes(ctx, request)+	newNodes, err := endpoint.overlay.FindStorageNodes(ctx, *request) 	if err != nil { 		return Error.Wrap(err) 	}  	if len(newNodes) == 0 { 		return Error.New("could not find a node to receive piece transfer: node ID %v, path %v, piece num %v", nodeID, incomplete.Path, incomplete.PieceNum) 	}+ 	newNode := newNodes[0] 	endpoint.log.Debug("found new node for piece transfer", zap.Stringer("original node ID", nodeID), zap.Stringer("replacement node ID", newNode.Id), 		zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum)) +	remote := pointer.GetRemote()

maybe you can move this up, before the pointer.GetRemote().RemotePieces call above. That way you can have:

remote := pointer.GetRemote()
pieces := remote.RemotePieces
VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

+// Copyright (C) 2019 Storj Labs, Inc.+// See LICENSE for copying information.++package gracefulexit++import (+	"bytes"+	"context"++	"storj.io/storj/pkg/identity"+	"storj.io/storj/pkg/pb"+	"storj.io/storj/pkg/signing"+	"storj.io/storj/uplink/eestream"+)++func validatePointer(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) (*pb.RemotePiece, error) {+	remote := pointer.GetRemote()+	nodeID := incomplete.NodeID++	pieces := remote.GetRemotePieces()+	var nodePiece *pb.RemotePiece+	for _, piece := range pieces {+		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {+			nodePiece = piece+		}+	}++	if nodePiece == nil {+		return nil, Error.New("piece no longer held by node")+	}++	return nodePiece, nil+}++func validateRedundancyThreshold(ctx context.Context, pointer *pb.Pointer, redundancy eestream.RedundancyStrategy) bool {

Does this need to be a separate function? I think it would be fine to have this logic in calculatePieceSize
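i.e. the check could live directly in calculatePieceSize, something like (a sketch, assuming ErrAboveOptimalThreshold is the errs class used elsewhere in this PR):

	pieces := pointer.GetRemote().GetRemotePieces()
	if len(pieces) > redundancy.OptimalThreshold() {
		return 0, ErrAboveOptimalThreshold.New("pointer has more pieces than required")
	}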

VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) processIncomplete(ctx context.Context, stream processS 	if err != nil { 		return Error.Wrap(err) 	}-	remote := pointer.GetRemote()--	pieces := remote.GetRemotePieces()-	var nodePiece *pb.RemotePiece-	excludedNodeIDs := make([]storj.NodeID, len(pieces))-	for i, piece := range pieces {-		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {-			nodePiece = piece-		}-		excludedNodeIDs[i] = piece.NodeId-	}--	if nodePiece == nil {-		endpoint.log.Debug("piece no longer held by node", zap.Stringer("Node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum)) -		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+	// validate pointer state+	nodePiece, err := validatePointer(ctx, pointer, incomplete)+	if err != nil {+		endpoint.log.Debug("piece no longer held by node", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))+		err := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)

This error should be named something different so it is not confused with the err from validatePointer
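e.g. (sketch):

	if delErr := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum); delErr != nil {
		return Error.Wrap(delErr)
	}
	return Error.Wrap(err)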

VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) handleFailed(ctx context.Context, pending *pendingMap, 	return nil } +func (endpoint *Endpoint) handleFinished(ctx context.Context, stream processStream, exitStatusRequest *overlay.ExitStatusRequest, failedReason pb.ExitFailed_Reason) error {+	finishedMsg, err := endpoint.getFinishedMessage(ctx, endpoint.signer, exitStatusRequest.NodeID, exitStatusRequest.ExitFinishedAt, exitStatusRequest.ExitSuccess, failedReason)+	if err != nil {+		return Error.Wrap(err)+	}++	_, err = endpoint.overlaydb.UpdateExitStatus(ctx, exitStatusRequest)+	if err != nil {+		return Error.Wrap(err)+	}++	err = stream.Send(finishedMsg)+	if err != nil {+		return Error.Wrap(err)+	}++	// remove remaining items from the queue after notifying nodes about their exit status+	err = endpoint.db.DeleteTransferQueueItems(ctx, exitStatusRequest.NodeID)+	if err != nil {+		return Error.Wrap(err)+	}++	return nil+}+ func (endpoint *Endpoint) getFinishedMessage(ctx context.Context, signer signing.Signer, nodeID storj.NodeID, finishedAt time.Time, success bool, reason pb.ExitFailed_Reason) (message *pb.SatelliteMessage, err error) {

nit - we should have caught this before, but since it is not related to this PR directly, feel free to ignore. getFinishedMessage does not need to take signer signing.Signer as an argument because it can be accessed via endpoint.signer.
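i.e. the signature could shrink to something like (sketch):

	func (endpoint *Endpoint) getFinishedMessage(ctx context.Context, nodeID storj.NodeID, finishedAt time.Time, success bool, reason pb.ExitFailed_Reason) (message *pb.SatelliteMessage, err error) {
		// ... same body, using endpoint.signer instead of a signer argument ...
	}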

VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) processIncomplete(ctx context.Context, stream processS 	if err != nil { 		return Error.Wrap(err) 	}-	remote := pointer.GetRemote()--	pieces := remote.GetRemotePieces()-	var nodePiece *pb.RemotePiece-	excludedNodeIDs := make([]storj.NodeID, len(pieces))-	for i, piece := range pieces {-		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {-			nodePiece = piece-		}-		excludedNodeIDs[i] = piece.NodeId-	}--	if nodePiece == nil {-		endpoint.log.Debug("piece no longer held by node", zap.Stringer("Node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum)) -		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+	// validate pointer state+	nodePiece, err := validatePointer(ctx, pointer, incomplete)+	if err != nil {+		endpoint.log.Debug("piece no longer held by node", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))

Would it be better for this log to be inside validatePointer since that is where the equivalent error is returned from?

VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		if !morePiecesFlag && pendingCount == 0 { 			processMu.Unlock() -			exitStatusRequest := &overlay.ExitStatusRequest{-				NodeID:         nodeID,-				ExitFinishedAt: time.Now().UTC(),-			}- 			progress, err := endpoint.db.GetProgress(ctx, nodeID) 			if err != nil { 				return rpcstatus.Error(rpcstatus.Internal, err.Error()) 			} -			var transferMsg *pb.SatelliteMessage 			mon.IntVal("graceful_exit_final_pieces_failed").Observe(progress.PiecesFailed) 			mon.IntVal("graceful_exit_final_pieces_succeess").Observe(progress.PiecesTransferred) 			mon.IntVal("graceful_exit_final_bytes_transferred").Observe(progress.BytesTransferred) -			processed := progress.PiecesFailed + progress.PiecesTransferred-			if processed > 0 {-				mon.IntVal("graceful_exit_successful_pieces_transfer_ratio").Observe(progress.PiecesTransferred / processed)-			}--			// check node's exiting progress to see if it has failed passed max failure threshold-			if processed > 0 && float64(progress.PiecesFailed)/float64(processed)*100 >= float64(endpoint.config.OverallMaxFailuresPercentage) {--				exitStatusRequest.ExitSuccess = false-				transferMsg, err = endpoint.getFinishedMessage(ctx, endpoint.signer, nodeID, exitStatusRequest.ExitFinishedAt, exitStatusRequest.ExitSuccess, pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED)-				if err != nil {-					return rpcstatus.Error(rpcstatus.Internal, err.Error())-				}-			} else {-				exitStatusRequest.ExitSuccess = true-				transferMsg, err = endpoint.getFinishedMessage(ctx, endpoint.signer, nodeID, exitStatusRequest.ExitFinishedAt, exitStatusRequest.ExitSuccess, -1)-				if err != nil {-					return rpcstatus.Error(rpcstatus.Internal, err.Error())-				}-			}-			if exitStatusRequest.ExitSuccess {-				mon.Meter("graceful_exit_success").Mark(1)-			} else {-				mon.Meter("graceful_exit_fail_max_failures_percentage").Mark(1)-			}-			_, err = endpoint.overlaydb.UpdateExitStatus(ctx, exitStatusRequest)+			// update exit status+			exitStatusRequest, err := endpoint.generateExitStatusRequest(ctx, progress) 			if err != nil { 				return rpcstatus.Error(rpcstatus.Internal, err.Error()) 			} -			err = stream.Send(transferMsg)+			err = endpoint.handleFinished(ctx, stream, exitStatusRequest, pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED)

generateExitStatusRequest should return a pb.ExitFailed_Reason, because this call to handleFinished could still be an exit success, in which case we want to pass in -1 instead of pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED
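i.e. roughly (a sketch of the call site with the extra return value):

	exitStatusRequest, failedReason, err := endpoint.generateExitStatusRequest(ctx, progress)
	if err != nil {
		return rpcstatus.Error(rpcstatus.Internal, err.Error())
	}

	err = endpoint.handleFinished(ctx, stream, exitStatusRequest, failedReason)
	if err != nil {
		return rpcstatus.Error(rpcstatus.Internal, err.Error())
	}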

VinozzZ

comment created time in 6 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb  	return nil }++func (endpoint *Endpoint) checkExitStatus(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {+	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)+	if err != nil {+		return nil, Error.Wrap(err)+	}++	if exitStatus.ExitFinishedAt != nil {+		// TODO maybe we should store the reason in the DB so we know how it originally failed.+		return endpoint.getFinishedMessage(ctx, endpoint.signer, nodeID, *exitStatus.ExitFinishedAt, exitStatus.ExitSuccess, -1)+	}++	if exitStatus.ExitInitiatedAt == nil {+		request := &overlay.ExitStatusRequest{NodeID: nodeID, ExitInitiatedAt: time.Now().UTC()}+		node, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)+		if err != nil {+			return nil, Error.Wrap(err)+		}+		err = endpoint.db.IncrementProgress(ctx, nodeID, 0, 0, 0)+		if err != nil {+			return nil, Error.Wrap(err)+		}++		// graceful exit initiation metrics+		age := time.Now().UTC().Sub(node.CreatedAt.UTC())+		mon.FloatVal("graceful_exit_init_node_age_seconds").Observe(age.Seconds())

nit - maybe since we are moving everything around anyway, we should go ahead and add //locked to these monkit values so that they are added to monkit.lock. We can also do it separately, so don't worry too much about it.

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		if !morePiecesFlag && pendingCount == 0 { 			processMu.Unlock() -			exitStatusRequest := &overlay.ExitStatusRequest{-				NodeID:         nodeID,-				ExitFinishedAt: time.Now().UTC(),-			}- 			progress, err := endpoint.db.GetProgress(ctx, nodeID)

I think this GetProgress and the monkit tracking can also be moved into generateExitStatusRequest
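Roughly, the helper could take the node ID and do the lookup and metrics itself (signature and names are assumptions):

func (endpoint *Endpoint) generateExitStatusRequest(ctx context.Context, nodeID storj.NodeID) (*overlay.ExitStatusRequest, pb.ExitFailed_Reason, error) {
	progress, err := endpoint.db.GetProgress(ctx, nodeID)
	if err != nil {
		return nil, -1, Error.Wrap(err)
	}

	mon.IntVal("graceful_exit_final_pieces_failed").Observe(progress.PiecesFailed)
	mon.IntVal("graceful_exit_final_pieces_succeess").Observe(progress.PiecesTransferred)
	mon.IntVal("graceful_exit_final_bytes_transferred").Observe(progress.BytesTransferred)

	exitStatusRequest := &overlay.ExitStatusRequest{NodeID: nodeID, ExitFinishedAt: time.Now().UTC()}
	processed := progress.PiecesFailed + progress.PiecesTransferred
	exitStatusRequest.ExitSuccess = !(processed > 0 && float64(progress.PiecesFailed)/float64(processed)*100 >= float64(endpoint.config.OverallMaxFailuresPercentage))
	reason := pb.ExitFailed_Reason(-1)
	if !exitStatusRequest.ExitSuccess {
		reason = pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED
	}
	return exitStatusRequest, reason, nil
}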

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) processIncomplete(ctx context.Context, stream processS 	if err != nil { 		return Error.Wrap(err) 	}-	remote := pointer.GetRemote()--	pieces := remote.GetRemotePieces()-	var nodePiece *pb.RemotePiece-	excludedNodeIDs := make([]storj.NodeID, len(pieces))-	for i, piece := range pieces {-		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {-			nodePiece = piece-		}-		excludedNodeIDs[i] = piece.NodeId-	}--	if nodePiece == nil {-		endpoint.log.Debug("piece no longer held by node", zap.Stringer("Node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum)) -		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+	// validate pointer state+	nodePiece, err := validatePointer(ctx, pointer, incomplete)+	if err != nil {+		endpoint.log.Debug("piece no longer held by node", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))+		err := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum) 		if err != nil { 			return Error.Wrap(err) 		}+		return Error.Wrap(err)+	} +	pieceSize, err := endpoint.calculatePieceSize(ctx, pointer, incomplete, nodePiece)+	if ErrAboveOptimalThreshold.Has(err) {

To me, it would make more sense for calculatePieceSize to not do any logic related to updating metainfo or deleting from the transfer queue. Rather, it should just return the ErrAboveOptimalThreshold and we should do those updates here or in a separate function called from here.
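Something like this at the call site is what I have in mind (helper names like endpoint.metainfo.UpdatePieces are assumptions about what is available here):

pieceSize, err := endpoint.calculatePieceSize(ctx, pointer, incomplete, nodePiece)
if ErrAboveOptimalThreshold.Has(err) {
	// the segment already has enough healthy pieces, so instead of transferring,
	// drop the exiting node's piece from the pointer and remove the queue item
	if _, err := endpoint.metainfo.UpdatePieces(ctx, string(incomplete.Path), pointer, nil, []*pb.RemotePiece{nodePiece}); err != nil {
		return Error.Wrap(err)
	}
	if err := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum); err != nil {
		return Error.Wrap(err)
	}
	return nil
}
if err != nil {
	return Error.Wrap(err)
}
// continue building the transfer message with pieceSize as before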

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb  	return nil }++func (endpoint *Endpoint) checkExitStatus(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {

let's add a comment above this that describes what this function does and specifically clarifies that the returned message can be nil; if it is not nil, it means the caller should close the connection after sending it
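Something along these lines (wording is just a sketch):

// checkExitStatus returns a satellite message to send based on the node's current
// graceful exit status. The returned message may be nil; if it is non-nil, the caller
// should send it and then close the connection.
func (endpoint *Endpoint) checkExitStatus(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {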

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message.

 func TestExitDisabled(t *testing.T) { 	}) } +func TestPointerChangedOrDeleted(t *testing.T) {+	successThreshold := 4+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: successThreshold + 1,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: successThreshold,+			MaxThreshold:     successThreshold,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)+		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 2)+		require.NoError(t, err)++		exitRequest := &overlay.ExitStatusRequest{+			NodeID:          exitingNode.ID(),+			ExitInitiatedAt: time.Now(),+		}++		_, err = satellite.DB.OverlayCache().UpdateExitStatus(ctx, exitRequest)+		require.NoError(t, err)+		err = satellite.DB.GracefulExit().IncrementProgress(ctx, exitingNode.ID(), 0, 0, 0)+		require.NoError(t, err)++		exitingNodes, err = satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 1)+		require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		incomplete, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)+		require.NoError(t, err)+		require.Len(t, incomplete, 2)++		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))

nit - maybe a comment explaining what is going on here: you are updating the first pointer and removing the second pointer so that neither piece has to be transferred from the exiting node anymore.

ethanadams

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func TestInvalidStorageNodeSignature(t *testing.T) { 			require.FailNow(t, "should not reach this case: %#v", m) 		} -		// TODO uncomment once progress reflects updated success and fail counts 		// check that the exit has completed and we have the correct transferred/failed values-		// progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())-		// require.NoError(t, err)-		//-		// require.Equal(t, int64(0), progress.PiecesTransferred, tt.name)-		// require.Equal(t, int64(1), progress.PiecesFailed, tt.name)+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(1), progress.PiecesFailed)+	})+}++func TestExitDisqualifiedNodeFailOnStart(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 2,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		exitingNode := planet.StorageNodes[0]++		disqualifyNode(t, ctx, satellite, exitingNode.ID())++		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()+		processClient, err := client.Process(ctx)+		require.NoError(t, err)++		// Process endpoint should return immediately if node is disqualified+		response, err := processClient.Recv()+		require.True(t, errs2.IsRPC(err, rpcstatus.PermissionDenied))+		require.Nil(t, response)++		// check that the exit has completed and we have the correct transferred/failed values+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(1), progress.PiecesFailed)+	})++}++func TestExitDisqualifiedNodeFailEventually(t *testing.T) {+	testTransfers(t, numObjects, func(ctx *testcontext.Context, nodeFullIDs map[storj.NodeID]*identity.FullIdentity, satellite *testplanet.SatelliteSystem, processClient exitProcessClient, exitingNode *storagenode.Peer, numPieces int) {+		disqualifyNode(t, ctx, satellite, exitingNode.ID())++		deletedCount := 0+		for {+			response, err := processClient.Recv()+			if errs.Is(err, io.EOF) {+				// Done+				break+			}+			if deletedCount >= numPieces {+				// when a disqualified node has finished transfer all pieces, it should receive an error+				require.True(t, errs2.IsRPC(err, rpcstatus.PermissionDenied))+				break+			} else {+				require.NoError(t, err)+			}++			switch m := response.GetMessage().(type) {+			case *pb.SatelliteMessage_TransferPiece:+				require.NotNil(t, m)++				pieceReader, err := exitingNode.Storage2.Store.Reader(ctx, satellite.ID(), m.TransferPiece.OriginalPieceId)+				require.NoError(t, err)++				header, err := pieceReader.GetPieceHeader()+				require.NoError(t, err)++				orderLimit := header.OrderLimit+				originalPieceHash := &pb.PieceHash{+					PieceId:   orderLimit.PieceId,+					Hash:      header.GetHash(),+					PieceSize: pieceReader.Size(),+					Timestamp: header.GetCreationTime(),+					Signature: header.GetSignature(),+				}++				newPieceHash := &pb.PieceHash{+					PieceId:   m.TransferPiece.AddressedOrderLimit.Limit.PieceId,+					Hash:      originalPieceHash.Hash,+					PieceSize: originalPieceHash.PieceSize,+					Timestamp: time.Now(),+				}++				receivingNodeID := 
nodeFullIDs[m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId]+				require.NotNil(t, receivingNodeID)+				signer := signing.SignerFromFullIdentity(receivingNodeID)++				signedNewPieceHash, err := signing.SignPieceHash(ctx, signer, newPieceHash)+				require.NoError(t, err)++				success := &pb.StorageNodeMessage{+					Message: &pb.StorageNodeMessage_Succeeded{+						Succeeded: &pb.TransferSucceeded{+							OriginalPieceId:      m.TransferPiece.OriginalPieceId,+							OriginalPieceHash:    originalPieceHash,+							OriginalOrderLimit:   &orderLimit,+							ReplacementPieceHash: signedNewPieceHash,+						},+					},+				}+				err = processClient.Send(success)+				require.NoError(t, err)+			case *pb.SatelliteMessage_DeletePiece:+				deletedCount+++			case *pb.SatelliteMessage_ExitFailed:

Will the node ever receive an official exit failed message if it is disqualified? If so, this is fine. Otherwise, I think we should remove this case since it could mislead the person reading the test into thinking it will happen.

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb  	return nil }++func (endpoint *Endpoint) handleDisqualifiedNode(ctx context.Context, nodeID storj.NodeID) (isDisqualified bool, err error) {+	// check if node is disqualified+	nodeInfo, err := endpoint.overlay.Get(ctx, nodeID)+	if err != nil {+		return false, Error.Wrap(err)+	}++	if nodeInfo.Disqualified != nil {+		// update graceful exit status to be failed+		exitStatusRequest := &overlay.ExitStatusRequest{+			NodeID:         nodeID,+			ExitFinishedAt: time.Now().UTC(),+			ExitSuccess:    false,+		}++		err = endpoint.db.IncrementProgress(ctx, nodeID, 0, 0, 1)+		if err != nil {+			return true, Error.Wrap(err)+		}++		_, err = endpoint.overlaydb.UpdateExitStatus(ctx, exitStatusRequest)+		if err != nil {+			return true, Error.Wrap(err)+		}++		// remove remaining items from the queue after notifying nodes about their exit status

I think this comment is misleading because we do not notify the node about their exit status, we just return "permission denied" and delete the transfer queue items.
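For example, the comment could instead read something like:

// the node is not notified here; the endpoint returns "permission denied" and we just
// clean up the remaining transfer queue items for this node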

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func TestInvalidStorageNodeSignature(t *testing.T) { 			require.FailNow(t, "should not reach this case: %#v", m) 		} -		// TODO uncomment once progress reflects updated success and fail counts 		// check that the exit has completed and we have the correct transferred/failed values-		// progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())-		// require.NoError(t, err)-		//-		// require.Equal(t, int64(0), progress.PiecesTransferred, tt.name)-		// require.Equal(t, int64(1), progress.PiecesFailed, tt.name)+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(1), progress.PiecesFailed)+	})+}++func TestExitDisqualifiedNodeFailOnStart(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 2,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		exitingNode := planet.StorageNodes[0]++		disqualifyNode(t, ctx, satellite, exitingNode.ID())++		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()+		processClient, err := client.Process(ctx)+		require.NoError(t, err)++		// Process endpoint should return immediately if node is disqualified+		response, err := processClient.Recv()+		require.True(t, errs2.IsRPC(err, rpcstatus.PermissionDenied))+		require.Nil(t, response)++		// check that the exit has completed and we have the correct transferred/failed values+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(1), progress.PiecesFailed)+	})++}++func TestExitDisqualifiedNodeFailEventually(t *testing.T) {+	testTransfers(t, numObjects, func(ctx *testcontext.Context, nodeFullIDs map[storj.NodeID]*identity.FullIdentity, satellite *testplanet.SatelliteSystem, processClient exitProcessClient, exitingNode *storagenode.Peer, numPieces int) {+		disqualifyNode(t, ctx, satellite, exitingNode.ID())++		deletedCount := 0+		for {+			response, err := processClient.Recv()+			if errs.Is(err, io.EOF) {+				// Done+				break+			}+			if deletedCount >= numPieces {+				// when a disqualified node has finished transfer all pieces, it should receive an error+				require.True(t, errs2.IsRPC(err, rpcstatus.PermissionDenied))+				break+			} else {+				require.NoError(t, err)+			}++			switch m := response.GetMessage().(type) {+			case *pb.SatelliteMessage_TransferPiece:+				require.NotNil(t, m)++				pieceReader, err := exitingNode.Storage2.Store.Reader(ctx, satellite.ID(), m.TransferPiece.OriginalPieceId)+				require.NoError(t, err)++				header, err := pieceReader.GetPieceHeader()+				require.NoError(t, err)++				orderLimit := header.OrderLimit+				originalPieceHash := &pb.PieceHash{+					PieceId:   orderLimit.PieceId,+					Hash:      header.GetHash(),+					PieceSize: pieceReader.Size(),+					Timestamp: header.GetCreationTime(),+					Signature: header.GetSignature(),+				}++				newPieceHash := &pb.PieceHash{+					PieceId:   m.TransferPiece.AddressedOrderLimit.Limit.PieceId,+					Hash:      originalPieceHash.Hash,+					PieceSize: originalPieceHash.PieceSize,+					Timestamp: time.Now(),+				}++				receivingNodeID := 
nodeFullIDs[m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId]+				require.NotNil(t, receivingNodeID)+				signer := signing.SignerFromFullIdentity(receivingNodeID)++				signedNewPieceHash, err := signing.SignPieceHash(ctx, signer, newPieceHash)+				require.NoError(t, err)++				success := &pb.StorageNodeMessage{+					Message: &pb.StorageNodeMessage_Succeeded{+						Succeeded: &pb.TransferSucceeded{+							OriginalPieceId:      m.TransferPiece.OriginalPieceId,+							OriginalPieceHash:    originalPieceHash,+							OriginalOrderLimit:   &orderLimit,+							ReplacementPieceHash: signedNewPieceHash,+						},+					},+				}+				err = processClient.Send(success)+				require.NoError(t, err)+			case *pb.SatelliteMessage_DeletePiece:+				deletedCount+++			case *pb.SatelliteMessage_ExitFailed:+				require.NotNil(t, m)+			default:+				t.FailNow()+			}+		}++		// check that the exit has completed and we have the correct transferred/failed values+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.EqualValues(t, numPieces, progress.PiecesTransferred)+		require.EqualValues(t, numPieces, deletedCount)+		require.EqualValues(t, 1, progress.PiecesFailed)

Should this be 1? In the test, all the pieces transfer successfully.

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func TestChore(t *testing.T) { 		require.NoError(t, err) 		nodeIDs := make(storj.NodeIDList, 0, len(exitingNodes)) 		for _, exitingNode := range exitingNodes {+			// exiting node should not be disqualified+			node, err := satellite.Overlay.Service.Get(ctx, exitingNode.NodeID)+			require.NoError(t, err)+			require.Nil(t, node.Disqualified)

I guess what I was asking is that we should be able to safely assume that no nodes are DQed in this test at any point, so why even bother with the check?

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func TestInvalidStorageNodeSignature(t *testing.T) { 			require.FailNow(t, "should not reach this case: %#v", m) 		} -		// TODO uncomment once progress reflects updated success and fail counts 		// check that the exit has completed and we have the correct transferred/failed values-		// progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())-		// require.NoError(t, err)-		//-		// require.Equal(t, int64(0), progress.PiecesTransferred, tt.name)-		// require.Equal(t, int64(1), progress.PiecesFailed, tt.name)+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(1), progress.PiecesFailed)+	})+}++func TestExitDisqualifiedNodeFailOnStart(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: 2,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		exitingNode := planet.StorageNodes[0]++		disqualifyNode(t, ctx, satellite, exitingNode.ID())++		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()+		processClient, err := client.Process(ctx)+		require.NoError(t, err)++		// Process endpoint should return immediately if node is disqualified+		response, err := processClient.Recv()+		require.True(t, errs2.IsRPC(err, rpcstatus.PermissionDenied))+		require.Nil(t, response)++		// check that the exit has completed and we have the correct transferred/failed values+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(1), progress.PiecesFailed)+	})++}++func TestExitDisqualifiedNodeFailEventually(t *testing.T) {+	testTransfers(t, numObjects, func(ctx *testcontext.Context, nodeFullIDs map[storj.NodeID]*identity.FullIdentity, satellite *testplanet.SatelliteSystem, processClient exitProcessClient, exitingNode *storagenode.Peer, numPieces int) {+		disqualifyNode(t, ctx, satellite, exitingNode.ID())+		var pieceID storj.PieceID+		failedCount := 0+		deletedCount := 0+		for {+			response, err := processClient.Recv()+			if errs.Is(err, io.EOF) {+				// Done+				break+			}+			if deletedCount >= numPieces {+				// when a disqualified node has finished transfer all pieces, it should receive an error+				require.True(t, errs2.IsRPC(err, rpcstatus.PermissionDenied))+				break+			} else {+				require.NoError(t, err)+			}++			switch m := response.GetMessage().(type) {+			case *pb.SatelliteMessage_TransferPiece:+				require.NotNil(t, m)++				// pick the first one to fail+				if pieceID.IsZero() {+					pieceID = m.TransferPiece.OriginalPieceId+				}++				if failedCount > 0 || pieceID != m.TransferPiece.OriginalPieceId {++					pieceReader, err := exitingNode.Storage2.Store.Reader(ctx, satellite.ID(), m.TransferPiece.OriginalPieceId)+					require.NoError(t, err)++					header, err := pieceReader.GetPieceHeader()+					require.NoError(t, err)++					orderLimit := header.OrderLimit+					originalPieceHash := &pb.PieceHash{+						PieceId:   orderLimit.PieceId,+						Hash:      header.GetHash(),+						PieceSize: pieceReader.Size(),+						Timestamp: header.GetCreationTime(),+						Signature: header.GetSignature(),+					}++					newPieceHash := &pb.PieceHash{+						PieceId:   
m.TransferPiece.AddressedOrderLimit.Limit.PieceId,+						Hash:      originalPieceHash.Hash,+						PieceSize: originalPieceHash.PieceSize,+						Timestamp: time.Now(),+					}++					receivingNodeID := nodeFullIDs[m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId]+					require.NotNil(t, receivingNodeID)+					signer := signing.SignerFromFullIdentity(receivingNodeID)++					signedNewPieceHash, err := signing.SignPieceHash(ctx, signer, newPieceHash)+					require.NoError(t, err)++					success := &pb.StorageNodeMessage{+						Message: &pb.StorageNodeMessage_Succeeded{+							Succeeded: &pb.TransferSucceeded{+								OriginalPieceId:      m.TransferPiece.OriginalPieceId,+								OriginalPieceHash:    originalPieceHash,+								OriginalOrderLimit:   &orderLimit,+								ReplacementPieceHash: signedNewPieceHash,+							},+						},+					}+					err = processClient.Send(success)+					require.NoError(t, err)+				} else {+					failedCount+++					failed := &pb.StorageNodeMessage{

I'm not sure I understand why we fail any piece transfers. This test ensures that if the storagenode is disqualified while a graceful exit is already in progress, the graceful exit eventually fails because of the disqualification. Because of that, I think we should only send valid transfer success messages and still expect graceful exit failure + permission denied at the end.

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func TestFailureUnknownError(t *testing.T) { 			require.FailNow(t, "should not reach this case: %#v", m) 		} -		// TODO uncomment once progress reflects updated success and fail counts 		// check that the exit has completed and we have the correct transferred/failed values-		// progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())-		// require.NoError(t, err)-		//-		// require.Equal(t, int64(0), progress.PiecesTransferred, tt.name)-		// require.Equal(t, int64(1), progress.PiecesFailed, tt.name)+		progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())+		require.NoError(t, err)++		require.Equal(t, int64(0), progress.PiecesTransferred)+		require.Equal(t, int64(0), progress.PiecesFailed)

hmm why is pieces failed 0?

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func TestChore(t *testing.T) { 		require.NoError(t, err) 		nodeIDs := make(storj.NodeIDList, 0, len(exitingNodes)) 		for _, exitingNode := range exitingNodes {+			// exiting node should not be disqualified+			node, err := satellite.Overlay.Service.Get(ctx, exitingNode.NodeID)+			require.NoError(t, err)+			require.Nil(t, node.Disqualified)

I agree with Nat. This test is not doing anything that would dq a node for any reason so I don't see a need to check this. @VinozzZ could you clarify what you are saying in your comment above? I don't think any nodes will be dqed in this test so waiting on the chore shouldn't make a difference.

VinozzZ

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message.

 func TestExitDisabled(t *testing.T) { 	}) } +func TestPointerChangedOrDeleted(t *testing.T) {+	successThreshold := 4+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: successThreshold + 1,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: successThreshold,+			MaxThreshold:     successThreshold,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)+		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 2)+		require.NoError(t, err)++		exitRequest := &overlay.ExitStatusRequest{+			NodeID:          exitingNode.ID(),+			ExitInitiatedAt: time.Now(),+		}++		_, err = satellite.DB.OverlayCache().UpdateExitStatus(ctx, exitRequest)+		require.NoError(t, err)+		err = satellite.DB.GracefulExit().IncrementProgress(ctx, exitingNode.ID(), 0, 0, 0)+		require.NoError(t, err)++		exitingNodes, err = satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 1)+		require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		incomplete, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)+		require.NoError(t, err)+		require.Len(t, incomplete, 2)++		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))

Hmm maybe it overwrites it, but I didn't know about that functionality. I would have assumed it would get some sort of metainfo already exists error back.

ethanadams

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message.

 func TestExitDisabled(t *testing.T) { 	}) } +func TestPointerChangedOrDeleted(t *testing.T) {+	successThreshold := 4+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: successThreshold + 1,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: successThreshold,+			MaxThreshold:     successThreshold,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)+		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 2)+		require.NoError(t, err)++		exitRequest := &overlay.ExitStatusRequest{+			NodeID:          exitingNode.ID(),+			ExitInitiatedAt: time.Now(),+		}++		_, err = satellite.DB.OverlayCache().UpdateExitStatus(ctx, exitRequest)+		require.NoError(t, err)+		err = satellite.DB.GracefulExit().IncrementProgress(ctx, exitingNode.ID(), 0, 0, 0)+		require.NoError(t, err)++		exitingNodes, err = satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 1)+		require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		incomplete, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)+		require.NoError(t, err)+		require.Len(t, incomplete, 2)++		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)+		err = uplinkPeer.Delete(ctx, satellite, "testbucket", "test/path1")+		require.NoError(t, err)++		// reconnect to the satellite.+		conn, err := exitingNode.Dialer.DialAddressID(ctx, satellite.Addr(), satellite.Identity.ID)+		require.NoError(t, err)+		defer ctx.Check(conn.Close)++		client := conn.SatelliteGracefulExitClient()++		c, err := client.Process(ctx)+		require.NoError(t, err)+		defer ctx.Check(c.CloseSend)++		response, err := c.Recv()+		require.NoError(t, err)++		// we expect an exit completed b/c there is nothing to do here+		switch m := response.GetMessage().(type) {+		case *pb.SatelliteMessage_ExitCompleted:+			signee := signing.SigneeFromPeerIdentity(satellite.Identity.PeerIdentity())+			err = signing.VerifyExitCompleted(ctx, signee, m.ExitCompleted)+			require.NoError(t, err)++			exitStatus, err := satellite.DB.OverlayCache().GetExitStatus(ctx, exitingNode.ID())+			require.NoError(t, err)+			require.NotNil(t, exitStatus.ExitFinishedAt)+			require.True(t, exitStatus.ExitSuccess)+		default:+			t.FailNow()+		}

should we check the transfer queue to make sure it is empty at the end of this test?
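If we do, it could reuse the same call the test already makes earlier, e.g.:

// the transfer queue should be empty once the exit has completed
incomplete, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)
require.NoError(t, err)
require.Len(t, incomplete, 0)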

ethanadams

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message.

 func TestExitDisabled(t *testing.T) { 	}) } +func TestPointerChangedOrDeleted(t *testing.T) {+	successThreshold := 4+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: successThreshold + 1,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: successThreshold,+			MaxThreshold:     successThreshold,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)+		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 2)+		require.NoError(t, err)++		exitRequest := &overlay.ExitStatusRequest{+			NodeID:          exitingNode.ID(),+			ExitInitiatedAt: time.Now(),+		}++		_, err = satellite.DB.OverlayCache().UpdateExitStatus(ctx, exitRequest)+		require.NoError(t, err)+		err = satellite.DB.GracefulExit().IncrementProgress(ctx, exitingNode.ID(), 0, 0, 0)+		require.NoError(t, err)++		exitingNodes, err = satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 1)+		require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		incomplete, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)+		require.NoError(t, err)+		require.Len(t, incomplete, 2)++		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))

If you intend to replace it, wouldn't it make sense to delete the original item at "test/path0" first?

ethanadams

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message.

 model graceful_exit_transfer_queue (     field node_id             blob     field path                blob     field piece_num           int+    field root_piece_id       blob       ( updatable, nullable )

does it need to be updatable? In what situation would it change?

ethanadams

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: Check if pointer has been overwritten or deleted before sending transfer message.

 func TestExitDisabled(t *testing.T) { 	}) } +func TestPointerChangedOrDeleted(t *testing.T) {+	successThreshold := 4+	testplanet.Run(t, testplanet.Config{+		SatelliteCount:   1,+		StorageNodeCount: successThreshold + 1,+		UplinkCount:      1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		uplinkPeer := planet.Uplinks[0]+		satellite := planet.Satellites[0]++		satellite.GracefulExit.Chore.Loop.Pause()++		rs := &uplink.RSConfig{+			MinThreshold:     2,+			RepairThreshold:  3,+			SuccessThreshold: successThreshold,+			MaxThreshold:     successThreshold,+		}++		err := uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)+		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))+		require.NoError(t, err)++		// check that there are no exiting nodes.+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 0)++		exitingNode, err := findNodeToExit(ctx, planet, 2)+		require.NoError(t, err)++		exitRequest := &overlay.ExitStatusRequest{+			NodeID:          exitingNode.ID(),+			ExitInitiatedAt: time.Now(),+		}++		_, err = satellite.DB.OverlayCache().UpdateExitStatus(ctx, exitRequest)+		require.NoError(t, err)+		err = satellite.DB.GracefulExit().IncrementProgress(ctx, exitingNode.ID(), 0, 0, 0)+		require.NoError(t, err)++		exitingNodes, err = satellite.DB.OverlayCache().GetExitingNodes(ctx)+		require.NoError(t, err)+		require.Len(t, exitingNodes, 1)+		require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)++		// trigger the metainfo loop chore so we can get some pieces to transfer+		satellite.GracefulExit.Chore.Loop.TriggerWait()++		// make sure all the pieces are in the transfer queue+		incomplete, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)+		require.NoError(t, err)+		require.Len(t, incomplete, 2)++		err = uplinkPeer.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path0", testrand.Bytes(5*memory.KiB))

Didn't you already upload to this path? Why wouldn't there be an error?

ethanadams

comment created time in 6 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func (cache *overlaycache) SelectNewStorageNodes(ctx context.Context, count int, 	return nodes, nil } +// GetNodeIPs returns a list of node IP addresses.+func (cache *overlaycache) GetNodeIPs(ctx context.Context, nodeIDs []storj.NodeID) (nodeIPs []string, err error) {+	defer mon.Task()(&ctx)(&err)++	var rows *sql.Rows+	// warning: these node IP addresses might be returned out of order

nit - should this warning be part of the documentation comment above the function?

navillasa

comment created time in 6 days

push eventstorj/storj

Nikolai Siedov

commit sha fd9f860fd6afbe477e728b6a3d2b2812a13be916

token error code fixed (#3514)

view details

Maximillian von Briesen

commit sha 5aa0c38efde142370d7ef5bb37c67bd0b3317586

Merge branch 'master' into ee/metainfo-delete

view details

push time in 7 days

Pull request review commentstorj/storj

satellite/audit: do not audit expired segments

 func (verifier *Verifier) Reverify(ctx context.Context, path storj.Path) (report 		} 		return Report{}, err 	}+	if pointer.ExpirationDate != (time.Time{}) && pointer.ExpirationDate.Before(time.Now().UTC()) {+		errDelete := verifier.metainfo.Delete(ctx, path)

if the data is expired, how could it be valid?

mobyvb

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func TestNodeSelectionGracefulExit(t *testing.T) { 	}) } +func TestFindStorageNodesDistinctIPs(t *testing.T) {+	if runtime.GOOS == "darwin" {+		t.Skip("Test does not work with macOS")+	}+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 1,+		Reconfigure: testplanet.Reconfigure{+			// will create 3 storage nodes with same IP; 2 will have unique+			UniqueIPCount: 2,+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]++		var excludedNodes storj.NodeIDList+		addrs := make(map[storj.NodeID]struct{})+		for _, node := range planet.StorageNodes {+			if _, ok := addrs[node.ID()]; ok {+				addrs[node.ID()] = struct{}{}+			} else {+				// Add one ID to the excluded nodes list.+				// The FindStorageNodesDistinctIPs function should+				// also exclude the other nodes that share the same IP.+				excludedNodes = append(excludedNodes, node.ID())+				break+			}+		}++		req := overlay.FindStorageNodesRequest{+			MinimumRequiredNodes: 2,+			RequestedCount:       2,+			ExcludedNodes:        excludedNodes,+		}+		nodes, err := satellite.Overlay.Service.FindStorageNodesDistinctIPs(ctx, req)+		require.NoError(t, err)+		require.Len(t, nodes, 2)+		require.NotEqual(t, nodes[0].LastIp, nodes[1].LastIp)++		req = overlay.FindStorageNodesRequest{+			MinimumRequiredNodes: 4,

This should fail with a request for 3 nodes, so let's use that.
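i.e. keep the failing request just above the number of distinct IPs that remain after the exclusion:

req = overlay.FindStorageNodesRequest{
	MinimumRequiredNodes: 3,
	RequestedCount:       3,
	ExcludedNodes:        excludedNodes,
}
_, err = satellite.Overlay.Service.FindStorageNodesDistinctIPs(ctx, req)
require.Error(t, err)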

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func TestNodeSelectionGracefulExit(t *testing.T) { 	}) } +func TestFindStorageNodesDistinctIPs(t *testing.T) {+	if runtime.GOOS == "darwin" {+		t.Skip("Test does not work with macOS")+	}+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 1,+		Reconfigure: testplanet.Reconfigure{+			// will create 3 storage nodes with same IP; 2 will have unique+			UniqueIPCount: 2,+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]++		var excludedNodes storj.NodeIDList+		addrs := make(map[storj.NodeID]struct{})+		for _, node := range planet.StorageNodes {+			if _, ok := addrs[node.ID()]; ok {+				addrs[node.ID()] = struct{}{}+			} else {+				// Add one ID to the excluded nodes list.

Specifically, you are trying to add one of the nodes that has a duplicate IP to the list, correct? Like if you have nodes 1-5, and 1-3 have IP1, 4 has IP2, and 5 has IP3, you are trying to add node 1, 2, or 3 to excludedNodes? If yes, I think we should clarify this as part of this comment.

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func TestNodeSelectionGracefulExit(t *testing.T) { 	}) } +func TestFindStorageNodesDistinctIPs(t *testing.T) {+	if runtime.GOOS == "darwin" {+		t.Skip("Test does not work with macOS")+	}+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 1,+		Reconfigure: testplanet.Reconfigure{+			// will create 3 storage nodes with same IP; 2 will have unique+			UniqueIPCount: 2,+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]++		var excludedNodes storj.NodeIDList+		addrs := make(map[storj.NodeID]struct{})+		for _, node := range planet.StorageNodes {+			if _, ok := addrs[node.ID()]; ok {

Also node.ID() will be unique for every storagenode. Shouldn't addrs be a map[string]struct{} with IP being the key instead?

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func TestNodeSelectionGracefulExit(t *testing.T) { 	}) } +func TestFindStorageNodesDistinctIPs(t *testing.T) {+	if runtime.GOOS == "darwin" {+		t.Skip("Test does not work with macOS")+	}+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 1,+		Reconfigure: testplanet.Reconfigure{+			// will create 3 storage nodes with same IP; 2 will have unique+			UniqueIPCount: 2,+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]++		var excludedNodes storj.NodeIDList+		addrs := make(map[storj.NodeID]struct{})+		for _, node := range planet.StorageNodes {+			if _, ok := addrs[node.ID()]; ok {+				addrs[node.ID()] = struct{}{}+			} else {+				// Add one ID to the excluded nodes list.+				// The FindStorageNodesDistinctIPs function should+				// also exclude the other nodes that share the same IP.+				excludedNodes = append(excludedNodes, node.ID())+				break+			}+		}++		req := overlay.FindStorageNodesRequest{+			MinimumRequiredNodes: 2,+			RequestedCount:       2,+			ExcludedNodes:        excludedNodes,+		}+		nodes, err := satellite.Overlay.Service.FindStorageNodesDistinctIPs(ctx, req)+		require.NoError(t, err)+		require.Len(t, nodes, 2)+		require.NotEqual(t, nodes[0].LastIp, nodes[1].LastIp)

We also need to check that the IP of the excluded node is not equal to either of these IPs.
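Sketch of the extra assertion, assuming the excluded node's IP can be read back from its overlay dossier (LastIp):

excludedDossier, err := satellite.Overlay.Service.Get(ctx, excludedNodes[0])
require.NoError(t, err)
require.NotEqual(t, excludedDossier.LastIp, nodes[0].LastIp)
require.NotEqual(t, excludedDossier.LastIp, nodes[1].LastIp)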

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func TestNodeSelectionGracefulExit(t *testing.T) { 	}) } +func TestFindStorageNodesDistinctIPs(t *testing.T) {+	if runtime.GOOS == "darwin" {+		t.Skip("Test does not work with macOS")+	}+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 1,+		Reconfigure: testplanet.Reconfigure{+			// will create 3 storage nodes with same IP; 2 will have unique+			UniqueIPCount: 2,+		},+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]++		var excludedNodes storj.NodeIDList+		addrs := make(map[storj.NodeID]struct{})+		for _, node := range planet.StorageNodes {+			if _, ok := addrs[node.ID()]; ok {

shouldn't the condition be !ok? I don't see how anything can be added to addrs otherwise
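Putting this together with the map-key comment above, the loop could look roughly like this (pulling the IP out of node.Addr() is an assumption about the simplest way to get it in this test):

addrs := make(map[string]struct{}) // keyed by IP, not node ID
var excludedNodes storj.NodeIDList
for _, node := range planet.StorageNodes {
	host, _, err := net.SplitHostPort(node.Addr())
	require.NoError(t, err)
	if _, ok := addrs[host]; !ok {
		addrs[host] = struct{}{}
		continue
	}
	// this node shares an IP with one we've already seen, so excluding it should make
	// FindStorageNodesDistinctIPs filter out every node on that IP
	excludedNodes = append(excludedNodes, node.ID())
	break
}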

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func (endpoint *Endpoint) Process(stream pb.SatelliteGracefulExit_ProcessServer) 	return endpoint.doProcess(stream) } -// Process is called by storage nodes to receive pieces to transfer to new nodes and get exit status.+// Process is called by storage nodes to receive pieceIDs to transfer pieces to new nodes and get exit status.

nit - while the original comment wasn't 100% accurate, I think these changes make it more confusing. Not sure how to reword it off the top of my head, but I'll try to think of something by my next review of this.

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: select new node filtered by IP

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		return nil 	} +	// maps pieceIDs to pendingTransfer

nit - I think something like "maps pieceIDs to pendingTransfers to keep track of ongoing piece transfer requests" would give the reader more useful context.

navillasa

comment created time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		endpoint.connections.delete(nodeID) 	}() -	eofHandler := func(err error) error {-		if err == io.EOF {-			endpoint.log.Debug("received EOF when trying to receive messages from storage node", zap.Stringer("node ID", nodeID))-			return nil-		}-		if err != nil {-			return rpcstatus.Error(rpcstatus.Unknown, Error.Wrap(err).Error())-		}-		return nil-	}--	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)+	// check initial state+	msg, err := endpoint.checkInitialState(ctx, nodeID)

I'm in favor of that. checkExitStatus is more specific/clear

VinozzZ

comment created time in 7 days

Pull request review commentstorj/storj

logging: unify logging around satellite ID, node ID and piece ID

 func (s *Service) retainPieces(ctx context.Context, req Request) (err error) { 		pieceID := access.PieceID() 		if !filter.Contains(pieceID) { 			s.log.Debug("About to delete piece id",-				zap.String("satellite", satelliteID.String()),-				zap.String("pieceID", pieceID.String()),-				zap.String("status", s.config.Status.String()))+				zap.Stringer("Satellite ID", satelliteID),+				zap.Stringer("Piece ID", pieceID),+				zap.String("Status", s.config.Status.String()))

nit - can this be zap.Stringer("Status", s.config.Status)?
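i.e., assuming s.config.Status satisfies fmt.Stringer, it would match the other fields:

s.log.Debug("About to delete piece id",
	zap.Stringer("Satellite ID", satelliteID),
	zap.Stringer("Piece ID", pieceID),
	zap.Stringer("Status", s.config.Status))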

littleskunk

comment created time in 7 days

Pull request review commentstorj/storj

logging: unify logging around satellite ID, node ID and piece ID

 func (s *Service) retainPieces(ctx context.Context, req Request) (err error) { 		pieceID := access.PieceID() 		if !filter.Contains(pieceID) { 			s.log.Debug("About to delete piece id",-				zap.String("satellite", satelliteID.String()),-				zap.String("pieceID", pieceID.String()),-				zap.String("status", s.config.Status.String()))+				zap.Stringer("Satellite ID", satelliteID),+				zap.Stringer("Piece ID", pieceID),+				zap.String("Status", s.config.Status.String()))

Same comment for line 411

littleskunk

comment created time in 7 days

Pull request review commentstorj/storj

satellite/audit: do not audit expired segments

 func TestReverifyDifferentShare(t *testing.T) { 		require.Equal(t, report.Fails[0], selectedNode) 	}) }++// TestReverifyExpired1 tests the case where the segment passed into Reverify is expired+func TestReverifyExpired1(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		audits := satellite.Audit+		queue := audits.Queue++		audits.Worker.Loop.Pause()++		ul := planet.Uplinks[0]+		testData := testrand.Bytes(8 * memory.KiB)++		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)+		require.NoError(t, err)++		audits.Chore.Loop.TriggerWait()+		path, err := queue.Next()+		require.NoError(t, err)++		// set pointer's expiration date to be already expired+		pointer, err := satellite.Metainfo.Service.Get(ctx, path)+		require.NoError(t, err)+		oldPointerBytes, err := proto.Marshal(pointer)+		require.NoError(t, err)+		newPointer := &pb.Pointer{}+		err = proto.Unmarshal(oldPointerBytes, newPointer)+		require.NoError(t, err)+		newPointer.ExpirationDate = time.Now().UTC().Add(-1 * time.Hour)+		newPointerBytes, err := proto.Marshal(newPointer)+		require.NoError(t, err)+		err = satellite.Metainfo.Service.DB.CompareAndSwap(ctx, storage.Key(path), oldPointerBytes, newPointerBytes)+		require.NoError(t, err)++		report, err := audits.Verifier.Reverify(ctx, path)+		require.Error(t, err)+		require.True(t, audit.ErrSegmentExpired.Has(err))++		// Reverify should delete the expired segment+		pointer, err = satellite.Metainfo.Service.Get(ctx, path)+		require.Error(t, err)+		require.Nil(t, pointer)++		assert.Len(t, report.Successes, 0)+		assert.Len(t, report.Fails, 0)+		assert.Len(t, report.Offlines, 0)+		assert.Len(t, report.PendingAudits, 0)+	})+}++// TestReverifyExpired2 tests the case where the segment passed into Reverify is not expired,+// but the segment a node is contained for has expired.+func TestReverifyExpired2(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		audits := satellite.Audit+		queue := audits.Queue++		audits.Worker.Loop.Pause()++		ul := planet.Uplinks[0]+		testData1 := testrand.Bytes(8 * memory.KiB)+		testData2 := testrand.Bytes(8 * memory.KiB)+		// upload to three nodes so there is definitely at least one node overlap between the two files+		rs := &uplink.RSConfig{+			MinThreshold:     1,+			RepairThreshold:  2,+			SuccessThreshold: 3,+			MaxThreshold:     3,+		}+		err := ul.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testData1)+		require.NoError(t, err)++		err = ul.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path2", testData2)+		require.NoError(t, err)++		audits.Chore.Loop.TriggerWait()+		path1, err := queue.Next()+		require.NoError(t, err)+		path2, err := queue.Next()+		require.NoError(t, err)+		require.NotEqual(t, path1, path2)++		pointer1, err := satellite.Metainfo.Service.Get(ctx, path1)+		require.NoError(t, err)+		pointer2, err := satellite.Metainfo.Service.Get(ctx, path2)+		require.NoError(t, err)++		// find a node that contains a piece for both files+		// save that node ID and the piece number associated with it for pointer1+		var selectedNode storj.NodeID+		var selectedPieceNum int32+		p1Nodes := make(map[storj.NodeID]int32)+		for _, piece := range pointer1.GetRemote().GetRemotePieces() {+	
		p1Nodes[piece.NodeId] = piece.PieceNum+		}+		for _, piece := range pointer2.GetRemote().GetRemotePieces() {+			pieceNum, ok := p1Nodes[piece.NodeId]+			if ok {+				selectedNode = piece.NodeId+				selectedPieceNum = pieceNum+				break+			}+		}+		require.NotEqual(t, selectedNode, storj.NodeID{})

yeah it's just a sanity check. I don't think this should ever fail

mobyvb

comment created time in 7 days

Pull request review commentstorj/storj

satellite/audit: do not audit expired segments

 func TestReverifyDifferentShare(t *testing.T) { 		require.Equal(t, report.Fails[0], selectedNode) 	}) }++// TestReverifyExpired1 tests the case where the segment passed into Reverify is expired+func TestReverifyExpired1(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		audits := satellite.Audit+		queue := audits.Queue++		audits.Worker.Loop.Pause()++		ul := planet.Uplinks[0]+		testData := testrand.Bytes(8 * memory.KiB)++		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)+		require.NoError(t, err)++		audits.Chore.Loop.TriggerWait()+		path, err := queue.Next()+		require.NoError(t, err)++		// set pointer's expiration date to be already expired+		pointer, err := satellite.Metainfo.Service.Get(ctx, path)+		require.NoError(t, err)+		oldPointerBytes, err := proto.Marshal(pointer)+		require.NoError(t, err)+		newPointer := &pb.Pointer{}+		err = proto.Unmarshal(oldPointerBytes, newPointer)+		require.NoError(t, err)+		newPointer.ExpirationDate = time.Now().UTC().Add(-1 * time.Hour)+		newPointerBytes, err := proto.Marshal(newPointer)+		require.NoError(t, err)+		err = satellite.Metainfo.Service.DB.CompareAndSwap(ctx, storage.Key(path), oldPointerBytes, newPointerBytes)+		require.NoError(t, err)++		report, err := audits.Verifier.Reverify(ctx, path)+		require.Error(t, err)+		require.True(t, audit.ErrSegmentExpired.Has(err))++		// Reverify should delete the expired segment+		pointer, err = satellite.Metainfo.Service.Get(ctx, path)+		require.Error(t, err)+		require.Nil(t, pointer)++		assert.Len(t, report.Successes, 0)+		assert.Len(t, report.Fails, 0)+		assert.Len(t, report.Offlines, 0)+		assert.Len(t, report.PendingAudits, 0)+	})+}++// TestReverifyExpired2 tests the case where the segment passed into Reverify is not expired,+// but the segment a node is contained for has expired.+func TestReverifyExpired2(t *testing.T) {+	testplanet.Run(t, testplanet.Config{+		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {+		satellite := planet.Satellites[0]+		audits := satellite.Audit+		queue := audits.Queue++		audits.Worker.Loop.Pause()++		ul := planet.Uplinks[0]+		testData1 := testrand.Bytes(8 * memory.KiB)+		testData2 := testrand.Bytes(8 * memory.KiB)+		// upload to three nodes so there is definitely at least one node overlap between the two files+		rs := &uplink.RSConfig{+			MinThreshold:     1,+			RepairThreshold:  2,+			SuccessThreshold: 3,+			MaxThreshold:     3,+		}+		err := ul.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path1", testData1)+		require.NoError(t, err)++		err = ul.UploadWithConfig(ctx, satellite, rs, "testbucket", "test/path2", testData2)+		require.NoError(t, err)++		audits.Chore.Loop.TriggerWait()+		path1, err := queue.Next()+		require.NoError(t, err)+		path2, err := queue.Next()+		require.NoError(t, err)+		require.NotEqual(t, path1, path2)++		pointer1, err := satellite.Metainfo.Service.Get(ctx, path1)+		require.NoError(t, err)+		pointer2, err := satellite.Metainfo.Service.Get(ctx, path2)+		require.NoError(t, err)++		// find a node that contains a piece for both files+		// save that node ID and the piece number associated with it for pointer1+		var selectedNode storj.NodeID+		var selectedPieceNum int32+		p1Nodes := make(map[storj.NodeID]int32)+		for _, piece := range pointer1.GetRemote().GetRemotePieces() {+	
		p1Nodes[piece.NodeId] = piece.PieceNum+		}+		for _, piece := range pointer2.GetRemote().GetRemotePieces() {+			pieceNum, ok := p1Nodes[piece.NodeId]+			if ok {+				selectedNode = piece.NodeId+				selectedPieceNum = pieceNum+				break+			}+		}+		require.NotEqual(t, selectedNode, storj.NodeID{})

That's just to make sure that we ended up selecting a node, since it starts as storj.NodeID{}

mobyvb

comment created time in 7 days

push eventstorj/storj

Matt Robinson

commit sha ce89c44c0252db14164ec94ff1e5ce138d5e709c

Rebuild faster by removing weird make calls (#3502)

view details

Ethan Adams

commit sha 2eb0cc56fe246e6ee800e56b5efe5b74312084e4

satellite/gracefulexit: Check if node already has a piece in the pointer (#3434)

view details

Moby von Briesen

commit sha ca3365717c31710c2ed43a496da4d29206a1df50

Revert "update testplanet.Upload and testplanet.UploadWithConfig to use an expiration time of an hour from now" This reverts commit e9066151cf84afbff0929a6007e641711a56b6e5.

view details

Moby von Briesen

commit sha 1177ea90ba8ce2f84135fc6305b44433f8d6f805

do not count ExpirationDate=time.Time{} as expired

view details

Moby von Briesen

commit sha a5ccbc16beb80d1b9f7c9fbbe39bb86a573b9806

Merge branch 'master' of github.com:storj/storj into green/audit-expired

view details

push time in 7 days

Pull request review commentstorj/storj

satellite/gracefulexit: not allow disqualified node to graceful exit

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		endpoint.connections.delete(nodeID) 	}() +	// check if node is disqualified+	nodeInfo, err := endpoint.overlay.Get(ctx, nodeID)+	if err != nil {+		return rpcstatus.Error(rpcstatus.Internal, Error.Wrap(err).Error())+	}+	if nodeInfo.Disqualified != nil {+		return rpcstatus.Error(rpcstatus.PermissionDenied, "Only undisqualified node allowed for graceful exit")

should we go ahead and update these nodes to have completed + failed graceful exit? Otherwise couldn't they remain as "gracefully exiting" and never finish because of this error?
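Rough sketch of what that could look like before returning the error, reusing the fields already used elsewhere in this PR (exact placement is a suggestion):

exitStatusRequest := &overlay.ExitStatusRequest{
	NodeID:         nodeID,
	ExitFinishedAt: time.Now().UTC(),
	ExitSuccess:    false,
}
if _, err := endpoint.overlaydb.UpdateExitStatus(ctx, exitStatusRequest); err != nil {
	return rpcstatus.Error(rpcstatus.Internal, Error.Wrap(err).Error())
}
return rpcstatus.Error(rpcstatus.PermissionDenied, "Only undisqualified node allowed for graceful exit")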

VinozzZ

comment created time in 7 days

push eventstorj/storj

Vitalii Shpital

commit sha 466cc8ab0b5458baf5700e18f1115381c044a5a3

web/storagenode: text selection on specific elements disabled (#3492)

view details

Vitalii Shpital

commit sha 0e3f0eeb12e8e36a549f83e8f08323a3a0b147b0

web/satellite: project description enhanced (#3494)

view details

Moby von Briesen

commit sha 665481a2ac2f0ab9fb0932692809875b66b8d789

delete "main" reverify segment and return error if expired

view details

Moby von Briesen

commit sha c8b8465119cf1e66ecc6e3648a6985a577c83a0f

delete contained nodes and pointers when pointers to audit are expired

view details

Moby von Briesen

commit sha e9066151cf84afbff0929a6007e641711a56b6e5

update testplanet.Upload and testplanet.UploadWithConfig to use an expiration time of an hour from now

view details

Moby von Briesen

commit sha de079072d9235022b01a0e014b360d7c3ced79dc

Merge branch 'master' of github.com:storj/storj into green/audit-expired

view details

push time in 7 days

PR opened storj/storj

Reviewers
satellite/audit: do not audit expired segments Bug cla-signed

What:

  • When a segment is audited, if it has already expired, do not audit nodes for that segment and delete the segment.

Why: This case was not carried over during an audit refactor: https://github.com/storj/storj/pull/1559

Please describe the tests:

  • Test 1: Select a segment to call audit.Verify on. Update it to have an expired timestamp. Expect that audit.Verify returns an "expired" error, and deletes the segment from metainfo.
  • Test 2:

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [ ] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [ ] Does the PR describe what changes are being made?
  • [ ] Does the PR describe why the changes are being made?
  • [ ] Does the code follow our style guide?
  • [ ] Does the code follow our testing guide?
  • [ ] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [ ] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [ ] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [ ] Does any documentation need updating?
  • [ ] Do the database access patterns make sense?
+62 -0

0 comment

2 changed files

pr created time in 8 days

create branch storj/storj

branch : green/audit-expired

created branch time in 8 days

push event storj/storj

Maximillian von Briesen

commit sha 257d3946d5e287b7237e7b3b86ee8b7174d56f66

storagenode/gracefulexit: allow storagenodes to concurrently transfer pieces for graceful exit (#3478)

view details

push time in 8 days

delete branch storj/storj

delete branch : green/ge-sn-concurrency

delete time in 8 days

push event storj/storj

Michal Niewrzal

commit sha ab5c623ac73c595c00d18aaf4da0e06e63467f51

cli: should return non-zero code for error (#3469)

view details

Kaloyan Raev

commit sha d6b5d49ff1df6967f3c3678ec3ce45241b4c656b

Change "Ethereum wallet address" label to "STORJ payout address" (#3482)

view details

Kaloyan Raev

commit sha 19a59c9d59a4c36a602cd3ab2476facfdde1b339

Fix hyperlink's background color on Welcome dialog (#3483)

view details

Maximillian von Briesen

commit sha 78fedf5db3fbf3f10448a86213cb133fb02a9b57

satellite/gracefulexit: handle piece not found messages from storagenode (#3456) * If a node claims to fail a transfer due to piece not found, remove that node from the pointer, delete the transfer queue item. * If the pointer is piece hash verified, penalize the node. Otherwise, do not penalize the node.

view details

Maximillian von Briesen

commit sha 2b6170450c6b33ba057894224689cf72d84330e2

Merge branch 'master' into green/ge-sn-concurrency

view details

push time in 8 days

push event storj/storj

Maximillian von Briesen

commit sha 78fedf5db3fbf3f10448a86213cb133fb02a9b57

satellite/gracefulexit: handle piece not found messages from storagenode (#3456) * If a node claims to fail a transfer due to piece not found, remove that node from the pointer, delete the transfer queue item. * If the pointer is piece hash verified, penalize the node. Otherwise, do not penalize the node.

view details

push time in 8 days

delete branch storj/storj

delete branch : green/ge-piece-does-not-exist

delete time in 8 days

PR merged storj/storj

Reviewers
satellite/gracefulexit: handle piece not found messages from storagenode Request Code Review cla-signed

What:

  • If a node claims to fail a transfer due to piece not found, remove that node from the pointer, delete the transfer queue item.
  • If the pointer is piece hash verified, penalize the node. Otherwise, do not penalize the node.

Why: A node might not have ever received a piece that ended up in an old, non-piece-hash-verified pointer. We do not want to unfairly penalize nodes for these pieces. https://storjlabs.atlassian.net/browse/V3-3087
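
In sketch form, the penalty decision described here looks roughly like this (markTransferFailed is a hypothetical helper standing in for the real bookkeeping, not a function in the codebase):

// node reported NOT_FOUND for the piece: always remove it from the pointer
// and drop the transfer queue item, but only penalize the node when we are
// certain it should have had the piece
if pointer.PieceHashesVerified {
	if err := endpoint.markTransferFailed(ctx, nodeID, transfer); err != nil { // hypothetical helper
		return Error.Wrap(err)
	}
}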

Please describe the tests:

  • Test 1: TestFailureNotFoundPieceHashVerified - Initiate graceful exit for a node holding one piece. The node responds to a transfer request with a transfer failed message and a NOT_FOUND error. Expect that the node is no longer in the pointer, and that the node has been marked as failing to transfer a piece.
  • Test 2: TestFailureNotFoundPieceHashUnverified - Initiate graceful exit for a node holding one piece. Update the pointer to have PieceHashesVerified=false. The node responds to a transfer request with a transfer failed message and a NOT_FOUND error. Expect that the node is no longer in the pointer, and that the node has not been marked as failing to transfer a piece.

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [x] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [x] Does the PR describe what changes are being made?
  • [x] Does the PR describe why the changes are being made?
  • [x] Does the code follow our style guide?
  • [x] Does the code follow our testing guide?
  • [x] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [x] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [x] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [x] Does any documentation need updating?
  • [x] Do the database access patterns make sense?
+216 -1

0 comment

2 changed files

mobyvb

pr closed time in 8 days

push event storj/storj

Kaloyan Raev

commit sha 19a59c9d59a4c36a602cd3ab2476facfdde1b339

Fix hyperlink's background color on Welcome dialog (#3483)

view details

Maximillian von Briesen

commit sha 6535174c34a2ca575562a19fc53c7bb5ab79b602

Merge branch 'master' into green/ge-piece-does-not-exist

view details

push time in 8 days

push event storj/storj

Egon Elbre

commit sha 9c59efd33d85a2d93ee8a8391e951ea938bf481b

satellite/rewards: ensure that partner information is asked from a service (#3275)

view details

Yaroslav Vorobiov

commit sha 35edc2bcc3f33b10bd12e3f4ae24e20552276572

satellite/payments: invoice creation (#3468)

view details

Michal Niewrzal

commit sha ab5c623ac73c595c00d18aaf4da0e06e63467f51

cli: should return non-zero code for error (#3469)

view details

Kaloyan Raev

commit sha d6b5d49ff1df6967f3c3678ec3ce45241b4c656b

Change "Ethereum wallet address" label to "STORJ payout address" (#3482)

view details

Maximillian von Briesen

commit sha b5f03e11dfb2e0b681d088b77aeb42ea372b3c21

Merge branch 'master' into green/ge-piece-does-not-exist

view details

push time in 8 days

PR merged storj/storj

storagenode/gracefulexit: allow storagenodes to concurrently transfer pieces for graceful exit Request Code Review cla-signed

What: Add a limiter to allow a storagenode to process multiple transfer piece requests from a single satellite at once.

Why: Graceful exit will take too long for some nodes if limited to a single piece transfer at a time. https://storjlabs.atlassian.net/browse/V3-3086
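
A condensed sketch of the pattern (the limiter constructor name is an assumption; the Go/Wait usage matches the diff quoted further down):

// hand each transfer to a bounded worker pool instead of blocking the
// satellite stream on one piece at a time
worker.limiter = sync2.NewLimiter(numConcurrentTransfers) // constructor name assumed

case *pb.SatelliteMessage_TransferPiece:
	transferPieceMsg := msg.TransferPiece
	worker.limiter.Go(ctx, func() {
		if err := worker.transferPiece(ctx, transferPieceMsg, c); err != nil {
			worker.log.Error("failed to transfer piece", zap.Error(err))
		}
	})

// once the stream ends, wait for in-flight transfers before returning
worker.limiter.Wait()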

Please describe the tests:

  • Test 1:
  • Test 2:

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [x] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [x] Does the PR describe what changes are being made?
  • [x] Does the PR describe why the changes are being made?
  • [x] Does the code follow our style guide?
  • [x] Does the code follow our testing guide?
  • [x] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [x] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [x] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [x] Does any documentation need updating?
  • [x] Do the database access patterns make sense?
+55 -34

0 comment

5 changed files

mobyvb

pr closed time in 8 days

push event storj/storj

Egon Elbre

commit sha 9c59efd33d85a2d93ee8a8391e951ea938bf481b

satellite/rewards: ensure that partner information is asked from a service (#3275)

view details

Yaroslav Vorobiov

commit sha 35edc2bcc3f33b10bd12e3f4ae24e20552276572

satellite/payments: invoice creation (#3468)

view details

Maximillian von Briesen

commit sha c6b6743283fa0913c51881b2d4f5448d2a4dc73a

Merge branch 'master' into green/ge-sn-concurrency

view details

push time in 8 days

Pull request review comment storj/storj

storagenode/gracefulexit: allow storagenodes to concurrently transfer pieces for graceful exit

 func (worker *Worker) Run(ctx context.Context, done func()) (err error) { 		case *pb.SatelliteMessage_NotReady: 			break // wait until next worker execution 		case *pb.SatelliteMessage_TransferPiece:-			err = worker.transferPiece(ctx, msg.TransferPiece, c)-			if err != nil {-				continue-			}+			transferPieceMsg := msg.TransferPiece+			worker.limiter.Go(ctx, func() {+				err = worker.transferPiece(ctx, transferPieceMsg, c)+				if err != nil {+					worker.log.Error("failed to transfer piece.", zap.Stringer("Satellite ID", worker.satelliteID), zap.Error(errs.Wrap(err)))+				}+			}) 		case *pb.SatelliteMessage_DeletePiece: 			pieceID := msg.DeletePiece.OriginalPieceId 			err := worker.deleteOnePieceOrAll(ctx, &pieceID)

I added it so deletions don't cause a bottleneck

mobyvb

comment created time in 8 days

push event storj/storj

Moby von Briesen

commit sha 5a24037bdab832c1da3617e1e0cf607660a462f9

linter fix

view details

Moby von Briesen

commit sha 766362401ddb79458ff0ce37057cfffad583beff

add piece deletion to limiter

view details

push time in 8 days

Pull request review comment storj/storj

storagenode/gracefulexit: allow storagenodes to concurrently transfer pieces for graceful exit

 func (worker *Worker) Run(ctx context.Context, done func()) (err error) { 		case *pb.SatelliteMessage_NotReady: 			break // wait until next worker execution 		case *pb.SatelliteMessage_TransferPiece:-			err = worker.transferPiece(ctx, msg.TransferPiece, c)-			if err != nil {-				continue-			}+			transferPieceMsg := msg.TransferPiece+			worker.limiter.Go(ctx, func() {+				err = worker.transferPiece(ctx, transferPieceMsg, c)+				if err != nil {+					worker.log.Error("failed to transfer piece.", zap.Stringer("Satellite ID", worker.satelliteID), zap.Error(errs.Wrap(err)))+				}+			}) 		case *pb.SatelliteMessage_DeletePiece: 			pieceID := msg.DeletePiece.OriginalPieceId 			err := worker.deleteOnePieceOrAll(ctx, &pieceID)

My main reason was that since a deletion involves only local operations, it would be relatively quick compared to a piece transfer. But I don't think it would hurt to add it to the limiter. Do you think we should?
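
For reference, the follow-up commit "add piece deletion to limiter" in the push above suggests it ended up looking roughly like this (sketch, not the exact merged code):

case *pb.SatelliteMessage_DeletePiece:
	pieceID := msg.DeletePiece.OriginalPieceId
	worker.limiter.Go(ctx, func() {
		// deletions are local and fast, but running them through the same
		// limiter keeps them from competing with the message loop
		if err := worker.deleteOnePieceOrAll(ctx, &pieceID); err != nil {
			worker.log.Error("failed to delete piece", zap.Stringer("Satellite ID", worker.satelliteID), zap.Error(err))
		}
	})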

mobyvb

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: Check if node already has a piece in the pointer

 func TestUpdatePointerFailure_DuplicatedNodeID(t *testing.T) { 			// validate we get a new node to transfer too 			require.True(t, m.TransferPiece.OriginalPieceId == pieceID) 			require.True(t, m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId != firstRecNodeID)+			fmt.Printf("EEEE %v %v\n", m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId.String(), firstRecNodeID.String())

lol whoops was looking at a single commit. Ignore

ethanadams

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: Check if node already has a piece in the pointer

 func TestUpdatePointerFailure_DuplicatedNodeID(t *testing.T) { 			// validate we get a new node to transfer too 			require.True(t, m.TransferPiece.OriginalPieceId == pieceID) 			require.True(t, m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId != firstRecNodeID)+			fmt.Printf("EEEE %v %v\n", m.TransferPiece.AddressedOrderLimit.Limit.StorageNodeId.String(), firstRecNodeID.String())

Forgot to remove this print statement

ethanadams

comment created time in 8 days

Pull request review comment storj/storj

storagenode/gracefulexit: allow storagenodes to concurrently transfer pieces for graceful exit

 func (worker *Worker) Run(ctx context.Context, done func()) (err error) { 		case *pb.SatelliteMessage_NotReady: 			break // wait until next worker execution 		case *pb.SatelliteMessage_TransferPiece:-			err = worker.transferPiece(ctx, msg.TransferPiece, c)-			if err != nil {-				continue-			}+			transferPieceMsg := msg.TransferPiece+			worker.limiter.Go(ctx, func() {+				err = worker.transferPiece(ctx, transferPieceMsg, c)+				if err != nil {+					worker.log.Error("failed to transfer piece.", zap.Stringer("Satellite ID", worker.satelliteID), zap.Error(errs.Wrap(err)))

Maybe we should remove all logs from inside worker.transferPiece and just have a single log here

mobyvb

comment created time in 8 days

PR opened storj/storj

Reviewers
storagenode/gracefulexit: allow storagenodes to concurrently transfer pieces for graceful exit Request Code Review

What: Add a limiter to allow a storagenode to process multiple transfer piece requests from a single satellite at once.

Why: Graceful exit will take too long for some nodes if limited to a single piece transfer at a time. https://storjlabs.atlassian.net/browse/V3-3086

Please describe the tests:

  • Test 1:
  • Test 2:

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [ ] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [ ] Does the PR describe what changes are being made?
  • [ ] Does the PR describe why the changes are being made?
  • [ ] Does the code follow our style guide?
  • [ ] Does the code follow our testing guide?
  • [ ] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [ ] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [ ] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [ ] Does any documentation need updating?
  • [ ] Do the database access patterns make sense?
+46 -29

0 comment

5 changed files

pr created time in 8 days

create branch storj/storj

branch : green/ge-sn-concurrency

created branch time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb  	return nil }++func (endpoint *Endpoint) checkInitialState(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {+	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)+	if err != nil {+		return nil, Error.Wrap(err)+	}++	if exitStatus.ExitFinishedAt != nil {+		// TODO maybe we should store the reason in the DB so we know how it originally failed.+		return endpoint.getFinishedMessage(ctx, endpoint.signer, nodeID, *exitStatus.ExitFinishedAt, exitStatus.ExitSuccess, -1)+	}++	if exitStatus.ExitInitiatedAt == nil {+		request := &overlay.ExitStatusRequest{NodeID: nodeID, ExitInitiatedAt: time.Now().UTC()}+		node, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)+		if err != nil {+			return nil, Error.Wrap(err)+		}+		err = endpoint.db.IncrementProgress(ctx, nodeID, 0, 0, 0)+		if err != nil {+			return nil, Error.Wrap(err)+		}++		// graceful exit initiation metrics+		age := time.Now().UTC().Sub(node.CreatedAt.UTC())+		mon.FloatVal("graceful_exit_init_node_age_seconds").Observe(age.Seconds())+		mon.IntVal("graceful_exit_init_node_audit_success_count").Observe(node.Reputation.AuditSuccessCount)+		mon.IntVal("graceful_exit_init_node_audit_total_count").Observe(node.Reputation.AuditCount)+		mon.IntVal("graceful_exit_init_node_piece_count").Observe(node.PieceCount)++		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	if exitStatus.ExitLoopCompletedAt == nil {+		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	return nil, nil+}++func (endpoint *Endpoint) generateExitStatusRequest(ctx context.Context, progress *Progress) (*overlay.ExitStatusRequest, error) {+	processed := progress.PiecesFailed + progress.PiecesTransferred++	if processed > 0 {+		mon.IntVal("graceful_exit_successful_pieces_transfer_ratio").Observe(progress.PiecesTransferred / processed)+	}++	exitStatusRequest := &overlay.ExitStatusRequest{+		NodeID:         progress.NodeID,+		ExitFinishedAt: time.Now().UTC(),+	}+	// check node's exiting progress to see if it has failed passed max failure threshold+	if processed > 0 && float64(progress.PiecesFailed)/float64(processed)*100 >= float64(endpoint.config.OverallMaxFailuresPercentage) {+		exitStatusRequest.ExitSuccess = false+	} else {+		exitStatusRequest.ExitSuccess = true+	}++	if exitStatusRequest.ExitSuccess {+		mon.Meter("graceful_exit_success").Mark(1)+	} else {+		mon.Meter("graceful_exit_fail_max_failures_percentage").Mark(1)+	}++	return exitStatusRequest, nil++}++func (endpoint *Endpoint) validatePointer(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) (*pb.RemotePiece, error) {+	remote := pointer.GetRemote()

What if remote == nil? Should the caller be responsible for ensuring that it is a remote pointer or is that a responsibility of validatePointer?
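
If validatePointer is to own that responsibility, the guard could be as small as this (sketch only):

remote := pointer.GetRemote()
if remote == nil {
	// not a remote pointer; nothing for graceful exit to transfer
	return nil, Error.New("pointer for path %q is not a remote pointer", string(incomplete.Path))
}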

VinozzZ

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb  	return nil }++func (endpoint *Endpoint) checkInitialState(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {+	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)+	if err != nil {+		return nil, Error.Wrap(err)+	}++	if exitStatus.ExitFinishedAt != nil {+		// TODO maybe we should store the reason in the DB so we know how it originally failed.+		return endpoint.getFinishedMessage(ctx, endpoint.signer, nodeID, *exitStatus.ExitFinishedAt, exitStatus.ExitSuccess, -1)+	}++	if exitStatus.ExitInitiatedAt == nil {+		request := &overlay.ExitStatusRequest{NodeID: nodeID, ExitInitiatedAt: time.Now().UTC()}+		node, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)+		if err != nil {+			return nil, Error.Wrap(err)+		}+		err = endpoint.db.IncrementProgress(ctx, nodeID, 0, 0, 0)+		if err != nil {+			return nil, Error.Wrap(err)+		}++		// graceful exit initiation metrics+		age := time.Now().UTC().Sub(node.CreatedAt.UTC())+		mon.FloatVal("graceful_exit_init_node_age_seconds").Observe(age.Seconds())+		mon.IntVal("graceful_exit_init_node_audit_success_count").Observe(node.Reputation.AuditSuccessCount)+		mon.IntVal("graceful_exit_init_node_audit_total_count").Observe(node.Reputation.AuditCount)+		mon.IntVal("graceful_exit_init_node_piece_count").Observe(node.PieceCount)++		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	if exitStatus.ExitLoopCompletedAt == nil {+		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	return nil, nil+}++func (endpoint *Endpoint) generateExitStatusRequest(ctx context.Context, progress *Progress) (*overlay.ExitStatusRequest, error) {+	processed := progress.PiecesFailed + progress.PiecesTransferred++	if processed > 0 {+		mon.IntVal("graceful_exit_successful_pieces_transfer_ratio").Observe(progress.PiecesTransferred / processed)+	}++	exitStatusRequest := &overlay.ExitStatusRequest{+		NodeID:         progress.NodeID,+		ExitFinishedAt: time.Now().UTC(),+	}+	// check node's exiting progress to see if it has failed passed max failure threshold+	if processed > 0 && float64(progress.PiecesFailed)/float64(processed)*100 >= float64(endpoint.config.OverallMaxFailuresPercentage) {+		exitStatusRequest.ExitSuccess = false+	} else {+		exitStatusRequest.ExitSuccess = true+	}++	if exitStatusRequest.ExitSuccess {+		mon.Meter("graceful_exit_success").Mark(1)+	} else {+		mon.Meter("graceful_exit_fail_max_failures_percentage").Mark(1)+	}++	return exitStatusRequest, nil++}++func (endpoint *Endpoint) validatePointer(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) (*pb.RemotePiece, error) {+	remote := pointer.GetRemote()+	nodeID := incomplete.NodeID++	pieces := remote.GetRemotePieces()+	var nodePiece *pb.RemotePiece+	for _, piece := range pieces {+		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {+			nodePiece = piece+		}+	}++	if nodePiece == nil {+		endpoint.log.Debug("piece no longer held by node", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))++		err := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+		if err != nil {+			return nil, Error.Wrap(err)+		}++	}+	return nodePiece, nil+}++func (endpoint *Endpoint) generateFindNodesRequest(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) 
(*overlay.FindStorageNodesRequest, error) {

I think this name is misleading since this function also handles the logic of removing a piece from the pointer and transfer queue if the redundancy is high enough. I think either the name should be changed or the logic related to the find nodes request should be separated from the other logic here.
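
One possible split, sketched at the call site with hypothetical names (handleOverReplicatedPointer and buildFindNodesRequest are illustrative, not existing functions):

// cleanup of over-replicated pointers and building the overlay request
// become separate steps
handled, err := endpoint.handleOverReplicatedPointer(ctx, pointer, incomplete)
if err != nil {
	return nil, Error.Wrap(err)
}
if handled {
	return nil, nil // node was removed from the pointer; nothing to transfer
}
request, err := endpoint.buildFindNodesRequest(ctx, pointer, incomplete)
if err != nil {
	return nil, Error.Wrap(err)
}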

VinozzZ

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) updatePointer(ctx context.Context, originalPointer *pb  	return nil }++func (endpoint *Endpoint) checkInitialState(ctx context.Context, nodeID storj.NodeID) (*pb.SatelliteMessage, error) {+	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)+	if err != nil {+		return nil, Error.Wrap(err)+	}++	if exitStatus.ExitFinishedAt != nil {+		// TODO maybe we should store the reason in the DB so we know how it originally failed.+		return endpoint.getFinishedMessage(ctx, endpoint.signer, nodeID, *exitStatus.ExitFinishedAt, exitStatus.ExitSuccess, -1)+	}++	if exitStatus.ExitInitiatedAt == nil {+		request := &overlay.ExitStatusRequest{NodeID: nodeID, ExitInitiatedAt: time.Now().UTC()}+		node, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)+		if err != nil {+			return nil, Error.Wrap(err)+		}+		err = endpoint.db.IncrementProgress(ctx, nodeID, 0, 0, 0)+		if err != nil {+			return nil, Error.Wrap(err)+		}++		// graceful exit initiation metrics+		age := time.Now().UTC().Sub(node.CreatedAt.UTC())+		mon.FloatVal("graceful_exit_init_node_age_seconds").Observe(age.Seconds())+		mon.IntVal("graceful_exit_init_node_audit_success_count").Observe(node.Reputation.AuditSuccessCount)+		mon.IntVal("graceful_exit_init_node_audit_total_count").Observe(node.Reputation.AuditCount)+		mon.IntVal("graceful_exit_init_node_piece_count").Observe(node.PieceCount)++		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	if exitStatus.ExitLoopCompletedAt == nil {+		return &pb.SatelliteMessage{Message: &pb.SatelliteMessage_NotReady{NotReady: &pb.NotReady{}}}, nil+	}++	return nil, nil+}++func (endpoint *Endpoint) generateExitStatusRequest(ctx context.Context, progress *Progress) (*overlay.ExitStatusRequest, error) {+	processed := progress.PiecesFailed + progress.PiecesTransferred++	if processed > 0 {+		mon.IntVal("graceful_exit_successful_pieces_transfer_ratio").Observe(progress.PiecesTransferred / processed)+	}++	exitStatusRequest := &overlay.ExitStatusRequest{+		NodeID:         progress.NodeID,+		ExitFinishedAt: time.Now().UTC(),+	}+	// check node's exiting progress to see if it has failed passed max failure threshold+	if processed > 0 && float64(progress.PiecesFailed)/float64(processed)*100 >= float64(endpoint.config.OverallMaxFailuresPercentage) {+		exitStatusRequest.ExitSuccess = false+	} else {+		exitStatusRequest.ExitSuccess = true+	}++	if exitStatusRequest.ExitSuccess {+		mon.Meter("graceful_exit_success").Mark(1)+	} else {+		mon.Meter("graceful_exit_fail_max_failures_percentage").Mark(1)+	}++	return exitStatusRequest, nil++}++func (endpoint *Endpoint) validatePointer(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) (*pb.RemotePiece, error) {+	remote := pointer.GetRemote()+	nodeID := incomplete.NodeID++	pieces := remote.GetRemotePieces()+	var nodePiece *pb.RemotePiece+	for _, piece := range pieces {+		if piece.NodeId == nodeID && piece.PieceNum == incomplete.PieceNum {+			nodePiece = piece+		}+	}++	if nodePiece == nil {+		endpoint.log.Debug("piece no longer held by node", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))++		err := endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+		if err != nil {+			return nil, Error.Wrap(err)+		}++	}+	return nodePiece, nil+}++func (endpoint *Endpoint) generateFindNodesRequest(ctx context.Context, pointer *pb.Pointer, incomplete *TransferQueueItem) 
(*overlay.FindStorageNodesRequest, error) {+	// validate pointer state+	nodePiece, err := endpoint.validatePointer(ctx, pointer, incomplete)+	if err != nil {+		return nil, Error.Wrap(err)+	}++	redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())+	if err != nil {+		return nil, Error.Wrap(err)+	}++	nodeID := incomplete.NodeID+	pieces := pointer.GetRemote().GetRemotePieces()+	if len(pieces) > redundancy.OptimalThreshold() {+		endpoint.log.Debug("pointer has more pieces than required. removing node from pointer.", zap.Stringer("node ID", nodeID), zap.ByteString("path", incomplete.Path), zap.Int32("piece num", incomplete.PieceNum))++		_, err = endpoint.metainfo.UpdatePieces(ctx, string(incomplete.Path), pointer, nil, []*pb.RemotePiece{nodePiece})+		if err != nil {+			return nil, Error.Wrap(err)+		}++		err = endpoint.db.DeleteTransferQueueItem(ctx, nodeID, incomplete.Path, incomplete.PieceNum)+		if err != nil {+			return nil, Error.Wrap(err)+		}++		return nil, nil+	}++	// calculate piece size+	pieceSize := eestream.CalcPieceSize(pointer.GetSegmentSize(), redundancy)++	// populate excluded node IDs+	excludedNodeIDs := make([]storj.NodeID, len(pieces))+	for i, piece := range pieces {+		excludedNodeIDs[i] = piece.NodeId+	}++	return &overlay.FindStorageNodesRequest{+		RequestedCount: 1,+		FreeBandwidth:  pieceSize,+		FreeDisk:       pieceSize,+		ExcludedNodes:  excludedNodeIDs,+	}, nil+}++func (endpoint *Endpoint) verifyPieceTransferred(ctx context.Context, message *pb.StorageNodeMessage_Succeeded, transfer *pendingTransfer) error {

Maybe we can move this and validatePointer (and any other similar functions) to a file called validation.go. Similar to the metainfo package.

VinozzZ

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		endpoint.connections.delete(nodeID) 	}() -	eofHandler := func(err error) error {-		if err == io.EOF {-			endpoint.log.Debug("received EOF when trying to receive messages from storage node", zap.Stringer("node ID", nodeID))-			return nil-		}-		if err != nil {-			return rpcstatus.Error(rpcstatus.Unknown, Error.Wrap(err).Error())-		}-		return nil-	}--	exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)+	// check initial state+	msg, err := endpoint.checkInitialState(ctx, nodeID)

In my opinion, it would be better to keep some of the "initial state" control logic in this main function, and just pull out the code for each state. e.g.

exitStatus, err := endpoint.overlaydb.GetExitStatus(ctx, nodeID)
..
if exitStatus.ExitFinishedAt != nil {
    return endpoint.handleExitAlreadyFinished(stream...)
}
if exitStatus.ExitInitiatedAt == nil {
    return endpoint.handleInitiateExit(stream...)
}
if exitStatus.ExitLoopCompletedAt == nil {
    return endpoint.handleExitLoopIncomplete(stream...)
}

Just an idea, but I think it'd be nice to keep as much of the control-flow logic as possible in doProcess without making it too bulky.

VinozzZ

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		}()  		timer := time.NewTimer(endpoint.recvTimeout)++		eofHandler := func(err error) error {

The primary purpose of this PR is to move stuff and not to change the logic, so I would understand if we just keep it for now.

VinozzZ

comment created time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: separate functional code

 func (endpoint *Endpoint) doProcess(stream processStream) (err error) { 		}()  		timer := time.NewTimer(endpoint.recvTimeout)++		eofHandler := func(err error) error {

Instead of moving this handler function, maybe we could just do the if err == io.EOF stuff from inside the Process function. What do you think?
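
That could look roughly like this at the receive site (sketch; it mirrors the eofHandler body quoted in the other comment rather than introducing new behavior):

if err != nil {
	if err == io.EOF {
		// the node hung up; treat it as a clean end of the stream
		endpoint.log.Debug("received EOF when trying to receive messages from storage node", zap.Stringer("node ID", nodeID))
		return nil
	}
	return rpcstatus.Error(rpcstatus.Unknown, Error.Wrap(err).Error())
}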

VinozzZ

comment created time in 8 days

push event storj/storj

Bryan White

commit sha f6a4155c469ae727c347346d7b409fc0a33c9373

certificates: move db test to separate file (#3439)

view details

Maximillian von Briesen

commit sha 1136dc9d86e4981c25c6007766da89d85af673c2

Merge branch 'master' into green/ge-piece-does-not-exist

view details

push time in 8 days

push event storj/storj

Bryan White

commit sha f6a4155c469ae727c347346d7b409fc0a33c9373

certificates: move db test to separate file (#3439)

view details

push time in 8 days

PR merged storj/storj

Reviewers
certificates: move db test to separate file Code Review In-Progress Reviewer Can Merge cla-signed

What:

  • Move DB related tests to db_test.go instead of authorizations_test.go
  • Rename MaxClockOffset to MaxClockSkew as per @egonelbre's suggestion

Why: Better organization.

Please describe the tests: N/A

Please describe the performance impact: N/A

Code Review Checklist (to be filled out by reviewer)

  • [x] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [x] Does the PR describe what changes are being made?
  • [x] Does the PR describe why the changes are being made?
  • [x] Does the code follow our style guide?
  • [x] Does the code follow our testing guide?
  • [x] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [x] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [x] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [x] Does any documentation need updating?
  • [x] Do the database access patterns make sense?
+418 -402

0 comment

5 changed files

bryanchriswhite

pr closed time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: Check if node already has a piece in the pointer

 func TestUpdatePointerFailure_DuplicatedNodeID(t *testing.T) { 		count, ok := pieceMap[exitingNodeID] 		require.True(t, ok) 		require.Equal(t, 1, count)-		count, ok = pieceMap[recNodeID]+		count, ok = pieceMap[firstRecNodeID] 		require.True(t, ok) 		require.Equal(t, 1, count) 	}) } -func testTransfers(t *testing.T, objects int, verifier func(ctx *testcontext.Context, nodeFullIDs map[storj.NodeID]*identity.FullIdentity, satellite *testplanet.SatelliteSystem, processClient exitProcessClient, exitingNode *storagenode.Peer, numPieces int)) {-	successThreshold := 4

nit - it seems weird to me for success threshold to be passed in when the min/repair thresholds used in the RSConfig to upload are kind of dependent on the value of the success threshold. I don't really have an alternative solution at the moment, but thought I'd mention that it gave me pause. I am okay approving with this change.

ethanadams

comment created time in 8 days

push event storj/storj

Matt Robinson

commit sha e8f9a182da80e5459b1861fb05c0bf852edf3089

Bump go version to 1.13.4 (#3450)

view details

Brandon Iglesias

commit sha 281b8b6967aa6a052749fc20cd64e7aeff5e94a2

update to the CLA file adding Rafael Gomes (#3474) update to the CLA file adding Rafael Gomes

view details

Jess G

commit sha 5abb91afcfee913709aac5fda6d51f4c4994ce2e

satellite: change the Peer name to Core (#3472) * change satellite.Peer name to Core * change to Core in testplanet * missed a few places * keep shared stuff in peer.go to stay consistent with storj/docs

view details

Maximillian von Briesen

commit sha fe85853eb0acaba4d1f9c92fe4ce647c04b9c779

Merge branch 'master' into green/ge-piece-does-not-exist

view details

push time in 8 days

push event storj/storj

Yehor Butko

commit sha 761cec5ea3fadf342dadb35e94d69c013f53ecae

satellite/payments: archview comments updated (#3464)

view details

Jennifer Li Johnson

commit sha 0621830f0543dc6f4d7484e18f022dcb3b8e895a

cmd/storj-sim: allow user to designate which redis db they want to start at (#3458)

view details

Yehor Butko

commit sha 0c2e498f09a8217dfbff256c4b09e2671a39d2fe

satellite/satellitedb: console tables archview comments updated (#3465)

view details

Isaac Hess

commit sha 4d26d0a6a67f1d7a5ad15261977fd9ed7e26e8d0

storagenode/pieces: Add migration from v0 piece to v1 piece (#3401)

view details

Matt Robinson

commit sha e8f9a182da80e5459b1861fb05c0bf852edf3089

Bump go version to 1.13.4 (#3450)

view details

Brandon Iglesias

commit sha 281b8b6967aa6a052749fc20cd64e7aeff5e94a2

update to the CLA file adding Rafael Gomes (#3474) update to the CLA file adding Rafael Gomes

view details

Jess G

commit sha 5abb91afcfee913709aac5fda6d51f4c4994ce2e

satellite: change the Peer name to Core (#3472) * change satellite.Peer name to Core * change to Core in testplanet * missed a few places * keep shared stuff in peer.go to stay consistent with storj/docs

view details

Maximillian von Briesen

commit sha 7a406f5dfc95a508e0ac0d56d169b49a652ba0b0

Merge branch 'master' into bryan/CA-test-reorg

view details

push time in 8 days

push event storj/storj

Egon Elbre

commit sha be2dd1ca728d50846f7b6dd9ee71964262a29e2b

cmd/storj-sim: add --redis flag (#3451)

view details

Maximillian von Briesen

commit sha f9df0ea591d038663a0aaabb42718c11a0223f9c

satellite/gracefulexit: check for unknown error in graceful exit disable test Allow error in graceful exit disable test to be rpcstatus.Unimplemented (grpc) or rpcstatus.Unknown (drpc)

view details

JT Olio

commit sha 41c0093e5b72a88c0642bc72fc2f16a5ae1f6b33

drpc: enable by default (#3452)

view details

Jeff Wendling

commit sha 17e9044c0f3ef30b130bdbd59e34fe44820896fa

pkg/rpc/rpcpeer: check both drpc and grpc for peers on a context we don't know if an incoming connection is from drpc or grpc during the migration time, so check both. Change-Id: I2418dde8b651dcc4a23726057178465224a48103

view details

Jeff Wendling

commit sha d820b5abf0faefcd618270fef66c06f50d6c44a3

drpc: bump version Change-Id: I4c368a0200ea07dc4e1b1d14a750e64377e513dc

view details

Jeff Wendling

commit sha abd27d496cace827f49a7919a932a22c2b6e1524

satellite: make outgoing connections with grpc (#3457)

view details

Jess G

commit sha 8d92c288e294c7c2a9490297c8aa212ed1eb335e

satellitedb: separate migration into subcommand (#3436) * separate sadb migration, add version check * update checkversion to do same validation as migration * changes per CR * add sa migration to storj-sim * add different debug port in storj-sim for migration * add wait for exit for storj-sim migration * update sa docker entrypoint to support migration * storj-sim satellite parts all wait for migration * upgrade golang-migrate/migrate to v4 because bug * fix go mod tidy

view details

Kaloyan Raev

commit sha 857c7f3cd7d7f59e04684f3b36732c681d919cbd

storagenode/updater: disable self-autoupdate (#3459)

view details

Kaloyan Raev

commit sha 84fea5820f38cd7cdd2b03319c71de0bd25b147f

Fix error when storagenode-updater service is stopped (#3440)

view details

Bryan White

commit sha 3a842bf53fe236a7839dddb390c6865bcada1eb2

change MaxClockOffset (renamed) to 15 min and use duration type (#3438)

view details

Yehor Butko

commit sha 5cb46d2ce3f6d7f0747986ea5f606a9b069e2d0d

satellite/payments: mock payment service created, api calls from frontend returned (#3448)

view details

Yehor Butko

commit sha 761cec5ea3fadf342dadb35e94d69c013f53ecae

satellite/payments: archview comments updated (#3464)

view details

Jennifer Li Johnson

commit sha 0621830f0543dc6f4d7484e18f022dcb3b8e895a

cmd/storj-sim: allow user to designate which redis db they want to start at (#3458)

view details

Yehor Butko

commit sha 0c2e498f09a8217dfbff256c4b09e2671a39d2fe

satellite/satellitedb: console tables archview comments updated (#3465)

view details

Moby von Briesen

commit sha 3da134532401d9e5507dead1e0cd6a32ba04daf9

Merge branch 'master' of github.com:storj/storj into green/ge-piece-does-not-exist

view details

Isaac Hess

commit sha 4d26d0a6a67f1d7a5ad15261977fd9ed7e26e8d0

storagenode/pieces: Add migration from v0 piece to v1 piece (#3401)

view details

Moby von Briesen

commit sha c41488286f9ee13ee17286922244ee9e98132005

Merge branch 'master' of github.com:storj/storj into green/ge-piece-does-not-exist

view details

Moby von Briesen

commit sha 624783bbb6389bc3bebdcb414f39f952e4ec8bea

delete tx queue items and pending map if node not in pointer

view details

push time in 8 days

Pull request review comment storj/storj

satellite/gracefulexit: handle piece not found messages from storagenode

 func (endpoint *Endpoint) handleFailed(ctx context.Context, pending *pendingMap,  	errorCode := int(pb.TransferFailed_Error_value[message.Failed.Error.String()]) -	// TODO if error code is NOT_FOUND, the node no longer has the piece. remove the queue item and the pointer+	// If the error code is NOT_FOUND, the node no longer has the piece.+	// Remove the queue item and remove the node from the pointer.+	// If the pointer is not piece hash verified, do not count this as a failure.+	if pb.TransferFailed_Error(errorCode) == pb.TransferFailed_NOT_FOUND {+		endpoint.log.Debug("piece not found on node", zap.Stringer("node ID", nodeID), zap.ByteString("path", transfer.path), zap.Int32("piece num", transfer.pieceNum))+		pointer, err := endpoint.metainfo.Get(ctx, string(transfer.path))+		if err != nil {+			return Error.Wrap(err)+		}+		remote := pointer.GetRemote()+		if remote == nil {+			return nil+		}+		pieces := remote.GetRemotePieces()++		var nodePiece *pb.RemotePiece+		for _, piece := range pieces {+			if piece.NodeId == nodeID && piece.PieceNum == transfer.pieceNum {+				nodePiece = piece+			}+		}+		if nodePiece == nil {+			return nil+		}++		_, err = endpoint.metainfo.UpdatePieces(ctx, string(transfer.path), pointer, nil, []*pb.RemotePiece{nodePiece})+		if err != nil {+			return Error.Wrap(err)+		}++		// If the pointer was piece hash verified, we know this node definitely should have the piece+		// Otherwise, no penalty.+		if pointer.PieceHashesVerified {

Shouldn't we do that as part of V3-3088? @ethanadams @VinozzZ

mobyvb

comment created time in 9 days

delete branch storj/storj

delete branch : green/ge-fix-test-disable

delete time in 11 days

push event storj/storj

Maximillian von Briesen

commit sha f9df0ea591d038663a0aaabb42718c11a0223f9c

satellite/gracefulexit: check for unknown error in graceful exit disable test Allow error in graceful exit disable test to be rpcstatus.Unimplemented (grpc) or rpcstatus.Unknown (drpc)

view details

push time in 11 days

PR merged storj/storj

Reviewers
satellite/gracefulexit: check for unknown error in graceful exit disable test Request Code Review Reviewer Can Merge cla-signed

What: Allow error in graceful exit disable test to be rpcstatus.Unknown or rpcstatus.Unimplemented

Why: grpc returns Unimplemented and drpc returns Unknown
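
The resulting assertion is small; a sketch of the permissive check (the merged test may differ in detail):

response, err := processClient.Recv()
require.Error(t, err)
// grpc surfaces the disabled endpoint as Unimplemented, drpc as Unknown
require.True(t, errs2.IsRPC(err, rpcstatus.Unimplemented) || errs2.IsRPC(err, rpcstatus.Unknown))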

Please describe the tests:

  • Test 1:
  • Test 2:

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [x] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [x] Does the PR describe what changes are being made?
  • [x] Does the PR describe why the changes are being made?
  • [x] Does the code follow our style guide?
  • [x] Does the code follow our testing guide?
  • [x] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [x] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [x] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [x] Does any documentation need updating?
  • [x] Do the database access patterns make sense?
+4 -1

0 comment

1 changed file

mobyvb

pr closed time in 11 days

push event storj/storj

Moby von Briesen

commit sha 180ca11e3df13e37e57d0a8ec0111c879fef9963

check that nodes were deleted from pointer in tests

view details

push time in 11 days

PR opened storj/storj

satellite/gracefulexit: handle piece not found messages from storagenode

What:

  • If a node claims to fail a transfer due to piece not found, remove that node from the pointer, delete the transfer queue item.
  • If the pointer is piece hash verified, penalize the node. Otherwise, do not penalize the node.

Why: A node might not have ever received a piece that ended up in an old, non-piece-hash-verified pointer. We do not want to unfairly penalize nodes for these pieces. https://storjlabs.atlassian.net/browse/V3-3087

Please describe the tests:

  • Test 1: TestFailureNotFoundPieceHashVerified - Initiate graceful exit for a node holding one piece. The node responds to a transfer request with a transfer failed message and a NOT_FOUND error. Expect that the node is no longer in the pointer, and that the node has been marked as failing to transfer a piece.
  • Test 2: TestFailureNotFoundPieceHashUnverified - Initiate graceful exit for a node holding one piece. Update the pointer to have PieceHashesVerified=false. The node responds to a transfer request with a transfer failed message and a NOT_FOUND error. Expect that the node is no longer in the pointer, and that the node has not been marked as failing to transfer a piece.

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [ ] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [ ] Does the PR describe what changes are being made?
  • [ ] Does the PR describe why the changes are being made?
  • [ ] Does the code follow our style guide?
  • [ ] Does the code follow our testing guide?
  • [ ] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [ ] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [ ] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [ ] Does any documentation need updating?
  • [ ] Do the database access patterns make sense?
+168 -1

0 comment

2 changed files

pr created time in 11 days

push event storj/storj

Jennifer Li Johnson

commit sha 76b64b79ba7ed1086922acbb241217c3f9c24770

cmd/identity: allow using redis for RevocationDB (#3259)

view details

Egon Elbre

commit sha be2dd1ca728d50846f7b6dd9ee71964262a29e2b

cmd/storj-sim: add --redis flag (#3451)

view details

Maximillian von Briesen

commit sha a8fd66ee796b0c1298fa2121a98cec8959afe651

Merge branch 'master' into green/ge-fix-test-disable

view details

push time in 11 days

Pull request review comment storj/storj

satellite/gracefulexit: make disabled test more permissive

 func TestExitDisabled(t *testing.T) {  		// Process endpoint should return immediately if GE is disabled 		response, err := processClient.Recv()-		require.True(t, errs2.IsRPC(err, rpcstatus.Unimplemented))+		require.Error(t, err)

This is the branch I made earlier today. I wouldn't mind approving this PR, but I'd prefer the changes in the other one: https://github.com/storj/storj/pull/3455

zeebo

comment created time in 11 days

Pull request review comment storj/storj

satellite/gracefulexit: make disabled test more permissive

 func TestExitDisabled(t *testing.T) {  		// Process endpoint should return immediately if GE is disabled 		response, err := processClient.Recv()-		require.True(t, errs2.IsRPC(err, rpcstatus.Unimplemented))+		require.Error(t, err)

Only 2 out of the ~30 places where rpc errors are returned in the graceful exit endpoint are Unknown

zeebo

comment created time in 11 days

PR opened storj/storj

satellite/gracefulexit: check for unknown error in graceful exit disable test

What: Allow error in graceful exit disable test to be rpcstatus.Unknown or rpcstatus.Unimplemented

Why: grpc returns Unimplemented and drpc returns Unknown

Please describe the tests:

  • Test 1:
  • Test 2:

Please describe the performance impact:

Code Review Checklist (to be filled out by reviewer)

  • [ ] NEW: Are there any Satellite database migrations? Are they forwards and backwards compatible?
  • [ ] Does the PR describe what changes are being made?
  • [ ] Does the PR describe why the changes are being made?
  • [ ] Does the code follow our style guide?
  • [ ] Does the code follow our testing guide?
  • [ ] Is the PR appropriately sized? (If it could be broken into smaller PRs it should be)
  • [ ] Does the new code have enough tests? (every PR should have tests or justification otherwise. Bug-fix PRs especially)
  • [ ] Does the new code have enough documentation that answers "how do I use it?" and "what does it do?"? (both source documentation and higher level, diagrams?)
  • [ ] Does any documentation need updating?
  • [ ] Do the database access patterns make sense?
+4 -1

0 comment

1 changed file

pr created time in 11 days

Pull request review comment storj/storj

satellite/gracefulexit: make disabled test more permissive

 func TestExitDisabled(t *testing.T) {  		// Process endpoint should return immediately if GE is disabled 		response, err := processClient.Recv()-		require.True(t, errs2.IsRPC(err, rpcstatus.Unimplemented))+		require.Error(t, err)

Because then there is no guarantee that it is an error related to the endpoint being unimplemented on the server side as opposed to some other arbitrary graceful exit error.

zeebo

comment created time in 11 days
