mirror of https://github.com/facebookresearch/sam2.git

open README.md with unicode (to support Hugging Face emoji); fix various typos (#218)
(close #217, #66, #67, #69, #91, #126, #127, #145)

parent: 0db838b117
commit: 7e1596c0b6
@@ -16,7 +16,7 @@ from torch import nn
 class PositionEmbeddingSine(nn.Module):
     """
     This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
+    used by the Attention Is All You Need paper, generalized to work on images.
     """

     def __init__(
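As background for the docstring above: a minimal sketch of the 1-D sinusoidal embedding from Attention Is All You Need, which PositionEmbeddingSine generalizes to 2-D image coordinates. The function below is illustrative (our own name and shapes), not the module's actual code.

import torch

def sine_position_embedding_1d(num_pos: int, dim: int) -> torch.Tensor:
    # position indices (L, 1) against per-channel-pair frequencies (dim/2,)
    pos = torch.arange(num_pos, dtype=torch.float32).unsqueeze(1)
    freqs = torch.exp(
        torch.arange(0, dim, 2, dtype=torch.float32)
        * (-torch.log(torch.tensor(10000.0)) / dim)
    )
    emb = torch.zeros(num_pos, dim)
    emb[:, 0::2] = torch.sin(pos * freqs)  # even channels: sine
    emb[:, 1::2] = torch.cos(pos * freqs)  # odd channels: cosine
    return emb  # (num_pos, dim)

print(sine_position_embedding_1d(8, 16).shape)  # torch.Size([8, 16])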
@@ -642,7 +642,7 @@ class SAM2Base(torch.nn.Module):
                 pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
                 return pix_feat_with_mem

-            # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder)
+            # Use a dummy token on the first frame (to avoid empty memory input to tranformer encoder)
             to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
             to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]

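On the unchanged lines above: `expand(1, B, self.mem_dim)` broadcasts a single learned placeholder across the batch without copying. A tiny self-contained sketch of that mechanic, with shapes assumed for illustration:

import torch

B, mem_dim = 4, 64
# a learned no-memory placeholder, assumed shaped (1, 1, mem_dim)
no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, mem_dim))

# expand() returns a broadcast view, not a copy: all B entries share storage
dummy_memory = no_mem_embed.expand(1, B, mem_dim)
print(dummy_memory.shape)                                   # torch.Size([1, 4, 64])
print(dummy_memory.data_ptr() == no_mem_embed.data_ptr())   # True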
@@ -183,7 +183,7 @@ class SAM2ImagePredictor:
         normalize_coords=True,
     ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
         """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
-        It returns a tupele of lists of masks, ious, and low_res_masks_logits.
+        It returns a tuple of lists of masks, ious, and low_res_masks_logits.
         """
         assert self._is_batch, "This function should only be used when in batched mode"
         if not self._is_image_set:
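To make the docstring's return structure concrete, here is a stand-in with the same output shape: a tuple of three parallel lists, one entry per input image. All names and array shapes below are invented for illustration.

import numpy as np
from typing import List, Tuple

def fake_predict_batch(
    n_images: int,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    # one (masks, ious, low_res_logits) entry per image; shapes illustrative
    masks = [np.zeros((1, 256, 256), dtype=bool) for _ in range(n_images)]
    ious = [np.zeros((1,), dtype=np.float32) for _ in range(n_images)]
    low_res = [np.zeros((1, 64, 64), dtype=np.float32) for _ in range(n_images)]
    return masks, ious, low_res

masks_list, ious_list, low_res_list = fake_predict_batch(3)
assert len(masks_list) == len(ious_list) == len(low_res_list) == 3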
@@ -44,7 +44,7 @@ class SAM2VideoPredictor(SAM2Base):
         offload_state_to_cpu=False,
         async_loading_frames=False,
     ):
-        """Initialize a inference state."""
+        """Initialize an inference state."""
         compute_device = self.device  # device of the model
         images, video_height, video_width = load_video_frames(
             video_path=video_path,
@@ -589,7 +589,7 @@ class SAM2VideoPredictor(SAM2Base):
         # to `propagate_in_video_preflight`).
         consolidated_frame_inds = inference_state["consolidated_frame_inds"]
         for is_cond in [False, True]:
-            # Separately consolidate conditioning and non-conditioning temp outptus
+            # Separately consolidate conditioning and non-conditioning temp outputs
             storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
             # Find all the frames that contain temporary outputs for any objects
             # (these should be the frames that have just received clicks for mask inputs
@@ -598,7 +598,7 @@ class SAM2VideoPredictor(SAM2Base):
             for obj_temp_output_dict in temp_output_dict_per_obj.values():
                 temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
             consolidated_frame_inds[storage_key].update(temp_frame_inds)
-            # consolidate the temprary output across all objects on this frame
+            # consolidate the temporary output across all objects on this frame
             for frame_idx in temp_frame_inds:
                 consolidated_out = self._consolidate_temp_output_across_obj(
                     inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True
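The two hunks above implement a set-union over per-object temporary outputs; a standalone sketch of that pattern with invented data:

# Invented per-object temporary outputs: obj_id -> storage_key -> {frame_idx: out}
temp_output_dict_per_obj = {
    0: {"cond_frame_outputs": {0: "o0f0"}, "non_cond_frame_outputs": {3: "o0f3"}},
    1: {"cond_frame_outputs": {0: "o1f0", 5: "o1f5"}, "non_cond_frame_outputs": {}},
}
consolidated_frame_inds = {"cond_frame_outputs": set(), "non_cond_frame_outputs": set()}

for is_cond in [False, True]:
    storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
    temp_frame_inds = set()
    # union of frame indices holding temporary outputs for any object
    for obj_temp_output_dict in temp_output_dict_per_obj.values():
        temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
    consolidated_frame_inds[storage_key].update(temp_frame_inds)

print(consolidated_frame_inds)
# {'cond_frame_outputs': {0, 5}, 'non_cond_frame_outputs': {3}}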
@@ -68,7 +68,7 @@ def mask_to_box(masks: torch.Tensor):
     compute bounding box given an input mask

     Inputs:
-    - masks: [B, 1, H, W] boxes, dtype=torch.Tensor
+    - masks: [B, 1, H, W] masks, dtype=torch.Tensor

     Returns:
     - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor
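A minimal sketch of the documented behavior (boxes from binary masks); this is an illustrative reimplementation under the docstring's shapes, not the repository's actual mask_to_box code, and it assumes each mask is non-empty:

import torch

def mask_to_box_sketch(masks: torch.Tensor) -> torch.Tensor:
    """masks: [B, 1, H, W] bool -> box_coords: [B, 1, 4] as (x0, y0, x1, y1)."""
    B, _, H, W = masks.shape
    xs = torch.arange(W).view(1, 1, 1, W).expand(B, 1, H, W)
    ys = torch.arange(H).view(1, 1, H, 1).expand(B, 1, H, W)
    # out-of-mask pixels get sentinel values so min/max ignore them
    x_min = torch.where(masks, xs, torch.full_like(xs, W)).flatten(2).min(-1).values
    y_min = torch.where(masks, ys, torch.full_like(ys, H)).flatten(2).min(-1).values
    x_max = torch.where(masks, xs, torch.full_like(xs, -1)).flatten(2).max(-1).values
    y_max = torch.where(masks, ys, torch.full_like(ys, -1)).flatten(2).max(-1).values
    return torch.stack([x_min, y_min, x_max, y_max], dim=-1)  # [B, 1, 4]

m = torch.zeros(1, 1, 8, 8, dtype=torch.bool)
m[0, 0, 2:5, 3:7] = True
print(mask_to_box_sketch(m))  # tensor([[[3, 2, 6, 4]]])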
@@ -120,7 +120,7 @@ class AsyncVideoFrameLoader:
         self.offload_video_to_cpu = offload_video_to_cpu
         self.img_mean = img_mean
         self.img_std = img_std
-        # items in `self._images` will be loaded asynchronously
+        # items in `self.images` will be loaded asynchronously
         self.images = [None] * len(img_paths)
         # catch and raise any exceptions in the async loading thread
         self.exception = None
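The corrected comment points at the async-loading pattern used here: a background thread fills `self.images`, and any exception is stashed in `self.exception` and re-raised on access. A condensed, self-contained sketch of that pattern (not the class's full code):

import threading

class AsyncListLoader:
    """Illustrative: load items on a daemon thread, surface errors lazily."""
    def __init__(self, paths):
        self.images = [None] * len(paths)   # filled asynchronously
        self.exception = None               # set if the loader thread fails
        self._done = threading.Event()

        def _load():
            try:
                for i, p in enumerate(paths):
                    self.images[i] = f"decoded({p})"  # stand-in for real decoding
            except Exception as e:
                self.exception = e
            finally:
                self._done.set()

        threading.Thread(target=_load, daemon=True).start()

    def __getitem__(self, i):
        self._done.wait()  # simplistic: the real class waits per-item
        if self.exception is not None:
            raise RuntimeError("async loading failed") from self.exception
        return self.images[i]

loader = AsyncListLoader(["f0.jpg", "f1.jpg"])
print(loader[1])  # decoded(f1.jpg)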
@@ -72,7 +72,7 @@ parser.add_argument(
 parser.add_argument(
     "--do_not_skip_first_and_last_frame",
     help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. "
-    "Set this to true for evaluation on settings that doen't skip first and last frames",
+    "Set this to true for evaluation on settings that doesn't skip first and last frames",
     action="store_true",
 )

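A note on the unchanged `action="store_true"` line: the flag takes no value and defaults to False, which is why the help text says "set this to true". A minimal check:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--do_not_skip_first_and_last_frame", action="store_true")

print(parser.parse_args([]).do_not_skip_first_and_last_frame)    # False
args = parser.parse_args(["--do_not_skip_first_and_last_frame"])
print(args.do_not_skip_first_and_last_frame)                     # True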
@@ -183,7 +183,7 @@ def _seg2bmap(seg, width=None, height=None):

     assert not (
         width > w | height > h | abs(ar1 - ar2) > 0.01
-    ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
+    ), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height)

     e = np.zeros_like(seg)
     s = np.zeros_like(seg)
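The old message relied on Python's implicit concatenation of adjacent string literals, so "Can" "t convert ..." rendered as "Cant convert ..."; the commit folds it into a single literal. A quick demonstration:

# Adjacent string literals are concatenated at compile time:
old = "Can" "t convert %dx%d seg to %dx%d bmap." % (1, 2, 3, 4)
new = "Cannot convert %dx%d seg to %dx%d bmap." % (1, 2, 3, 4)
print(old)  # Cant convert 1x2 seg to 3x4 bmap.
print(new)  # Cannot convert 1x2 seg to 3x4 bmap.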
setup.py (2 changed lines)

@@ -17,7 +17,7 @@ AUTHOR_EMAIL = "segment-anything@meta.com"
 LICENSE = "Apache 2.0"

 # Read the contents of README file
-with open("README.md", "r") as f:
+with open("README.md", "r", encoding="utf-8") as f:
     LONG_DESCRIPTION = f.read()

 # Required dependencies
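Why the encoding argument matters (per the commit title, the Hugging Face emoji in README.md): without `encoding="utf-8"`, `open()` falls back to the platform's locale encoding, which may not decode multi-byte emoji. A portable demonstration of the failure mode; ascii is used here to force the error, standing in for the locale codec on affected Windows setups:

text = "SAM 2 on Hugging Face \N{HUGGING FACE}"
raw = text.encode("utf-8")

try:
    raw.decode("ascii")  # stand-in for a locale codec that can't decode emoji
except UnicodeDecodeError as e:
    print("locale-dependent decode can fail:", e)

print(raw.decode("utf-8") == text)  # True: explicit UTF-8 round-trips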